1 files changed, 362 insertions, 469 deletions
diff --git a/libavcodec/proresdec.c b/libavcodec/proresdec.c
index 83c083f..51807bc 100644
--- a/libavcodec/proresdec.c
+++ b/libavcodec/proresdec.c
@@ -1,78 +1,44 @@
 /*
- * Apple ProRes compatible decoder
- *
  * Copyright (c) 2010-2011 Maxim Poliakovski
+ * Copyright (c) 2010-2011 Elvis Presley
  *
- * This file is part of Libav.
+ * This file is part of FFmpeg.
  *
- * Libav is free software; you can redistribute it and/or
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
- * Libav is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 /**
  * @file
- * This is a decoder for Apple ProRes 422 SD/HQ/LT/Proxy and ProRes 4444.
- * It is used for storing and editing high definition video data in Apple's Final Cut Pro.
- *
- * @see http://wiki.multimedia.cx/index.php?title=Apple_ProRes
+ * Known FOURCCs: 'apch' (HQ), 'apcn' (SD), 'apcs' (LT), 'apco' (Proxy), 'ap4h' (4444)
  */
 
-#define A32_BITSTREAM_READER // some ProRes vlc codes require up to 28 bits to be read at once
+//#define DEBUG
 
-#include <stdint.h>
+#define A32_BITSTREAM_READER
 
-#include "libavutil/intmath.h"
 #include "avcodec.h"
-#include "proresdsp.h"
 #include "get_bits.h"
+#include "simple_idct.h"
+#include "proresdec.h"
 
-typedef struct {
-    const uint8_t *index;            ///< pointers to the data of this slice
-    int slice_num;
-    int x_pos, y_pos;
-    int slice_width;
-    DECLARE_ALIGNED(16, DCTELEM, blocks[8 * 4 * 64]);
-} ProresThreadData;
-
-typedef struct {
-    ProresDSPContext dsp;
-    AVFrame    picture;
-    ScanTable  scantable;
-    int        scantable_type;           ///< -1 = uninitialized, 0 = progressive, 1/2 = interlaced
-
-    int        frame_type;               ///< 0 = progressive, 1 = top-field first, 2 = bottom-field first
-    int        pic_format;               ///< 2 = 422, 3 = 444
-    uint8_t    qmat_luma[64];            ///< dequantization matrix for luma
-    uint8_t    qmat_chroma[64];          ///< dequantization matrix for chroma
-    int        qmat_changed;             ///< 1 - global quantization matrices changed
-    int        prev_slice_sf;            ///< scalefactor of the previous decoded slice
-    DECLARE_ALIGNED(16, int16_t, qmat_luma_scaled[64]);
-    DECLARE_ALIGNED(16, int16_t, qmat_chroma_scaled[64]);
-    int        total_slices;            ///< total number of slices in a picture
-    ProresThreadData *slice_data;
-    int        pic_num;
-    int        chroma_factor;
-    int        mb_chroma_factor;
-    int        num_chroma_blocks;       ///< number of chrominance blocks in a macroblock
-    int        num_x_slices;
-    int        num_y_slices;
-    int        slice_width_factor;
-    int        slice_height_factor;
-    int        num_x_mbs;
-    int        num_y_mbs;
-} ProresContext;
-
+static void permute(uint8_t *dst, const uint8_t *src, const uint8_t permutation[64])
+{   /* dst[k] = permutation[src[k]] for each of the 64 table entries */
+    const uint8_t *const end = src + 64;
+    while (src != end)
+        *dst++ = permutation[*src++];
+}
 
 static const uint8_t progressive_scan[64] = {
      0,  1,  8,  9,  2,  3, 10, 11,
@@ -93,600 +59,527 @@ static const uint8_t interlaced_scan[64] = {
      4, 12,  5,  6, 13, 20, 28, 21,
     14,  7, 15, 22, 29, 36, 44, 37,
     30, 23, 31, 38, 45, 52, 60, 53,
-    46, 39, 47, 54, 61, 62, 55, 63
+    46, 39, 47, 54, 61, 62, 55, 63,
 };
 
-
 static av_cold int decode_init(AVCodecContext *avctx)
 {
     ProresContext *ctx = avctx->priv_data;
+    uint8_t idct_permutation[64]; // scratch table, filled by ff_init_scantable_permutation() below
 
-    ctx->total_slices     = 0;
-    ctx->slice_data       = NULL;
+    avctx->bits_per_raw_sample = 10; // decode_frame_header() only ever selects 10-bit pixel formats
 
-    avctx->pix_fmt = PIX_FMT_YUV422P10; // set default pixel format
+    dsputil_init(&ctx->dsp, avctx);
+    ff_proresdsp_init(&ctx->prodsp);
 
-    avctx->bits_per_raw_sample = PRORES_BITS_PER_SAMPLE;
-    ff_proresdsp_init(&ctx->dsp);
+    avctx->coded_frame = &ctx->frame;
+    ctx->frame.type = FF_I_TYPE; // set once here: every frame is flagged as an intra key frame
+    ctx->frame.key_frame = 1;
 
-    avctx->coded_frame = &ctx->picture;
-    avcodec_get_frame_defaults(&ctx->picture);
-    ctx->picture.type      = AV_PICTURE_TYPE_I;
-    ctx->picture.key_frame = 1;
+    ff_init_scantable_permutation(idct_permutation,
+                                  ctx->prodsp.idct_permutation_type);
 
-    ctx->scantable_type = -1;   // set scantable type to uninitialized
-    memset(ctx->qmat_luma, 4, 64);
-    memset(ctx->qmat_chroma, 4, 64);
-    ctx->prev_slice_sf = 0;
+    permute(ctx->progressive_scan, progressive_scan, idct_permutation); // pre-permute both scan orders once;
+    permute(ctx->interlaced_scan, interlaced_scan, idct_permutation);   // decode_frame_header() points ctx->scan at one of them
 
     return 0;
 }
 
-
 static int decode_frame_header(ProresContext *ctx, const uint8_t *buf,
                                const int data_size, AVCodecContext *avctx)
 {
-    int hdr_size, version, width, height, flags;
+    int hdr_size, width, height, flags; // parses the frame header at buf; returns hdr_size, or -1 on error
+    int version;
     const uint8_t *ptr;
 
     hdr_size = AV_RB16(buf);
+    av_dlog(avctx, "header size %d\n", hdr_size);
     if (hdr_size > data_size) {
-        av_log(avctx, AV_LOG_ERROR, "frame data too small\n");
-        return AVERROR_INVALIDDATA;
+        av_log(avctx, AV_LOG_ERROR, "error, wrong header size\n");
+        return -1;
     }
 
     version = AV_RB16(buf + 2);
-    if (version >= 2) {
-        av_log(avctx, AV_LOG_ERROR,
-               "unsupported header version: %d\n", version);
-        return AVERROR_INVALIDDATA;
+    av_dlog(avctx, "%.4s version %d\n", buf+4, version);
+    if (version > 1) {
+        av_log(avctx, AV_LOG_ERROR, "unsupported version: %d\n", version);
+        return -1;
     }
 
     width  = AV_RB16(buf + 8);
     height = AV_RB16(buf + 10);
     if (width != avctx->width || height != avctx->height) {
-        av_log(avctx, AV_LOG_ERROR,
-               "picture dimension changed: old: %d x %d, new: %d x %d\n",
+        av_log(avctx, AV_LOG_ERROR, "picture resolution change: %dx%d -> %dx%d\n",
                avctx->width, avctx->height, width, height);
-        return AVERROR_INVALIDDATA;
+        return -1;
     }
 
     ctx->frame_type = (buf[12] >> 2) & 3;
-    if (ctx->frame_type > 2) {
-        av_log(avctx, AV_LOG_ERROR,
-               "unsupported frame type: %d\n", ctx->frame_type);
-        return AVERROR_INVALIDDATA;
-    }
 
-    ctx->chroma_factor     = (buf[12] >> 6) & 3;
-    ctx->mb_chroma_factor  = ctx->chroma_factor + 2;
-    ctx->num_chroma_blocks = (1 << ctx->chroma_factor) >> 1;
-    switch (ctx->chroma_factor) {
-    case 2:
-        avctx->pix_fmt = PIX_FMT_YUV422P10;
-        break;
-    case 3:
-        avctx->pix_fmt = PIX_FMT_YUV444P10;
-        break;
-    default:
-        av_log(avctx, AV_LOG_ERROR,
-               "unsupported picture format: %d\n", ctx->pic_format);
-        return AVERROR_INVALIDDATA;
-    }
+    av_dlog(avctx, "frame type %d\n", ctx->frame_type);
 
-    if (ctx->scantable_type != ctx->frame_type) {
-        if (!ctx->frame_type)
-            ff_init_scantable(ctx->dsp.idct_permutation, &ctx->scantable,
-                              progressive_scan);
-        else
-            ff_init_scantable(ctx->dsp.idct_permutation, &ctx->scantable,
-                              interlaced_scan);
-        ctx->scantable_type = ctx->frame_type;
+    if (ctx->frame_type == 0) {
+        ctx->scan = ctx->progressive_scan; // permuted
+    } else {
+        ctx->scan = ctx->interlaced_scan; // permuted
+        ctx->frame.interlaced_frame = 1;
+        ctx->frame.top_field_first = ctx->frame_type == 1; // any other non-zero type is treated as bottom field first
     }
 
-    if (ctx->frame_type) {      /* if interlaced */
-        ctx->picture.interlaced_frame = 1;
-        ctx->picture.top_field_first  = ctx->frame_type & 1;
-    }
+    avctx->pix_fmt = (buf[12] & 0xC0) == 0xC0 ? PIX_FMT_YUV444P10 : PIX_FMT_YUV422P10; // top two bits == 3 -> 4:4:4, anything else -> 4:2:2
 
-    ctx->qmat_changed = 0;
     ptr   = buf + 20;
     flags = buf[19];
+    av_dlog(avctx, "flags %x\n", flags);
+    if (data_size < 20 + 64 * (!!(flags & 2) + (flags & 1))) return -1; // each flagged quant matrix needs 64 bytes after byte 20
     if (flags & 2) {
-        if (ptr - buf > hdr_size - 64) {
-            av_log(avctx, AV_LOG_ERROR, "header data too small\n");
-            return AVERROR_INVALIDDATA;
-        }
-        if (memcmp(ctx->qmat_luma, ptr, 64)) {
-            memcpy(ctx->qmat_luma, ptr, 64);
-            ctx->qmat_changed = 1;
-        }
+        permute(ctx->qmat_luma, ctx->prodsp.idct_permutation, ptr); // qmat_luma[i] = ptr[idct_permutation[i]]
         ptr += 64;
     } else {
         memset(ctx->qmat_luma, 4, 64);
-        ctx->qmat_changed = 1;
     }
 
     if (flags & 1) {
-        if (ptr - buf > hdr_size - 64) {
-            av_log(avctx, AV_LOG_ERROR, "header data too small\n");
-            return -1;
-        }
-        if (memcmp(ctx->qmat_chroma, ptr, 64)) {
-            memcpy(ctx->qmat_chroma, ptr, 64);
-            ctx->qmat_changed = 1;
-        }
+        permute(ctx->qmat_chroma, ctx->prodsp.idct_permutation, ptr); // qmat_chroma[i] = ptr[idct_permutation[i]]
     } else {
         memset(ctx->qmat_chroma, 4, 64);
-        ctx->qmat_changed = 1;
     }
 
     return hdr_size;
 }
 
-
-static int decode_picture_header(ProresContext *ctx, const uint8_t *buf,
-                                 const int data_size, AVCodecContext *avctx)
+static int decode_picture_header(AVCodecContext *avctx, const uint8_t *buf, const int buf_size)
 {
-    int   i, hdr_size, pic_data_size, num_slices;
-    int   slice_width_factor, slice_height_factor;
-    int   remainder, num_x_slices;
+    ProresContext *ctx = avctx->priv_data; // this function fills ctx->slices[] from the picture header and slice index table
+    int i, hdr_size, slice_count;
+    unsigned pic_data_size;
+    int log2_slice_mb_width, log2_slice_mb_height;
+    int slice_mb_count, mb_x, mb_y;
     const uint8_t *data_ptr, *index_ptr;
 
-    hdr_size = data_size > 0 ? buf[0] >> 3 : 0;
-    if (hdr_size < 8 || hdr_size > data_size) {
-        av_log(avctx, AV_LOG_ERROR, "picture header too small\n");
-        return AVERROR_INVALIDDATA;
+    hdr_size = buf_size > 0 ? buf[0] >> 3 : 0; // never read buf[0] of an empty buffer; 0 is rejected just below
+    if (hdr_size < 8 || hdr_size > buf_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong picture header size\n");
+        return -1;
     }
 
     pic_data_size = AV_RB32(buf + 1);
-    if (pic_data_size > data_size) {
-        av_log(avctx, AV_LOG_ERROR, "picture data too small\n");
-        return AVERROR_INVALIDDATA;
+    if (pic_data_size > buf_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong picture data size\n");
+        return -1;
     }
 
-    slice_width_factor  = buf[7] >> 4;
-    slice_height_factor = buf[7] & 0xF;
-    if (slice_width_factor > 3 || slice_height_factor) {
-        av_log(avctx, AV_LOG_ERROR,
-               "unsupported slice dimension: %d x %d\n",
-               1 << slice_width_factor, 1 << slice_height_factor);
-        return AVERROR_INVALIDDATA;
+    log2_slice_mb_width  = buf[7] >> 4;
+    log2_slice_mb_height = buf[7] & 0xF;
+    if (log2_slice_mb_width > 3 || log2_slice_mb_height) { // only slices up to 8 MBs wide and exactly 1 MB high are handled
+        av_log(avctx, AV_LOG_ERROR, "unsupported slice resolution: %dx%d\n",
+               1 << log2_slice_mb_width, 1 << log2_slice_mb_height);
+        return -1;
     }
 
-    ctx->slice_width_factor  = slice_width_factor;
-    ctx->slice_height_factor = slice_height_factor;
+    ctx->mb_width  = (avctx->width  + 15) >> 4;
+    if (ctx->frame_type)
+        ctx->mb_height = (avctx->height + 31) >> 5; // interlaced: MB rows of one field (half the frame height)
+    else
+        ctx->mb_height = (avctx->height + 15) >> 4;
 
-    ctx->num_x_mbs = (avctx->width + 15) >> 4;
-    ctx->num_y_mbs = (avctx->height +
-                      (1 << (4 + ctx->picture.interlaced_frame)) - 1) >>
-                     (4 + ctx->picture.interlaced_frame);
+    slice_count = AV_RB16(buf + 5);
 
-    remainder    = ctx->num_x_mbs & ((1 << slice_width_factor) - 1);
-    num_x_slices = (ctx->num_x_mbs >> slice_width_factor) + (remainder & 1) +
-                   ((remainder >> 1) & 1) + ((remainder >> 2) & 1);
-
-    num_slices = num_x_slices * ctx->num_y_mbs;
-    if (num_slices != AV_RB16(buf + 5)) {
-        av_log(avctx, AV_LOG_ERROR, "invalid number of slices\n");
-        return AVERROR_INVALIDDATA;
-    }
-
-    if (ctx->total_slices != num_slices) {
-        av_freep(&ctx->slice_data);
-        ctx->slice_data = av_malloc((num_slices + 1) * sizeof(ctx->slice_data[0]));
-        if (!ctx->slice_data)
+    if (ctx->slice_count != slice_count || !ctx->slices) { // (re)allocate only when the slice count changes
+        av_freep(&ctx->slices);
+        ctx->slices = av_mallocz(slice_count * sizeof(*ctx->slices));
+        if (!ctx->slices)
             return AVERROR(ENOMEM);
-        ctx->total_slices = num_slices;
+        ctx->slice_count = slice_count;
     }
 
-    if (hdr_size + num_slices * 2 > data_size) {
-        av_log(avctx, AV_LOG_ERROR, "slice table too small\n");
-        return AVERROR_INVALIDDATA;
+    if (!slice_count)
+        return AVERROR(EINVAL);
+
+    if (hdr_size + slice_count*2 > buf_size) {
+        av_log(avctx, AV_LOG_ERROR, "error, wrong slice count\n");
+        return -1;
     }
 
-    /* parse slice table allowing quick access to the slice data */
+    // parse slice information: a table of 16-bit slice sizes follows the header, then the slice data
     index_ptr = buf + hdr_size;
-    data_ptr = index_ptr + num_slices * 2;
+    data_ptr  = index_ptr + slice_count*2;
 
-    for (i = 0; i < num_slices; i++) {
-        ctx->slice_data[i].index = data_ptr;
-        data_ptr += AV_RB16(index_ptr + i * 2);
-    }
-    ctx->slice_data[i].index = data_ptr;
+    slice_mb_count = 1 << log2_slice_mb_width;
+    mb_x = 0;
+    mb_y = 0;
 
-    if (data_ptr > buf + data_size) {
-        av_log(avctx, AV_LOG_ERROR, "out of slice data\n");
-        return -1;
-    }
+    for (i = 0; i < slice_count; i++) {
+        SliceContext *slice = &ctx->slices[i];
 
-    return pic_data_size;
-}
+        slice->data = data_ptr;
+        data_ptr += AV_RB16(index_ptr + i*2);
 
+        while (ctx->mb_width - mb_x < slice_mb_count) // near the right edge, halve the slice width until it fits
+            slice_mb_count >>= 1;
 
-/**
- * Read an unsigned rice/exp golomb codeword.
- */
-static inline int decode_vlc_codeword(GetBitContext *gb, uint8_t codebook)
-{
-    unsigned int rice_order, exp_order, switch_bits;
-    unsigned int buf, code;
-    int log, prefix_len, len;
+        slice->mb_x = mb_x;
+        slice->mb_y = mb_y;
+        slice->mb_count = slice_mb_count;
+        slice->data_size = data_ptr - slice->data;
 
-    OPEN_READER(re, gb);
-    UPDATE_CACHE(re, gb);
-    buf = GET_CACHE(re, gb);
-
-    /* number of prefix bits to switch between Rice and expGolomb */
-    switch_bits = (codebook & 3) + 1;
-    rice_order  = codebook >> 5;        /* rice code order */
-    exp_order   = (codebook >> 2) & 7;  /* exp golomb code order */
-
-    log = 31 - av_log2(buf); /* count prefix bits (zeroes) */
-
-    if (log < switch_bits) { /* ok, we got a rice code */
-        if (!rice_order) {
-            /* shortcut for faster decoding of rice codes without remainder */
-            code = log;
-            LAST_SKIP_BITS(re, gb, log + 1);
-        } else {
-            prefix_len = log + 1;
-            code = (log << rice_order) + NEG_USR32(buf << prefix_len, rice_order);
-            LAST_SKIP_BITS(re, gb, prefix_len + rice_order);
+        if (slice->data_size < 6) {
+            av_log(avctx, AV_LOG_ERROR, "error, wrong slice data size\n");
+            return -1;
+        }
+
+        mb_x += slice_mb_count;
+        if (mb_x == ctx->mb_width) { // row finished: restart with full-width slices on the next MB row
+            slice_mb_count = 1 << log2_slice_mb_width;
+            mb_x = 0;
+            mb_y++;
+        }
+        if (data_ptr > buf + buf_size) {
+            av_log(avctx, AV_LOG_ERROR, "error, slice out of bounds\n");
+            return -1;
         }
-    } else { /* otherwise we got a exp golomb code */
-        len  = (log << 1) - switch_bits + exp_order + 1;
-        code = NEG_USR32(buf, len) - (1 << exp_order) + (switch_bits << rice_order);
-        LAST_SKIP_BITS(re, gb, len);
     }
 
-    CLOSE_READER(re, gb);
+    if (mb_x || mb_y != ctx->mb_height) { // the slices must tile the picture exactly
+        av_log(avctx, AV_LOG_ERROR, "error wrong mb count y %d h %d\n",
+               mb_y, ctx->mb_height);
+        return -1;
+    }
 
-    return code;
+    return pic_data_size; // the 32-bit size field read from buf + 1
 }
 
-#define LSB2SIGN(x) (-((x) & 1))
-#define TOSIGNED(x) (((x) >> 1) ^ LSB2SIGN(x))
-
-#define FIRST_DC_CB 0xB8 // rice_order = 5, exp_golomb_order = 6, switch_bits = 0
-
-static uint8_t dc_codebook[4] = {
-    0x04, // rice_order = 0, exp_golomb_order = 1, switch_bits = 0
-    0x28, // rice_order = 1, exp_golomb_order = 2, switch_bits = 0
-    0x4D, // rice_order = 2, exp_golomb_order = 3, switch_bits = 1
-    0x70  // rice_order = 3, exp_golomb_order = 4, switch_bits = 0
-};
-
-
-/**
- * Decode DC coefficients for all blocks in a slice.
- */
-static inline void decode_dc_coeffs(GetBitContext *gb, DCTELEM *out,
-                                    int nblocks)
+#define DECODE_CODEWORD(val, codebook)                                  \
+    do { /* decode one codeword via reader (re, gb) into val */         \
+        unsigned int rice_order, exp_order, switch_bits;                \
+        unsigned int q, buf, bits;                                      \
+                                                                        \
+        UPDATE_CACHE(re, gb);                                           \
+        buf = GET_CACHE(re, gb);                                        \
+                                                                        \
+        /* longest zero prefix still coded as rice, else exp golomb */  \
+        switch_bits =  (codebook) & 3;                                  \
+        rice_order  =  (codebook) >> 5;                                 \
+        exp_order   = ((codebook) >> 2) & 7;                            \
+                                                                        \
+        q = 31 - av_log2(buf); /* length of the zero prefix */          \
+                                                                        \
+        if (q > switch_bits) { /* exp golomb */                         \
+            bits = exp_order - switch_bits + (q<<1);                    \
+            val = SHOW_UBITS(re, gb, bits) - (1 << exp_order) +         \
+                ((switch_bits + 1) << rice_order);                      \
+            SKIP_BITS(re, gb, bits);                                    \
+        } else if (rice_order) {                                        \
+            SKIP_BITS(re, gb, q+1);                                     \
+            val = (q << rice_order) + SHOW_UBITS(re, gb, rice_order);   \
+            SKIP_BITS(re, gb, rice_order);                              \
+        } else {                                                        \
+            val = q;                                                    \
+            SKIP_BITS(re, gb, q+1);                                     \
+        }                                                               \
+    } while (0) /* no ';' here: every use site supplies its own */
+
+#define TOSIGNED(x) (((x) >> 1) ^ (-((x) & 1))) // 0, 1, 2, 3, 4, ... -> 0, -1, 1, -2, 2, ...
+
+#define FIRST_DC_CB 0xB8 // rice_order = 5, exp_order = 6, switch_bits = 0 (as unpacked by DECODE_CODEWORD)
+
+static const uint8_t dc_codebook[7] = { 0x04, 0x28, 0x28, 0x4D, 0x4D, 0x70, 0x70}; // indexed by FFMIN(previous code, 6)
+
+static av_always_inline void decode_dc_coeffs(GetBitContext *gb, DCTELEM *out,
+                                              int blocks_per_slice)
 {
     DCTELEM prev_dc;
-    int     i, sign;
-    int16_t delta;
-    unsigned int code;
+    int code, i, sign; // this function writes one DC value per block: out[0], out[64], out[128], ...
 
-    code   = decode_vlc_codeword(gb, FIRST_DC_CB);
-    out[0] = prev_dc = TOSIGNED(code);
+    OPEN_READER(re, gb);
 
-    out   += 64; /* move to the DC coeff of the next block */
-    delta  = 3;
+    DECODE_CODEWORD(code, FIRST_DC_CB);
+    prev_dc = TOSIGNED(code); // the first DC is coded directly, sign folded into the low bit
+    out[0] = prev_dc;
 
-    for (i = 1; i < nblocks; i++, out += 64) {
-        code = decode_vlc_codeword(gb, dc_codebook[FFMIN(FFABS(delta), 3)]);
+    out += 64; // dc coeff for the next block
 
-        sign     = -(((delta >> 15) & 1) ^ (code & 1));
-        delta    = (((code + 1) >> 1) ^ sign) - sign;
-        prev_dc += delta;
-        out[0]   = prev_dc;
+    code = 5; // seeds the codebook choice for the first delta: dc_codebook[5]
+    sign = 0;
+    for (i = 1; i < blocks_per_slice; i++, out += 64) {
+        DECODE_CODEWORD(code, dc_codebook[FFMIN(code, 6)]); // codebook adapts to the previous code value
+        if(code) sign ^= -(code & 1); // an odd code flips the sign (0 <-> -1) relative to the previous delta
+        else     sign  = 0;           // a zero delta resets the sign state
+        prev_dc += (((code + 1) >> 1) ^ sign) - sign; // magnitude (code + 1) >> 1, negated when sign == -1
+        out[0] = prev_dc;
     }
+    CLOSE_READER(re, gb);
+}
 
+// adaptive codebook switching luts, indexed by FFMIN(previous run, 15) and FFMIN(previous level, 9)
+static const uint8_t run_to_cb[16] = { 0x06, 0x06, 0x05, 0x05, 0x04, 0x29, 0x29, 0x29, 0x29, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x4C };
+static const uint8_t lev_to_cb[10] = { 0x04, 0x0A, 0x05, 0x06, 0x04, 0x28, 0x28, 0x28, 0x28, 0x4C };
 
-static uint8_t ac_codebook[7] = {
-    0x04, // rice_order = 0, exp_golomb_order = 1, switch_bits = 0
-    0x28, // rice_order = 1, exp_golomb_order = 2, switch_bits = 0
-    0x4C, // rice_order = 2, exp_golomb_order = 3, switch_bits = 0
-    0x05, // rice_order = 0, exp_golomb_order = 1, switch_bits = 1
-    0x29, // rice_order = 1, exp_golomb_order = 2, switch_bits = 1
-    0x06, // rice_order = 0, exp_golomb_order = 1, switch_bits = 2
-    0x0A, // rice_order = 0, exp_golomb_order = 2, switch_bits = 2
-};
-
-/**
- * Lookup tables for adaptive switching between codebooks
- * according with previous run/level value.
- */
-static uint8_t run_to_cb_index[16] =
-    { 5, 5, 3, 3, 0, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 2 };
-
-static uint8_t lev_to_cb_index[10] = { 0, 6, 3, 5, 0, 1, 1, 1, 1, 2 };
-
-
-/**
- * Decode AC coefficients for all blocks in a slice.
- */
-static inline void decode_ac_coeffs(GetBitContext *gb, DCTELEM *out,
-                                    int blocks_per_slice,
-                                    int plane_size_factor,
-                                    const uint8_t *scan)
+static av_always_inline void decode_ac_coeffs(AVCodecContext *avctx, GetBitContext *gb,
+                                              DCTELEM *out, int blocks_per_slice)
 {
-    int pos, block_mask, run, level, sign, run_cb_index, lev_cb_index;
-    int max_coeffs, bits_left;
+    ProresContext *ctx = avctx->priv_data; // this function decodes the run/level coded AC coefficients of one plane into out
+    int block_mask, sign;
+    unsigned pos, run, level;
+    int max_coeffs, i, bits_left;
+    int log2_block_count = av_log2(blocks_per_slice); // the mask/shift indexing below relies on a power-of-two block count
+
+    OPEN_READER(re, gb);
 
-    /* set initial prediction values */
     run   = 4;
     level = 2;
 
-    max_coeffs = blocks_per_slice << 6;
+    max_coeffs = 64 << log2_block_count;
     block_mask = blocks_per_slice - 1;
 
-    for (pos = blocks_per_slice - 1; pos < max_coeffs;) {
-        run_cb_index = run_to_cb_index[FFMIN(run, 15)];
-        lev_cb_index = lev_to_cb_index[FFMIN(level, 9)];
-
-        bits_left = get_bits_left(gb);
-        if (bits_left <= 0 || (bits_left <= 8 && !show_bits(gb, bits_left)))
-            return;
-
-        run = decode_vlc_codeword(gb, ac_codebook[run_cb_index]);
+    for (pos = block_mask;;) { // low bits of pos select the block, high bits the scan index
+        bits_left = gb->size_in_bits - (((uint8_t*)re_buffer_ptr - gb->buffer)*8 - 32 + re_bit_count);
+        if (bits_left <= 0 || (bits_left < 32 && !SHOW_UBITS(re, gb, bits_left))) // <= 0: never peek a negative bit count
+            break;
 
-        bits_left = get_bits_left(gb);
-        if (bits_left <= 0 || (bits_left <= 8 && !show_bits(gb, bits_left)))
+        DECODE_CODEWORD(run, run_to_cb[FFMIN(run,  15)]);
+        pos += run + 1;
+        if (pos >= max_coeffs) {
+            av_log(avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", pos, max_coeffs);
             return;
+        }
 
-        level = decode_vlc_codeword(gb, ac_codebook[lev_cb_index]) + 1;
+        DECODE_CODEWORD(level, lev_to_cb[FFMIN(level, 9)]);
+        level += 1; // the coded value is level - 1, so a stored coefficient is never zero
 
-        pos += run + 1;
-        if (pos >= max_coeffs)
-            break;
+        i = pos >> log2_block_count;
 
-        sign = get_sbits(gb, 1);
-        out[((pos & block_mask) << 6) + scan[pos >> plane_size_factor]] =
-            (level ^ sign) - sign;
+        sign = SHOW_SBITS(re, gb, 1); // 0 or -1
+        SKIP_BITS(re, gb, 1);
+        out[((pos & block_mask) << 6) + ctx->scan[i]] = ((level ^ sign) - sign);
     }
-}
 
+    CLOSE_READER(re, gb);
+}
 
-/**
- * Decode a slice plane (luma or chroma).
- */
-static void decode_slice_plane(ProresContext *ctx, ProresThreadData *td,
-                               const uint8_t *buf,
-                               int data_size, uint16_t *out_ptr,
-                               int linesize, int mbs_per_slice,
-                               int blocks_per_mb, int plane_size_factor,
-                               const int16_t *qmat)
+static void decode_slice_luma(AVCodecContext *avctx, SliceContext *slice,
+                              uint8_t *dst, int dst_stride,
+                              const uint8_t *buf, unsigned buf_size,
+                              const int16_t *qmat)
 {
+    ProresContext *ctx = avctx->priv_data; // this function decodes one slice's luma coefficients and IDCTs them into dst
+    LOCAL_ALIGNED_16(DCTELEM, blocks, [8*4*64]); // 8*4 blocks of 64 coefficients (slices are at most 8 MBs wide)
+    DCTELEM *block;
     GetBitContext gb;
-    DCTELEM *block_ptr;
-    int mb_num, blocks_per_slice;
+    int i, blocks_per_slice = slice->mb_count<<2; // four luma blocks per macroblock
 
-    blocks_per_slice = mbs_per_slice * blocks_per_mb;
+    for (i = 0; i < blocks_per_slice; i++)
+        ctx->dsp.clear_block(blocks+(i<<6));
 
-    memset(td->blocks, 0, 8 * 4 * 64 * sizeof(*td->blocks));
+    init_get_bits(&gb, buf, buf_size << 3); // buf_size is in bytes
 
-    init_get_bits(&gb, buf, data_size << 3);
+    decode_dc_coeffs(&gb, blocks, blocks_per_slice);
+    decode_ac_coeffs(avctx, &gb, blocks, blocks_per_slice);
 
-    decode_dc_coeffs(&gb, td->blocks, blocks_per_slice);
+    block = blocks;
+    for (i = 0; i < slice->mb_count; i++) { // blocks 0..3 go to top-left, top-right, bottom-left, bottom-right
+        ctx->prodsp.idct_put(dst, dst_stride, block+(0<<6), qmat);
+        ctx->prodsp.idct_put(dst+16, dst_stride, block+(1<<6), qmat); // +16 bytes: presumably 8 samples of 2 bytes each
+        ctx->prodsp.idct_put(dst+8*dst_stride, dst_stride, block+(2<<6), qmat);
+        ctx->prodsp.idct_put(dst+8*dst_stride+16, dst_stride, block+(3<<6), qmat);
+        block += 4*64;
+        dst += 32; // advance one macroblock (32 bytes) to the right
+    }
+}
 
-    decode_ac_coeffs(&gb, td->blocks, blocks_per_slice,
-                     plane_size_factor, ctx->scantable.permutated);
+static void decode_slice_chroma(AVCodecContext *avctx, SliceContext *slice,
+                                uint8_t *dst, int dst_stride,
+                                const uint8_t *buf, unsigned buf_size,
+                                const int16_t *qmat, int log2_blocks_per_mb)
+{   // decodes one chroma plane of a slice and IDCTs it into dst
+    ProresContext *ctx = avctx->priv_data;
+    LOCAL_ALIGNED_16(DCTELEM, blocks, [8*4*64]);
+    DCTELEM *block;
+    GetBitContext gb;
+    int i, j, blocks_per_slice = slice->mb_count << log2_blocks_per_mb;
 
-    /* inverse quantization, inverse transform and output */
-    block_ptr = td->blocks;
+    for (i = 0; i < blocks_per_slice; i++)
+        ctx->dsp.clear_block(blocks+(i<<6));
 
-    for (mb_num = 0; mb_num < mbs_per_slice; mb_num++, out_ptr += blocks_per_mb * 4) {
-        ctx->dsp.idct_put(out_ptr,                    linesize, block_ptr, qmat);
-        block_ptr += 64;
-        if (blocks_per_mb > 2) {
-            ctx->dsp.idct_put(out_ptr + 8,            linesize, block_ptr, qmat);
-            block_ptr += 64;
-        }
-        ctx->dsp.idct_put(out_ptr + linesize * 4,     linesize, block_ptr, qmat);
-        block_ptr += 64;
-        if (blocks_per_mb > 2) {
-            ctx->dsp.idct_put(out_ptr + linesize * 4 + 8, linesize, block_ptr, qmat);
-            block_ptr += 64;
+    init_get_bits(&gb, buf, buf_size << 3); // buf_size is in bytes
+
+    decode_dc_coeffs(&gb, blocks, blocks_per_slice);
+    decode_ac_coeffs(avctx, &gb, blocks, blocks_per_slice);
+
+    block = blocks;
+    for (i = 0; i < slice->mb_count; i++) {
+        for (j = 0; j < log2_blocks_per_mb; j++) { // NOTE(review): 2 blocks per pass equals 1 << log2_blocks_per_mb only for 1 or 2
+            ctx->prodsp.idct_put(dst,              dst_stride, block+(0<<6), qmat); // upper block of the column
+            ctx->prodsp.idct_put(dst+8*dst_stride, dst_stride, block+(1<<6), qmat); // block 8 rows below it
+            block += 2*64;
+            dst += 16; // next block column, 16 bytes to the right
         }
     }
 }
 
-
-static int decode_slice(AVCodecContext *avctx, ProresThreadData *td)
+static int decode_slice_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
 {
     ProresContext *ctx = avctx->priv_data;
-    int mb_x_pos  = td->x_pos;
-    int mb_y_pos  = td->y_pos;
-    int pic_num   = ctx->pic_num;
-    int slice_num = td->slice_num;
-    int mbs_per_slice = td->slice_width;
-    const uint8_t *buf;
-    uint8_t *y_data, *u_data, *v_data;
+    SliceContext *slice = &ctx->slices[jobnr];
+    const uint8_t *buf = slice->data;
     AVFrame *pic = avctx->coded_frame;
-    int i, sf, slice_width_factor;
-    int slice_data_size, hdr_size, y_data_size, u_data_size, v_data_size;
-    int y_linesize, u_linesize, v_linesize;
-
-    buf             = ctx->slice_data[slice_num].index;
-    slice_data_size = ctx->slice_data[slice_num + 1].index - buf;
-
-    slice_width_factor = av_log2(mbs_per_slice);
-
-    y_data     = pic->data[0];
-    u_data     = pic->data[1];
-    v_data     = pic->data[2];
-    y_linesize = pic->linesize[0];
-    u_linesize = pic->linesize[1];
-    v_linesize = pic->linesize[2];
-
-    if (pic->interlaced_frame) {
-        if (!(pic_num ^ pic->top_field_first)) {
-            y_data += y_linesize;
-            u_data += u_linesize;
-            v_data += v_linesize;
-        }
-        y_linesize <<= 1;
-        u_linesize <<= 1;
-        v_linesize <<= 1;
+    int i, hdr_size, qscale, log2_chroma_blocks_per_mb;
+    int luma_stride, chroma_stride;
+    int y_data_size, u_data_size, v_data_size;
+    uint8_t *dest_y, *dest_u, *dest_v;
+    int16_t qmat_luma_scaled[64];
+    int16_t qmat_chroma_scaled[64];
+    int mb_x_shift;
+
+    //av_log(avctx, AV_LOG_INFO, "slice %d mb width %d mb x %d y %d\n",
+    //       jobnr, slice->mb_count, slice->mb_x, slice->mb_y);
+
+    // slice header
+    hdr_size = buf[0] >> 3;
+    qscale = av_clip(buf[1], 1, 224);
+    qscale = qscale > 128 ? qscale - 96 << 2: qscale;
+    y_data_size = AV_RB16(buf + 2);
+    u_data_size = AV_RB16(buf + 4);
+    v_data_size = slice->data_size - y_data_size - u_data_size - hdr_size;
+    if (hdr_size > 7) v_data_size = AV_RB16(buf + 6);
+
+    if (y_data_size < 0 || u_data_size < 0 || v_data_size < 0) {
+        av_log(avctx, AV_LOG_ERROR, "invalid plane data size\n");
+        return -1;
     }
 
-    if (slice_data_size < 6) {
-        av_log(avctx, AV_LOG_ERROR, "slice data too small\n");
-        return AVERROR_INVALIDDATA;
+    buf += hdr_size;
+
+    for (i = 0; i < 64; i++) {
+        qmat_luma_scaled  [i] = ctx->qmat_luma  [i] * qscale;
+        qmat_chroma_scaled[i] = ctx->qmat_chroma[i] * qscale;
     }
 
-    /* parse slice header */
-    hdr_size    = buf[0] >> 3;
-    y_data_size = AV_RB16(buf + 2);
-    u_data_size = AV_RB16(buf + 4);
-    v_data_size = slice_data_size - y_data_size - u_data_size - hdr_size;
+    if (ctx->frame_type == 0) {
+        luma_stride   = pic->linesize[0];
+        chroma_stride = pic->linesize[1];
+    } else {
+        luma_stride   = pic->linesize[0] << 1;
+        chroma_stride = pic->linesize[1] << 1;
+    }
 
-    if (v_data_size < 0 || hdr_size < 6) {
-        av_log(avctx, AV_LOG_ERROR, "invalid data size\n");
-        return AVERROR_INVALIDDATA;
+    if (avctx->pix_fmt == PIX_FMT_YUV444P10) {
+        mb_x_shift = 5;
+        log2_chroma_blocks_per_mb = 2;
+    } else {
+        mb_x_shift = 4;
+        log2_chroma_blocks_per_mb = 1;
     }
 
-    sf = av_clip(buf[1], 1, 224);
-    sf = sf > 128 ? (sf - 96) << 2 : sf;
+    dest_y = pic->data[0] + (slice->mb_y << 4) * luma_stride + (slice->mb_x << 5);
+    dest_u = pic->data[1] + (slice->mb_y << 4) * chroma_stride + (slice->mb_x << mb_x_shift);
+    dest_v = pic->data[2] + (slice->mb_y << 4) * chroma_stride + (slice->mb_x << mb_x_shift);
 
-    /* scale quantization matrixes according with slice's scale factor */
-    /* TODO: this can be SIMD-optimized alot */
-    if (ctx->qmat_changed || sf != ctx->prev_slice_sf) {
-        ctx->prev_slice_sf = sf;
-        for (i = 0; i < 64; i++) {
-            ctx->qmat_luma_scaled[ctx->dsp.idct_permutation[i]]   = ctx->qmat_luma[i]   * sf;
-            ctx->qmat_chroma_scaled[ctx->dsp.idct_permutation[i]] = ctx->qmat_chroma[i] * sf;
-        }
+    if (ctx->frame_type && ctx->first_field ^ ctx->frame.top_field_first) {
+        dest_y += pic->linesize[0];
+        dest_u += pic->linesize[1];
+        dest_v += pic->linesize[2];
     }
 
-    /* decode luma plane */
-    decode_slice_plane(ctx, td, buf + hdr_size, y_data_size,
-                       (uint16_t*) (y_data + (mb_y_pos << 4) * y_linesize +
-                                    (mb_x_pos << 5)), y_linesize,
-                       mbs_per_slice, 4, slice_width_factor + 2,
-                       ctx->qmat_luma_scaled);
-
-    /* decode U chroma plane */
-    decode_slice_plane(ctx, td, buf + hdr_size + y_data_size, u_data_size,
-                       (uint16_t*) (u_data + (mb_y_pos << 4) * u_linesize +
-                                    (mb_x_pos << ctx->mb_chroma_factor)),
-                       u_linesize, mbs_per_slice, ctx->num_chroma_blocks,
-                       slice_width_factor + ctx->chroma_factor - 1,
-                       ctx->qmat_chroma_scaled);
-
-    /* decode V chroma plane */
-    decode_slice_plane(ctx, td, buf + hdr_size + y_data_size + u_data_size,
-                       v_data_size,
-                       (uint16_t*) (v_data + (mb_y_pos << 4) * v_linesize +
-                                    (mb_x_pos << ctx->mb_chroma_factor)),
-                       v_linesize, mbs_per_slice, ctx->num_chroma_blocks,
-                       slice_width_factor + ctx->chroma_factor - 1,
-                       ctx->qmat_chroma_scaled);
+    decode_slice_luma(avctx, slice, dest_y, luma_stride,
+                      buf, y_data_size, qmat_luma_scaled);
+
+    if (!(avctx->flags & CODEC_FLAG_GRAY)) {
+        decode_slice_chroma(avctx, slice, dest_u, chroma_stride,
+                            buf + y_data_size, u_data_size,
+                            qmat_chroma_scaled, log2_chroma_blocks_per_mb);
+        decode_slice_chroma(avctx, slice, dest_v, chroma_stride,
+                            buf + y_data_size + u_data_size, v_data_size,
+                            qmat_chroma_scaled, log2_chroma_blocks_per_mb);
+    }
 
     return 0;
 }
 
-
-static int decode_picture(ProresContext *ctx, int pic_num,
-                          AVCodecContext *avctx)
+static int decode_picture(AVCodecContext *avctx) /* runs decode_slice_thread over ctx->slice_count jobs; returns 0, or the first negative per-job result */
 {
-    int slice_num, slice_width, x_pos, y_pos;
-
-    slice_num = 0;
-
-    ctx->pic_num = pic_num;
-    for (y_pos = 0; y_pos < ctx->num_y_mbs; y_pos++) {
-        slice_width = 1 << ctx->slice_width_factor;
-
-        for (x_pos = 0; x_pos < ctx->num_x_mbs && slice_width;
-             x_pos += slice_width) {
-            while (ctx->num_x_mbs - x_pos < slice_width)
-                slice_width >>= 1;
+    ProresContext *ctx = avctx->priv_data;
+    int i, threads_ret[ctx->slice_count]; /* NOTE(review): stack VLA sized by slice_count; confirm the count is bounded and non-zero before this point */
 
-            ctx->slice_data[slice_num].slice_num   = slice_num;
-            ctx->slice_data[slice_num].x_pos       = x_pos;
-            ctx->slice_data[slice_num].y_pos       = y_pos;
-            ctx->slice_data[slice_num].slice_width = slice_width;
+    avctx->execute2(avctx, decode_slice_thread, NULL, threads_ret, ctx->slice_count); /* NOTE(review): execute2's own return value is ignored; only threads_ret is inspected below */
 
-            slice_num++;
-        }
-    }
+    for (i = 0; i < ctx->slice_count; i++) /* scan job results in index order */
+        if (threads_ret[i] < 0)
+            return threads_ret[i]; /* lowest-indexed failing job wins */
 
-    return avctx->execute(avctx, (void *) decode_slice,
-                          ctx->slice_data, NULL, slice_num,
-                          sizeof(ctx->slice_data[0]));
+    return 0;
 }
 
-
-#define FRAME_ID MKBETAG('i', 'c', 'p', 'f')
-#define MOVE_DATA_PTR(nbytes) buf += (nbytes); buf_size -= (nbytes)
-
 static int decode_frame(AVCodecContext *avctx, void *data, int *data_size,
                         AVPacket *avpkt)
 {
     ProresContext *ctx = avctx->priv_data;
-    AVFrame *picture   = avctx->coded_frame;
+    AVFrame *frame = avctx->coded_frame; /* decode target; copied into *data on success */
     const uint8_t *buf = avpkt->data;
-    int buf_size       = avpkt->size;
-    int frame_hdr_size, pic_num, pic_data_size;
-
-    /* check frame atom container */
-    if (buf_size < 28 || buf_size < AV_RB32(buf) ||
-        AV_RB32(buf + 4) != FRAME_ID) {
-        av_log(avctx, AV_LOG_ERROR, "invalid frame\n");
-        return AVERROR_INVALIDDATA;
+    int buf_size = avpkt->size;
+    int frame_hdr_size, pic_size;
+
+    if (buf_size < 28 || AV_RL32(buf + 4) != AV_RL32("icpf")) { /* reject packets under 28 bytes or whose bytes 4..7 are not "icpf" */
+        av_log(avctx, AV_LOG_ERROR, "invalid frame header\n");
+        return AVERROR_INVALIDDATA; /* malformed input: report the specific AVERROR code rather than a bare -1 */
     }
 
-    MOVE_DATA_PTR(8);
+    ctx->first_field = 1; /* reset per packet; cleared below before a second picture is decoded */
+
+    buf += 8; /* skip the 8 bytes examined above */
+    buf_size -= 8;
 
     frame_hdr_size = decode_frame_header(ctx, buf, buf_size, avctx);
     if (frame_hdr_size < 0)
-        return AVERROR_INVALIDDATA;
+        return AVERROR_INVALIDDATA; /* keep the error code callers saw before this change */
+
+    buf += frame_hdr_size; /* advance past the frame header */
+    buf_size -= frame_hdr_size;
 
-    MOVE_DATA_PTR(frame_hdr_size);
+    if (frame->data[0]) /* a buffer from an earlier call is still held */
+        avctx->release_buffer(avctx, frame);
 
-    if (picture->data[0])
-        avctx->release_buffer(avctx, picture);
+    if (avctx->get_buffer(avctx, frame) < 0)
+        return -1;
 
-    picture->reference = 0;
-    if (avctx->get_buffer(avctx, picture) < 0)
+ decode_picture: /* jumped back to at most once, by the goto below */
+    pic_size = decode_picture_header(avctx, buf, buf_size);
+    if (pic_size < 0) {
+        av_log(avctx, AV_LOG_ERROR, "error decoding picture header\n");
         return -1;
+    }
 
-    for (pic_num = 0; ctx->picture.interlaced_frame - pic_num + 1; pic_num++) {
-        pic_data_size = decode_picture_header(ctx, buf, buf_size, avctx);
-        if (pic_data_size < 0)
-            return AVERROR_INVALIDDATA;
+    if (decode_picture(avctx)) {
+        av_log(avctx, AV_LOG_ERROR, "error decoding picture\n");
+        return -1;
+    }
 
-        if (decode_picture(ctx, pic_num, avctx))
-            return -1;
+    buf += pic_size; /* advance past the picture just decoded */
+    buf_size -= pic_size;
 
-        MOVE_DATA_PTR(pic_data_size);
+    if (ctx->frame_type && buf_size > 0 && ctx->first_field) { /* nonzero frame_type with data left: decode one more picture (presumably the second field) */
+        ctx->first_field = 0;
+        goto decode_picture;
     }
 
-    *data_size       = sizeof(AVPicture);
-    *(AVFrame*) data = *avctx->coded_frame;
+    *data_size = sizeof(AVFrame); /* non-zero size signals that a frame was produced */
+    *(AVFrame*)data = *frame;
 
     return avpkt->size;
 }
 
-
 static av_cold int decode_close(AVCodecContext *avctx)
 {
     ProresContext *ctx = avctx->priv_data;
 
-    if (ctx->picture.data[0])
-        avctx->release_buffer(avctx, &ctx->picture);
-
-    av_freep(&ctx->slice_data);
+    AVFrame *frame = avctx->coded_frame; /* teardown works on the codec context's coded_frame */
+    if (frame->data[0]) /* only release when a buffer is actually held */
+        avctx->release_buffer(avctx, frame);
+    av_freep(&ctx->slices); /* frees the slice array via its address */
 
     return 0;
 }
 
-
 AVCodec ff_prores_decoder = {
     .name           = "prores",
     .type           = AVMEDIA_TYPE_VIDEO,
@@ -695,6 +588,6 @@ AVCodec ff_prores_decoder = {
     .init           = decode_init,
     .close          = decode_close,
     .decode         = decode_frame,
+    .long_name      = NULL_IF_CONFIG_SMALL("ProRes"),
     .capabilities   = CODEC_CAP_DR1 | CODEC_CAP_SLICE_THREADS,
-    .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes (iCodec Pro)")
 };