1 files changed, 475 insertions, 190 deletions
diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c
index 94430db..a71045e 100644
--- a/libavcodec/vp9.c
+++ b/libavcodec/vp9.c
@@ -36,6 +36,64 @@
 
 #define VP9_SYNCCODE 0x498342
 
+#if HAVE_THREADS
+static void vp9_free_entries(AVCodecContext *avctx) { /* release the progress counters and their mutex/cond */
+    VP9Context *s = avctx->priv_data;
+    /* mutex/cond exist only while s->entries does (see vp9_alloc_entries) */
+    if (!(avctx->active_thread_type & FF_THREAD_SLICE) || !s->entries)
+        return; /* nothing was set up, or it was already torn down */
+    pthread_mutex_destroy(&s->progress_mutex);
+    pthread_cond_destroy(&s->progress_cond);
+    av_freep(&s->entries); /* also resets s->entries to NULL, making repeat calls no-ops */
+}
+
+static int vp9_alloc_entries(AVCodecContext *avctx, int n) { /* (re)create n zeroed progress counters; 0 or AVERROR */
+    VP9Context *s = avctx->priv_data;
+    int i, ret;
+
+    if (!(avctx->active_thread_type & FF_THREAD_SLICE))
+        return 0; /* the counters are only set up for slice threading */
+
+    av_freep(&s->entries); /* NULL-safe, so no guard is needed */
+    s->entries = av_malloc_array(n, sizeof(atomic_int));
+    if (!s->entries)
+        return AVERROR(ENOMEM);
+    for (i = 0; i < n; i++)
+        atomic_init(&s->entries[i], 0);
+
+    /* On failure undo everything, so that s->entries != NULL always implies a
+     * live mutex/cond pair; pthread errors are positive errno values. */
+    ret = pthread_mutex_init(&s->progress_mutex, NULL);
+    if (!ret && (ret = pthread_cond_init(&s->progress_cond, NULL)))
+        pthread_mutex_destroy(&s->progress_mutex);
+    if (ret)
+        av_freep(&s->entries);
+    return AVERROR(ret); /* AVERROR(0) is 0, i.e. success */
+}
+
+static void vp9_report_tile_progress(VP9Context *s, int field, int n) { /* add n to counter `field`, then signal the cond */
+    pthread_mutex_lock(&s->progress_mutex); /* held so the update cannot fall between a waiter's check and its wait */
+    atomic_fetch_add_explicit(&s->entries[field], n, memory_order_release); /* release pairs with the lock-free acquire load in vp9_await_tile_progress() */
+    pthread_cond_signal(&s->progress_cond);
+    pthread_mutex_unlock(&s->progress_mutex);
+}
+
+static void vp9_await_tile_progress(VP9Context *s, int field, int n) { /* block until counter `field` has reached n */
+    if (atomic_load_explicit(&s->entries[field], memory_order_acquire) >= n)
+        return; /* fast path: already reached, no lock taken */
+    /* slow path: same ">= n" predicate as the fast path, so an overshoot cannot hang */
+    pthread_mutex_lock(&s->progress_mutex);
+    while (atomic_load_explicit(&s->entries[field], memory_order_relaxed) < n)
+        pthread_cond_wait(&s->progress_cond, &s->progress_mutex);
+    pthread_mutex_unlock(&s->progress_mutex);
+}
+#else
+static void vp9_free_entries(AVCodecContext *avctx) {} /* takes avctx like the threaded version, since callers pass avctx */
+static int vp9_alloc_entries(AVCodecContext *avctx, int n) { return 0; } /* nothing to allocate; always succeeds */
+static void vp9_report_tile_progress(VP9Context *s, int field, int n) {} /* no-op without threads */
+static void vp9_await_tile_progress(VP9Context *s, int field, int n) {} /* no-op without threads */
+#endif
+
 static void vp9_frame_unref(AVCodecContext *avctx, VP9Frame *f)
 {
     ff_thread_release_buffer(avctx, &f->tf);
@@ -118,6 +176,7 @@ static int update_size(AVCodecContext *avctx, int w, int h)
     VP9Context *s = avctx->priv_data;
     uint8_t *p;
     int bytesperpixel = s->bytesperpixel, ret, cols, rows;
+    int lflvl_len, i;
 
     av_assert0(w > 0 && h > 0);
 
@@ -170,13 +229,14 @@ static int update_size(AVCodecContext *avctx, int w, int h)
     s->sb_rows   = (h + 63) >> 6;
     s->cols      = (w + 7) >> 3;
     s->rows      = (h + 7) >> 3;
+    lflvl_len    = avctx->active_thread_type == FF_THREAD_SLICE ? s->sb_rows : 1;
 
 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
     av_freep(&s->intra_pred_data[0]);
     // FIXME we slightly over-allocate here for subsampled chroma, but a little
     // bit of padding shouldn't affect performance...
     p = av_malloc(s->sb_cols * (128 + 192 * bytesperpixel +
-                                sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
+                                lflvl_len * sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
     if (!p)
         return AVERROR(ENOMEM);
     assign(s->intra_pred_data[0],  uint8_t *,             64 * bytesperpixel);
@@ -195,12 +255,15 @@ static int update_size(AVCodecContext *avctx, int w, int h)
     assign(s->above_comp_ctx,      uint8_t *,              8);
     assign(s->above_ref_ctx,       uint8_t *,              8);
     assign(s->above_filter_ctx,    uint8_t *,              8);
-    assign(s->lflvl,               VP9Filter *,            1);
+    assign(s->lflvl,               VP9Filter *,            lflvl_len);
 #undef assign
 
-    // these will be re-allocated a little later
-    av_freep(&s->b_base);
-    av_freep(&s->block_base);
+    if (s->td) {
+        for (i = 0; i < s->active_tile_cols; i++) {
+            av_freep(&s->td[i].b_base);
+            av_freep(&s->td[i].block_base);
+        }
+    }
 
     if (s->s.h.bpp != s->last_bpp) {
         ff_vp9dsp_init(&s->dsp, s->s.h.bpp, avctx->flags & AV_CODEC_FLAG_BITEXACT);
@@ -213,40 +276,50 @@ static int update_size(AVCodecContext *avctx, int w, int h)
 
 static int update_block_buffers(AVCodecContext *avctx)
 {
+    int i;
     VP9Context *s = avctx->priv_data;
     int chroma_blocks, chroma_eobs, bytesperpixel = s->bytesperpixel;
+    VP9TileData *td = &s->td[0];
 
-    if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
+    if (td->b_base && td->block_base && s->block_alloc_using_2pass == s->s.frames[CUR_FRAME].uses_2pass)
         return 0;
 
-    av_free(s->b_base);
-    av_free(s->block_base);
+    av_free(td->b_base);
+    av_free(td->block_base);
     chroma_blocks = 64 * 64 >> (s->ss_h + s->ss_v);
     chroma_eobs   = 16 * 16 >> (s->ss_h + s->ss_v);
     if (s->s.frames[CUR_FRAME].uses_2pass) {
         int sbs = s->sb_cols * s->sb_rows;
 
-        s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
-        s->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
+        td->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
+        td->block_base = av_mallocz(((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
                                     16 * 16 + 2 * chroma_eobs) * sbs);
-        if (!s->b_base || !s->block_base)
+        if (!td->b_base || !td->block_base)
             return AVERROR(ENOMEM);
-        s->uvblock_base[0] = s->block_base + sbs * 64 * 64 * bytesperpixel;
-        s->uvblock_base[1] = s->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
-        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
-        s->uveob_base[0] = s->eob_base + 16 * 16 * sbs;
-        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs * sbs;
+        td->uvblock_base[0] = td->block_base + sbs * 64 * 64 * bytesperpixel;
+        td->uvblock_base[1] = td->uvblock_base[0] + sbs * chroma_blocks * bytesperpixel;
+        td->eob_base = (uint8_t *) (td->uvblock_base[1] + sbs * chroma_blocks * bytesperpixel);
+        td->uveob_base[0] = td->eob_base + 16 * 16 * sbs;
+        td->uveob_base[1] = td->uveob_base[0] + chroma_eobs * sbs;
     } else {
-        s->b_base = av_malloc(sizeof(VP9Block));
-        s->block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
-                                   16 * 16 + 2 * chroma_eobs);
-        if (!s->b_base || !s->block_base)
-            return AVERROR(ENOMEM);
-        s->uvblock_base[0] = s->block_base + 64 * 64 * bytesperpixel;
-        s->uvblock_base[1] = s->uvblock_base[0] + chroma_blocks * bytesperpixel;
-        s->eob_base = (uint8_t *) (s->uvblock_base[1] + chroma_blocks * bytesperpixel);
-        s->uveob_base[0] = s->eob_base + 16 * 16;
-        s->uveob_base[1] = s->uveob_base[0] + chroma_eobs;
+        for (i = 1; i < s->active_tile_cols; i++) {
+            if (s->td[i].b_base && s->td[i].block_base) {
+                av_free(s->td[i].b_base);
+                av_free(s->td[i].block_base);
+            }
+        }
+        for (i = 0; i < s->active_tile_cols; i++) {
+            s->td[i].b_base = av_malloc(sizeof(VP9Block));
+            s->td[i].block_base = av_mallocz((64 * 64 + 2 * chroma_blocks) * bytesperpixel * sizeof(int16_t) +
+                                       16 * 16 + 2 * chroma_eobs);
+            if (!s->td[i].b_base || !s->td[i].block_base)
+                return AVERROR(ENOMEM);
+            s->td[i].uvblock_base[0] = s->td[i].block_base + 64 * 64 * bytesperpixel;
+            s->td[i].uvblock_base[1] = s->td[i].uvblock_base[0] + chroma_blocks * bytesperpixel;
+            s->td[i].eob_base = (uint8_t *) (s->td[i].uvblock_base[1] + chroma_blocks * bytesperpixel);
+            s->td[i].uveob_base[0] = s->td[i].eob_base + 16 * 16;
+            s->td[i].uveob_base[1] = s->td[i].uveob_base[0] + chroma_eobs;
+        }
     }
     s->block_alloc_using_2pass = s->s.frames[CUR_FRAME].uses_2pass;
 
@@ -543,8 +616,20 @@ static int decode_frame_header(AVCodecContext *avctx,
     sharp = get_bits(&s->gb, 3);
     // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
     // the old cache values since they are still valid
-    if (s->s.h.filter.sharpness != sharp)
-        memset(s->filter_lut.lim_lut, 0, sizeof(s->filter_lut.lim_lut));
+    if (s->s.h.filter.sharpness != sharp) {
+        for (i = 1; i <= 63; i++) {
+            int limit = i;
+
+            if (sharp > 0) {
+                limit >>= (sharp + 3) >> 2;
+                limit = FFMIN(limit, 9 - sharp);
+            }
+            limit = FFMAX(limit, 1);
+
+            s->filter_lut.lim_lut[i] = limit;
+            s->filter_lut.mblim_lut[i] = 2 * (i + 2) + limit;
+        }
+    }
     s->s.h.filter.sharpness = sharp;
     if ((s->s.h.lf_delta.enabled = get_bits1(&s->gb))) {
         if ((s->s.h.lf_delta.updated = get_bits1(&s->gb))) {
@@ -662,12 +747,36 @@ static int decode_frame_header(AVCodecContext *avctx,
     s->s.h.tiling.log2_tile_rows = decode012(&s->gb);
     s->s.h.tiling.tile_rows = 1 << s->s.h.tiling.log2_tile_rows;
     if (s->s.h.tiling.tile_cols != (1 << s->s.h.tiling.log2_tile_cols)) {
+        int n_range_coders;
+        VP56RangeCoder *rc;
+
+        if (s->td) {
+            for (i = 0; i < s->active_tile_cols; i++) {
+                av_free(s->td[i].b_base);
+                av_free(s->td[i].block_base);
+            }
+            av_free(s->td);
+        }
+
         s->s.h.tiling.tile_cols = 1 << s->s.h.tiling.log2_tile_cols;
-        s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
-                                 sizeof(VP56RangeCoder) * s->s.h.tiling.tile_cols);
-        if (!s->c_b) {
-            av_log(avctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
+        vp9_free_entries(avctx);
+        s->active_tile_cols = avctx->active_thread_type == FF_THREAD_SLICE ?
+                              s->s.h.tiling.tile_cols : 1;
+        vp9_alloc_entries(avctx, s->sb_rows);
+        if (avctx->active_thread_type == FF_THREAD_SLICE) {
+            n_range_coders = 4; // max_tile_rows
+        } else {
+            n_range_coders = s->s.h.tiling.tile_cols;
+        }
+        s->td = av_mallocz_array(s->active_tile_cols, sizeof(VP9TileData) +
+                                 n_range_coders * sizeof(VP56RangeCoder));
+        if (!s->td)
             return AVERROR(ENOMEM);
+        rc = (VP56RangeCoder *) &s->td[s->active_tile_cols];
+        for (i = 0; i < s->active_tile_cols; i++) {
+            s->td[i].s = s;
+            s->td[i].c_b = rc;
+            rc += n_range_coders;
         }
     }
 
@@ -735,12 +844,15 @@ static int decode_frame_header(AVCodecContext *avctx,
         return AVERROR_INVALIDDATA;
     }
 
-    if (s->s.h.keyframe || s->s.h.intraonly) {
-        memset(s->counts.coef, 0, sizeof(s->counts.coef));
-        memset(s->counts.eob,  0, sizeof(s->counts.eob));
-    } else {
-        memset(&s->counts, 0, sizeof(s->counts));
+    for (i = 0; i < s->active_tile_cols; i++) {
+        if (s->s.h.keyframe || s->s.h.intraonly) {
+            memset(s->td[i].counts.coef, 0, sizeof(s->td[0].counts.coef));
+            memset(s->td[i].counts.eob,  0, sizeof(s->td[0].counts.eob));
+        } else {
+            memset(&s->td[i].counts, 0, sizeof(s->td[0].counts));
+        }
     }
+
     /* FIXME is it faster to not copy here, but do it down in the fw updates
      * as explicit copies if the fw update is missing (and skip the copy upon
      * fw update)? */
@@ -789,7 +901,7 @@ static int decode_frame_header(AVCodecContext *avctx,
                                 else
                                     p[n] = r[n];
                             }
-                            p[3] = 0;
+                            memcpy(&p[3], ff_vp9_model_pareto8[p[2]], 8);
                         }
         } else {
             for (j = 0; j < 2; j++)
@@ -801,7 +913,7 @@ static int decode_frame_header(AVCodecContext *avctx,
                             if (m > 3 && l == 0) // dc only has 3 pt
                                 break;
                             memcpy(p, r, 3);
-                            p[3] = 0;
+                            memcpy(&p[3], ff_vp9_model_pareto8[p[2]], 8);
                         }
         }
         if (s->s.h.txfmmode == i)
@@ -929,12 +1041,12 @@ static int decode_frame_header(AVCodecContext *avctx,
     return (data2 - data) + size2;
 }
 
-static void decode_sb(AVCodecContext *avctx, int row, int col, VP9Filter *lflvl,
+static void decode_sb(VP9TileData *td, int row, int col, VP9Filter *lflvl,
                       ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
 {
-    VP9Context *s = avctx->priv_data;
+    const VP9Context *s = td->s;
     int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
-            (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
+            (((td->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
     const uint8_t *p = s->s.h.keyframe || s->s.h.intraonly ? ff_vp9_default_kf_partition_probs[bl][c] :
                                                      s->prob.p.partition[bl][c];
     enum BlockPartition bp;
@@ -944,75 +1056,75 @@ static void decode_sb(AVCodecContext *avctx, int row, int col, VP9Filter *lflvl,
     int bytesperpixel = s->bytesperpixel;
 
     if (bl == BL_8X8) {
-        bp = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
-        ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+        bp = vp8_rac_get_tree(td->c, ff_vp9_partition_tree, p);
+        ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
     } else if (col + hbs < s->cols) { // FIXME why not <=?
         if (row + hbs < s->rows) { // FIXME why not <=?
-            bp = vp8_rac_get_tree(&s->c, ff_vp9_partition_tree, p);
+            bp = vp8_rac_get_tree(td->c, ff_vp9_partition_tree, p);
             switch (bp) {
             case PARTITION_NONE:
-                ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+                ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_H:
-                ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+                ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
                 yoff  += hbs * 8 * y_stride;
                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
-                ff_vp9_decode_block(avctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
+                ff_vp9_decode_block(td, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_V:
-                ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+                ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
                 yoff  += hbs * 8 * bytesperpixel;
                 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
-                ff_vp9_decode_block(avctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
+                ff_vp9_decode_block(td, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
                 break;
             case PARTITION_SPLIT:
-                decode_sb(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
-                decode_sb(avctx, row, col + hbs, lflvl,
+                decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb(td, row, col + hbs, lflvl,
                           yoff + 8 * hbs * bytesperpixel,
                           uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                 yoff  += hbs * 8 * y_stride;
                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
-                decode_sb(avctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
-                decode_sb(avctx, row + hbs, col + hbs, lflvl,
+                decode_sb(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb(td, row + hbs, col + hbs, lflvl,
                           yoff + 8 * hbs * bytesperpixel,
                           uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                 break;
             default:
                 av_assert0(0);
             }
-        } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
+        } else if (vp56_rac_get_prob_branchy(td->c, p[1])) {
             bp = PARTITION_SPLIT;
-            decode_sb(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
-            decode_sb(avctx, row, col + hbs, lflvl,
+            decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
+            decode_sb(td, row, col + hbs, lflvl,
                       yoff + 8 * hbs * bytesperpixel,
                       uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
         } else {
             bp = PARTITION_H;
-            ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+            ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
         }
     } else if (row + hbs < s->rows) { // FIXME why not <=?
-        if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
+        if (vp56_rac_get_prob_branchy(td->c, p[2])) {
             bp = PARTITION_SPLIT;
-            decode_sb(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+            decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
             yoff  += hbs * 8 * y_stride;
             uvoff += hbs * 8 * uv_stride >> s->ss_v;
-            decode_sb(avctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+            decode_sb(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
         } else {
             bp = PARTITION_V;
-            ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, bl, bp);
+            ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, bl, bp);
         }
     } else {
         bp = PARTITION_SPLIT;
-        decode_sb(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+        decode_sb(td, row, col, lflvl, yoff, uvoff, bl + 1);
     }
-    s->counts.partition[bl][c][bp]++;
+    td->counts.partition[bl][c][bp]++;
 }
 
-static void decode_sb_mem(AVCodecContext *avctx, int row, int col, VP9Filter *lflvl,
+static void decode_sb_mem(VP9TileData *td, int row, int col, VP9Filter *lflvl,
                           ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
 {
-    VP9Context *s = avctx->priv_data;
-    VP9Block *b = s->b;
+    const VP9Context *s = td->s;
+    VP9Block *b = td->b;
     ptrdiff_t hbs = 4 >> bl;
     AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
@@ -1020,39 +1132,39 @@ static void decode_sb_mem(AVCodecContext *avctx, int row, int col, VP9Filter *lf
 
     if (bl == BL_8X8) {
         av_assert2(b->bl == BL_8X8);
-        ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
-    } else if (s->b->bl == bl) {
-        ff_vp9_decode_block(avctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
+        ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
+    } else if (td->b->bl == bl) {
+        ff_vp9_decode_block(td, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
         if (b->bp == PARTITION_H && row + hbs < s->rows) {
             yoff  += hbs * 8 * y_stride;
             uvoff += hbs * 8 * uv_stride >> s->ss_v;
-            ff_vp9_decode_block(avctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
+            ff_vp9_decode_block(td, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
         } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
             yoff  += hbs * 8 * bytesperpixel;
             uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
-            ff_vp9_decode_block(avctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
+            ff_vp9_decode_block(td, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
         }
     } else {
-        decode_sb_mem(avctx, row, col, lflvl, yoff, uvoff, bl + 1);
+        decode_sb_mem(td, row, col, lflvl, yoff, uvoff, bl + 1);
         if (col + hbs < s->cols) { // FIXME why not <=?
             if (row + hbs < s->rows) {
-                decode_sb_mem(avctx, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
+                decode_sb_mem(td, row, col + hbs, lflvl, yoff + 8 * hbs * bytesperpixel,
                               uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
                 yoff  += hbs * 8 * y_stride;
                 uvoff += hbs * 8 * uv_stride >> s->ss_v;
-                decode_sb_mem(avctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
-                decode_sb_mem(avctx, row + hbs, col + hbs, lflvl,
+                decode_sb_mem(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+                decode_sb_mem(td, row + hbs, col + hbs, lflvl,
                               yoff + 8 * hbs * bytesperpixel,
                               uvoff + (8 * hbs * bytesperpixel >> s->ss_h), bl + 1);
             } else {
                 yoff  += hbs * 8 * bytesperpixel;
                 uvoff += hbs * 8 * bytesperpixel >> s->ss_h;
-                decode_sb_mem(avctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
+                decode_sb_mem(td, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
             }
         } else if (row + hbs < s->rows) {
             yoff  += hbs * 8 * y_stride;
             uvoff += hbs * 8 * uv_stride >> s->ss_v;
-            decode_sb_mem(avctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
+            decode_sb_mem(td, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
         }
     }
 }
@@ -1067,9 +1179,13 @@ static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
 
 static void free_buffers(VP9Context *s)
 {
+    int i;
+    /* release the allocation anchored at intra_pred_data[0], then the per-tile block buffers */
     av_freep(&s->intra_pred_data[0]);
-    av_freep(&s->b_base);
-    av_freep(&s->block_base);
+    for (i = 0; s->td && i < s->active_tile_cols; i++) { /* s->td is NULL after a failed tile-data allocation */
+        av_freep(&s->td[i].b_base);
+        av_freep(&s->td[i].block_base);
+    }
 }
 
 static av_cold int vp9_decode_free(AVCodecContext *avctx)
@@ -1090,10 +1206,249 @@ static av_cold int vp9_decode_free(AVCodecContext *avctx)
             ff_thread_release_buffer(avctx, &s->next_refs[i]);
         av_frame_free(&s->next_refs[i].f);
     }
+
     free_buffers(s);
-    av_freep(&s->c_b);
-    s->c_b_size = 0;
+    vp9_free_entries(avctx);
+    av_freep(&s->td);
+    return 0;
+}
+
+/* Decode all tiles of the current frame through s->td[0], one superblock row at
+ * a time (reconstruct, then loop-filter). Returns 0 or a negative AVERROR code. */
+static int decode_tiles(AVCodecContext *avctx,
+                        const uint8_t *data, int size)
+{
+    VP9Context *s = avctx->priv_data;
+    VP9TileData *td = &s->td[0]; /* the only tile-data slot this path touches */
+    int row, col, tile_row, tile_col, ret;
+    int bytesperpixel;
+    int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
+    AVFrame *f;
+    ptrdiff_t yoff, uvoff, ls_y, ls_uv;
+
+    f = s->s.frames[CUR_FRAME].tf.f;
+    ls_y = f->linesize[0];
+    ls_uv = f->linesize[1];
+    bytesperpixel = s->bytesperpixel;
+
+    yoff = uvoff = 0;
+    for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
+        set_tile_offset(&tile_row_start, &tile_row_end,
+                        tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
+        /* init one range decoder per tile column; all error exits report progress */
+        for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
+            int64_t tile_size;
 
+            if (tile_col == s->s.h.tiling.tile_cols - 1 &&
+                tile_row == s->s.h.tiling.tile_rows - 1) {
+                tile_size = size; /* the very last tile takes whatever data is left */
+            } else {
+                tile_size = AV_RB32(data); /* 32-bit big-endian size prefix */
+                data += 4;
+                size -= 4;
+            }
+            if (tile_size > size) {
+                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                return AVERROR_INVALIDDATA;
+            }
+            ret = ff_vp56_init_range_decoder(&td->c_b[tile_col], data, tile_size);
+            if (ret < 0 || vp56_rac_get_prob_branchy(&td->c_b[tile_col], 128)) { // marker bit
+                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                return ret < 0 ? ret : AVERROR_INVALIDDATA; /* an init error wins; the marker is only read after a good init */
+            }
+            data += tile_size;
+            size -= tile_size;
+        }
+
+        for (row = tile_row_start; row < tile_row_end;
+             row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
+            VP9Filter *lflvl_ptr = s->lflvl;
+            ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
+
+            for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
+                set_tile_offset(&tile_col_start, &tile_col_end,
+                                tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
+                td->tile_col_start = tile_col_start;
+                if (s->pass != 2) { /* pass 2 goes through decode_sb_mem() below and skips this setup */
+                    memset(td->left_partition_ctx, 0, 8);
+                    memset(td->left_skip_ctx, 0, 8);
+                    if (s->s.h.keyframe || s->s.h.intraonly) {
+                        memset(td->left_mode_ctx, DC_PRED, 16);
+                    } else {
+                        memset(td->left_mode_ctx, NEARESTMV, 8);
+                    }
+                    memset(td->left_y_nnz_ctx, 0, 16);
+                    memset(td->left_uv_nnz_ctx, 0, 32);
+                    memset(td->left_segpred_ctx, 0, 8);
+
+                    td->c = &td->c_b[tile_col]; /* switch to this tile column's range decoder */
+                }
+
+                for (col = tile_col_start;
+                     col < tile_col_end;
+                     col += 8, yoff2 += 64 * bytesperpixel,
+                     uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                    // FIXME integrate with lf code (i.e. zero after each
+                    // use, similar to invtxfm coefficients, or similar)
+                    if (s->pass != 1) {
+                        memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
+                    }
+
+                    if (s->pass == 2) {
+                        decode_sb_mem(td, row, col, lflvl_ptr,
+                                      yoff2, uvoff2, BL_64X64);
+                    } else {
+                        decode_sb(td, row, col, lflvl_ptr,
+                                  yoff2, uvoff2, BL_64X64);
+                    }
+                }
+            }
+
+            if (s->pass == 1)
+                continue; /* pass 1 stops here: no backup, loop filter or progress report */
+
+            // backup pre-loopfilter reconstruction data for intra
+            // prediction of next row of sb64s
+            if (row + 8 < s->rows) {
+                memcpy(s->intra_pred_data[0],
+                       f->data[0] + yoff + 63 * ls_y,
+                       8 * s->cols * bytesperpixel);
+                memcpy(s->intra_pred_data[1],
+                       f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                       8 * s->cols * bytesperpixel >> s->ss_h);
+                memcpy(s->intra_pred_data[2],
+                       f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                       8 * s->cols * bytesperpixel >> s->ss_h);
+            }
+
+            // loopfilter one row
+            if (s->s.h.filter.level) {
+                yoff2 = yoff;
+                uvoff2 = uvoff;
+                lflvl_ptr = s->lflvl;
+                for (col = 0; col < s->cols;
+                     col += 8, yoff2 += 64 * bytesperpixel,
+                     uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                    ff_vp9_loopfilter_sb(avctx, lflvl_ptr, row, col,
+                                         yoff2, uvoff2);
+                }
+            }
+
+            // FIXME maybe we can make this more finegrained by running the
+            // loopfilter per-block instead of after each sbrow
+            // In fact that would also make intra pred left preparation easier?
+            ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
+        }
+    }
+    return 0;
+}
+
+
+static av_always_inline
+int decode_tiles_mt(AVCodecContext *avctx, void *tdata, int jobnr,
+                              int threadnr)
+{
+    VP9Context *s = avctx->priv_data;
+    VP9TileData *td = &s->td[jobnr];
+    ptrdiff_t uvoff, yoff, ls_y, ls_uv;
+    int bytesperpixel = s->bytesperpixel, row, col, tile_row;
+    unsigned tile_cols_len;
+    int tile_row_start, tile_row_end, tile_col_start, tile_col_end;
+    VP9Filter *lflvl_ptr_base;
+    AVFrame *f;
+    /* Job body: decodes tile column `jobnr` through every tile row; tdata and threadnr are unused. */
+    f = s->s.frames[CUR_FRAME].tf.f;
+    ls_y = f->linesize[0];
+    ls_uv =f->linesize[1];
+    /* starting plane offsets and filter-level slot of this tile column within the first sb row */
+    set_tile_offset(&tile_col_start, &tile_col_end,
+                    jobnr, s->s.h.tiling.log2_tile_cols, s->sb_cols);
+    td->tile_col_start  = tile_col_start;
+    uvoff = (64 * bytesperpixel >> s->ss_h)*(tile_col_start >> 3);
+    yoff = (64 * bytesperpixel)*(tile_col_start >> 3);
+    lflvl_ptr_base = s->lflvl+(tile_col_start >> 3);
+
+    for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
+        set_tile_offset(&tile_row_start, &tile_row_end,
+                        tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
+        /* within this tile column's data, c_b is indexed by tile row */
+        td->c = &td->c_b[tile_row];
+        for (row = tile_row_start; row < tile_row_end;
+             row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
+            ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
+            VP9Filter *lflvl_ptr = lflvl_ptr_base+s->sb_cols*(row >> 3); /* s->lflvl is indexed as sb_cols entries per sb row here */
+            /* reset this tile's left-edge contexts at the start of every superblock row */
+            memset(td->left_partition_ctx, 0, 8);
+            memset(td->left_skip_ctx, 0, 8);
+            if (s->s.h.keyframe || s->s.h.intraonly) {
+                memset(td->left_mode_ctx, DC_PRED, 16);
+            } else {
+                memset(td->left_mode_ctx, NEARESTMV, 8);
+            }
+            memset(td->left_y_nnz_ctx, 0, 16);
+            memset(td->left_uv_nnz_ctx, 0, 32);
+            memset(td->left_segpred_ctx, 0, 8);
+
+            for (col = tile_col_start;
+                 col < tile_col_end;
+                 col += 8, yoff2 += 64 * bytesperpixel,
+                 uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
+                // FIXME integrate with lf code (i.e. zero after each
+                // use, similar to invtxfm coefficients, or similar)
+                memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
+                decode_sb(td, row, col, lflvl_ptr,
+                            yoff2, uvoff2, BL_64X64);
+            }
+
+            // backup pre-loopfilter reconstruction data for intra
+            // prediction of next row of sb64s
+            tile_cols_len = tile_col_end - tile_col_start; /* copy only this tile column's span of the shared buffers */
+            if (row + 8 < s->rows) {
+                memcpy(s->intra_pred_data[0] + (tile_col_start * 8 * bytesperpixel),
+                       f->data[0] + yoff + 63 * ls_y,
+                       8 * tile_cols_len * bytesperpixel);
+                memcpy(s->intra_pred_data[1] + (tile_col_start * 8 * bytesperpixel >> s->ss_h),
+                       f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                       8 * tile_cols_len * bytesperpixel >> s->ss_h);
+                memcpy(s->intra_pred_data[2] + (tile_col_start * 8 * bytesperpixel >> s->ss_h),
+                       f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
+                       8 * tile_cols_len * bytesperpixel >> s->ss_h);
+            }
+            /* adds 1 to the progress counter of superblock row (row >> 3) */
+            vp9_report_tile_progress(s, row >> 3, 1);
+        }
+    }
+    return 0;
+}
+
+static av_always_inline
+int loopfilter_proc(AVCodecContext *avctx)
+{
+    /* Loop-filter the frame one superblock row at a time, starting each row
+     * only after waiting for its counter to reach the tile-column count. */
+    VP9Context *s = avctx->priv_data;
+    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;
+    ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
+    int sb_bytes = 64 * s->bytesperpixel; /* horizontal luma step per superblock, in bytes */
+    int sbrow, col;
+
+    for (sbrow = 0; sbrow < s->sb_rows; sbrow++) {
+        ptrdiff_t yoff, uvoff;
+        VP9Filter *lflvl_ptr;
+
+        vp9_await_tile_progress(s, sbrow, s->s.h.tiling.tile_cols);
+        if (!s->s.h.filter.level)
+            continue; /* every row is still awaited, just not filtered */
+
+        yoff      = (ls_y * 64) * sbrow;
+        uvoff     = (ls_uv * 64 >> s->ss_v) * sbrow;
+        lflvl_ptr = s->lflvl + s->sb_cols * sbrow;
+        for (col = 0; col < s->cols; col += 8, lflvl_ptr++) {
+            ff_vp9_loopfilter_sb(avctx, lflvl_ptr, sbrow << 3, col, yoff, uvoff);
+            yoff  += sb_bytes;
+            uvoff += sb_bytes >> s->ss_h;
+        }
+    }
     return 0;
 }
 
@@ -1104,12 +1459,10 @@ static int vp9_decode_frame(AVCodecContext *avctx, void *frame,
     const uint8_t *data = pkt->data;
     int size = pkt->size;
     VP9Context *s = avctx->priv_data;
-    int ret, tile_row, tile_col, i, ref, row, col;
+    int ret, i, j, ref;
     int retain_segmap_ref = s->s.frames[REF_FRAME_SEGMAP].segmentation_map &&
                             (!s->s.h.segmentation.enabled || !s->s.h.segmentation.update_map);
-    ptrdiff_t yoff, uvoff, ls_y, ls_uv;
     AVFrame *f;
-    int bytesperpixel;
 
     if ((ret = decode_frame_header(avctx, data, size, &ref)) < 0) {
         return ret;
@@ -1159,8 +1512,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
     f = s->s.frames[CUR_FRAME].tf.f;
     f->key_frame = s->s.h.keyframe;
     f->pict_type = (s->s.h.keyframe || s->s.h.intraonly) ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
-    ls_y = f->linesize[0];
-    ls_uv =f->linesize[1];
 
     if (s->s.frames[REF_FRAME_SEGMAP].tf.f->buf[0] &&
         (s->s.frames[REF_FRAME_MVPAIR].tf.f->width  != s->s.frames[CUR_FRAME].tf.f->width ||
@@ -1195,7 +1546,6 @@ FF_ENABLE_DEPRECATION_WARNINGS
     }
 
     // main tile decode loop
-    bytesperpixel = s->bytesperpixel;
     memset(s->above_partition_ctx, 0, s->cols);
     memset(s->above_skip_ctx, 0, s->cols);
     if (s->s.h.keyframe || s->s.h.intraonly) {
@@ -1233,20 +1583,28 @@ FF_ENABLE_DEPRECATION_WARNINGS
         ff_thread_finish_setup(avctx);
     }
 
+    if (avctx->active_thread_type & FF_THREAD_SLICE) {
+        for (i = 0; i < s->sb_rows; i++)
+            atomic_store(&s->entries[i], 0);
+    }
+
     do {
-        yoff = uvoff = 0;
-        s->b = s->b_base;
-        s->block = s->block_base;
-        s->uvblock[0] = s->uvblock_base[0];
-        s->uvblock[1] = s->uvblock_base[1];
-        s->eob = s->eob_base;
-        s->uveob[0] = s->uveob_base[0];
-        s->uveob[1] = s->uveob_base[1];
-
-        for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
-            set_tile_offset(&s->tile_row_start, &s->tile_row_end,
-                            tile_row, s->s.h.tiling.log2_tile_rows, s->sb_rows);
-            if (s->pass != 2) {
+        for (i = 0; i < s->active_tile_cols; i++) {
+            s->td[i].b = s->td[i].b_base;
+            s->td[i].block = s->td[i].block_base;
+            s->td[i].uvblock[0] = s->td[i].uvblock_base[0];
+            s->td[i].uvblock[1] = s->td[i].uvblock_base[1];
+            s->td[i].eob = s->td[i].eob_base;
+            s->td[i].uveob[0] = s->td[i].uveob_base[0];
+            s->td[i].uveob[1] = s->td[i].uveob_base[1];
+        }
+
+        if (avctx->active_thread_type == FF_THREAD_SLICE) {
+            int tile_row, tile_col;
+
+            assert(!pass);
+
+            for (tile_row = 0; tile_row < s->s.h.tiling.tile_rows; tile_row++) {
                 for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
                     int64_t tile_size;
 
@@ -1258,105 +1616,31 @@ FF_ENABLE_DEPRECATION_WARNINGS
                         data += 4;
                         size -= 4;
                     }
-                    if (tile_size > size) {
-                        ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                    if (tile_size > size)
                         return AVERROR_INVALIDDATA;
-                    }
-                    ret = ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
+                    ret = ff_vp56_init_range_decoder(&s->td[tile_col].c_b[tile_row], data, tile_size);
                     if (ret < 0)
                         return ret;
-                    if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
-                        ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, INT_MAX, 0);
+                    if (vp56_rac_get_prob_branchy(&s->td[tile_col].c_b[tile_row], 128)) // marker bit
                         return AVERROR_INVALIDDATA;
-                    }
                     data += tile_size;
                     size -= tile_size;
                 }
             }
 
-            for (row = s->tile_row_start; row < s->tile_row_end;
-                 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 64 >> s->ss_v) {
-                VP9Filter *lflvl_ptr = s->lflvl;
-                ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
-
-                for (tile_col = 0; tile_col < s->s.h.tiling.tile_cols; tile_col++) {
-                    set_tile_offset(&s->tile_col_start, &s->tile_col_end,
-                                    tile_col, s->s.h.tiling.log2_tile_cols, s->sb_cols);
-
-                    if (s->pass != 2) {
-                        memset(s->left_partition_ctx, 0, 8);
-                        memset(s->left_skip_ctx, 0, 8);
-                        if (s->s.h.keyframe || s->s.h.intraonly) {
-                            memset(s->left_mode_ctx, DC_PRED, 16);
-                        } else {
-                            memset(s->left_mode_ctx, NEARESTMV, 8);
-                        }
-                        memset(s->left_y_nnz_ctx, 0, 16);
-                        memset(s->left_uv_nnz_ctx, 0, 32);
-                        memset(s->left_segpred_ctx, 0, 8);
-
-                        memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
-                    }
-
-                    for (col = s->tile_col_start;
-                         col < s->tile_col_end;
-                         col += 8, yoff2 += 64 * bytesperpixel,
-                         uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
-                        // FIXME integrate with lf code (i.e. zero after each
-                        // use, similar to invtxfm coefficients, or similar)
-                        if (s->pass != 1) {
-                            memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
-                        }
-
-                        if (s->pass == 2) {
-                            decode_sb_mem(avctx, row, col, lflvl_ptr,
-                                          yoff2, uvoff2, BL_64X64);
-                        } else {
-                            decode_sb(avctx, row, col, lflvl_ptr,
-                                      yoff2, uvoff2, BL_64X64);
-                        }
-                    }
-                    if (s->pass != 2)
-                        memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
-                }
-
-                if (s->pass == 1)
-                    continue;
-
-                // backup pre-loopfilter reconstruction data for intra
-                // prediction of next row of sb64s
-                if (row + 8 < s->rows) {
-                    memcpy(s->intra_pred_data[0],
-                           f->data[0] + yoff + 63 * ls_y,
-                           8 * s->cols * bytesperpixel);
-                    memcpy(s->intra_pred_data[1],
-                           f->data[1] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
-                           8 * s->cols * bytesperpixel >> s->ss_h);
-                    memcpy(s->intra_pred_data[2],
-                           f->data[2] + uvoff + ((64 >> s->ss_v) - 1) * ls_uv,
-                           8 * s->cols * bytesperpixel >> s->ss_h);
-                }
-
-                // loopfilter one row
-                if (s->s.h.filter.level) {
-                    yoff2 = yoff;
-                    uvoff2 = uvoff;
-                    lflvl_ptr = s->lflvl;
-                    for (col = 0; col < s->cols;
-                         col += 8, yoff2 += 64 * bytesperpixel,
-                         uvoff2 += 64 * bytesperpixel >> s->ss_h, lflvl_ptr++) {
-                        ff_vp9_loopfilter_sb(avctx, lflvl_ptr, row, col,
-                                             yoff2, uvoff2);
-                    }
-                }
-
-                // FIXME maybe we can make this more finegrained by running the
-                // loopfilter per-block instead of after each sbrow
-                // In fact that would also make intra pred left preparation easier?
-                ff_thread_report_progress(&s->s.frames[CUR_FRAME].tf, row >> 3, 0);
-            }
+            ff_slice_thread_execute_with_mainfunc(avctx, decode_tiles_mt, loopfilter_proc, s->td, NULL, s->s.h.tiling.tile_cols);
+        } else {
+            ret = decode_tiles(avctx, data, size);
+            if (ret < 0)
+                return ret;
         }
 
+        // Sum all counts fields into td[0].counts for tile threading
+        if (avctx->active_thread_type == FF_THREAD_SLICE)
+            for (i = 1; i < s->s.h.tiling.tile_cols; i++)
+                for (j = 0; j < sizeof(s->td[i].counts) / sizeof(unsigned); j++)
+                    ((unsigned *)&s->td[0].counts)[j] += ((unsigned *)&s->td[i].counts)[j];
+
         if (s->pass < 2 && s->s.h.refreshctx && !s->s.h.parallelmode) {
             ff_vp9_adapt_probs(s);
             ff_thread_finish_setup(avctx);
@@ -1492,7 +1776,8 @@ AVCodec ff_vp9_decoder = {
     .init                  = vp9_decode_init,
     .close                 = vp9_decode_free,
     .decode                = vp9_decode_frame,
-    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS,
+    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS | AV_CODEC_CAP_SLICE_THREADS,
+    .caps_internal         = FF_CODEC_CAP_SLICE_THREAD_HAS_MF,
     .flush                 = vp9_decode_flush,
     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),