summaryrefslogtreecommitdiffstats
path: root/libavcodec/diracdec.c
diff options
context:
space:
mode:
authorRostislav Pehlivanov <rpehlivanov@ob-encoder.com>2016-06-23 18:07:00 +0100
committerRostislav Pehlivanov <atomnuker@gmail.com>2016-07-11 23:38:45 +0100
commitc43485f70765cb488bfdf95dc783bb9b14eb1179 (patch)
tree329688ba1391b0030a8b8825b9f59626eebc7daa /libavcodec/diracdec.c
parent09d89d940635e34b0f61266d66fbb9802b18564c (diff)
downloadffmpeg-streaming-c43485f70765cb488bfdf95dc783bb9b14eb1179.zip
ffmpeg-streaming-c43485f70765cb488bfdf95dc783bb9b14eb1179.tar.gz
diracdec: rewrite HQ slice decoding
Coefficients are now written to a buffer and then dequantized by the new SIMD dequantization functions. For the lower bands, which do not have enough coefficients to fill a register (and hence the SIMD versions would overwrite), the C version of the dequantization function is used. The buffer is per-thread and is reallocated whenever the thread count or the required buffer size changes. This prevents regressions and avoids having to limit the slice size. Signed-off-by: Rostislav Pehlivanov <rpehlivanov@obe.tv>
Diffstat (limited to 'libavcodec/diracdec.c')
-rw-r--r--libavcodec/diracdec.c126
1 files changed, 111 insertions, 15 deletions
diff --git a/libavcodec/diracdec.c b/libavcodec/diracdec.c
index ad33809..7913656 100644
--- a/libavcodec/diracdec.c
+++ b/libavcodec/diracdec.c
@@ -161,6 +161,10 @@ typedef struct DiracContext {
unsigned num_x; /* number of horizontal slices */
unsigned num_y; /* number of vertical slices */
+ uint8_t *thread_buf; /* Per-thread buffer for coefficient storage */
+ int threads_num_buf; /* Current # of buffers allocated */
+ int thread_buf_size; /* Each thread has a buffer this size */
+
struct {
unsigned width;
unsigned height;
@@ -370,6 +374,10 @@ static av_cold int dirac_decode_init(AVCodecContext *avctx)
s->avctx = avctx;
s->frame_number = -1;
+ s->thread_buf = NULL;
+ s->threads_num_buf = -1;
+ s->thread_buf_size = -1;
+
ff_diracdsp_init(&s->diracdsp);
ff_mpegvideoencdsp_init(&s->mpvencdsp, avctx);
ff_videodsp_init(&s->vdsp, 8);
@@ -403,6 +411,8 @@ static av_cold int dirac_decode_end(AVCodecContext *avctx)
for (i = 0; i < MAX_FRAMES; i++)
av_frame_free(&s->all_frames[i].avframe);
+ av_freep(&s->thread_buf);
+
return 0;
}
@@ -760,46 +770,108 @@ static int decode_lowdelay_slice(AVCodecContext *avctx, void *arg)
return 0;
}
+typedef struct SliceCoeffs { /* Position and size of one slice inside a subband of a given level */
+ int left; /* First column of the slice: band width * x / num_x */
+ int top; /* First row of the slice: band height * y / num_y */
+ int tot_h; /* Number of columns covered by the slice */
+ int tot_v; /* Number of rows covered by the slice */
+ int tot; /* Coefficients per band for the slice: tot_h * tot_v */
+} SliceCoeffs;
+
+static int subband_coeffs(DiracContext *s, int x, int y, int p, /* slice column x, slice row y, plane index p */
+ SliceCoeffs c[MAX_DWT_LEVELS]) /* out: per-level slice geometry, filled for each wavelet level */
+{
+ int level, coef = 0; /* coef accumulates the coefficient count over all levels */
+ for (level = 0; level < s->wavelet_depth; level++) {
+ SliceCoeffs *o = &c[level];
+ SubBand *b = &s->plane[p].band[level][3]; /* orientation doesn't matter */
+ o->top = b->height * y / s->num_y; /* first row of slice y in this band */
+ o->left = b->width * x / s->num_x; /* first column of slice x in this band */
+ o->tot_h = ((b->width * (x + 1)) / s->num_x) - o->left; /* columns up to the start of slice x + 1 */
+ o->tot_v = ((b->height * (y + 1)) / s->num_y) - o->top; /* rows up to the start of slice y + 1 */
+ o->tot = o->tot_h*o->tot_v;
+ coef += o->tot * (4 - !!level); /* 4 bands counted at level 0, 3 at every other level */
+ }
+ return coef; /* total coefficients of slice (x, y) in plane p across all levels */
+}
+
/**
* VC-2 Specification ->
* 13.5.3 hq_slice(sx,sy)
*/
-static int decode_hq_slice(AVCodecContext *avctx, void *arg)
+static int decode_hq_slice(DiracContext *s, DiracSlice *slice, uint8_t *tmp_buf)
{
- int i, quant, level, orientation, quant_idx;
- uint8_t quants[MAX_DWT_LEVELS][4];
- DiracContext *s = avctx->priv_data;
- DiracSlice *slice = arg;
+ int i, level, orientation, quant_idx;
+ int qfactor[MAX_DWT_LEVELS][4], qoffset[MAX_DWT_LEVELS][4];
GetBitContext *gb = &slice->gb;
+ SliceCoeffs coeffs_num[MAX_DWT_LEVELS];
skip_bits_long(gb, 8*s->highquality.prefix_bytes);
quant_idx = get_bits(gb, 8);
+ if (quant_idx > DIRAC_MAX_QUANT_INDEX) {
+ av_log(s->avctx, AV_LOG_ERROR, "Invalid quantization index - %i\n", quant_idx);
+ return AVERROR_INVALIDDATA;
+ }
+
/* Slice quantization (slice_quantizers() in the specs) */
for (level = 0; level < s->wavelet_depth; level++) {
for (orientation = !!level; orientation < 4; orientation++) {
- quant = FFMAX(quant_idx - s->lowdelay.quant[level][orientation], 0);
- quants[level][orientation] = quant;
+ const int quant = FFMAX(quant_idx - s->lowdelay.quant[level][orientation], 0);
+ qfactor[level][orientation] = ff_dirac_qscale_tab[quant];
+ qoffset[level][orientation] = ff_dirac_qoffset_intra_tab[quant] + 2;
}
}
/* Luma + 2 Chroma planes */
for (i = 0; i < 3; i++) {
- int64_t length = s->highquality.size_scaler * get_bits(gb, 8);
- int64_t bits_left = 8 * length;
- int64_t bits_end = get_bits_count(gb) + bits_left;
+ int c, coef_num, coef_par, off = 0;
+ int64_t length = s->highquality.size_scaler*get_bits(gb, 8);
+ int64_t start = get_bits_count(gb);
+ int64_t bits_end = start + 8*length;
if (bits_end >= INT_MAX) {
av_log(s->avctx, AV_LOG_ERROR, "end too far away\n");
return AVERROR_INVALIDDATA;
}
+ coef_num = subband_coeffs(s, slice->slice_x, slice->slice_y, i, coeffs_num);
+
+ if (s->pshift) {
+ int32_t *dst = (int32_t *)tmp_buf;
+ for (c = 0; c < coef_num; c++)
+ dst[c] = dirac_get_se_golomb(gb);
+ coef_par = c;
+ } else {
+ int16_t *dst = (int16_t *)tmp_buf;
+ for (c = 0; c < coef_num; c++)
+ dst[c] = dirac_get_se_golomb(gb);
+ coef_par = c;
+ }
+
+ if (coef_num > coef_par) {
+ const int start_b = coef_par * (4 >> s->pshift);
+ const int end_b = coef_num * (4 >> s->pshift);
+ memset(&tmp_buf[start_b], 0, end_b - start_b);
+ }
+
for (level = 0; level < s->wavelet_depth; level++) {
+ const SliceCoeffs *c = &coeffs_num[level];
for (orientation = !!level; orientation < 4; orientation++) {
- decode_subband(s, gb, quants[level][orientation], slice->slice_x, slice->slice_y, bits_end,
- &s->plane[i].band[level][orientation], NULL);
+ const SubBand *b1 = &s->plane[i].band[level][orientation];
+ uint8_t *buf = b1->ibuf + c->top * b1->stride + (c->left << (s->pshift + 1));
+
+ /* Change to c->tot_h <= 4 for AVX2 dequantization */
+ const int qfunc = s->pshift + 2*(c->tot_h <= 2);
+ s->diracdsp.dequant_subband[qfunc](&tmp_buf[off], buf, b1->stride,
+ qfactor[level][orientation],
+ qoffset[level][orientation],
+ c->tot_v, c->tot_h);
+
+ off += c->tot << (s->pshift + 1);
}
}
+
skip_bits_long(gb, bits_end - get_bits_count(gb));
}
@@ -811,8 +883,9 @@ static int decode_hq_slice_row(AVCodecContext *avctx, void *arg, int jobnr, int
int i;
DiracContext *s = avctx->priv_data;
DiracSlice *slices = ((DiracSlice *)arg) + s->num_x*jobnr;
+ uint8_t *thread_buf = &s->thread_buf[s->thread_buf_size*threadnr];
for (i = 0; i < s->num_x; i++)
- decode_hq_slice(avctx, &slices[i]);
+ decode_hq_slice(s, &slices[i], thread_buf);
return 0;
}
@@ -824,15 +897,32 @@ static int decode_lowdelay(DiracContext *s)
{
AVCodecContext *avctx = s->avctx;
int slice_x, slice_y, bufsize;
- int64_t bytes = 0;
+ int64_t coef_buf_size, bytes = 0;
const uint8_t *buf;
DiracSlice *slices;
+ SliceCoeffs tmp[MAX_DWT_LEVELS];
int slice_num = 0;
slices = av_mallocz_array(s->num_x, s->num_y * sizeof(DiracSlice));
if (!slices)
return AVERROR(ENOMEM);
+ /* 8 because that's how much the golomb reader could overread junk data
+ * from another plane/slice at most, and 512 because SIMD */
+ coef_buf_size = subband_coeffs(s, s->num_x - 1, s->num_y - 1, 0, tmp) + 8;
+ coef_buf_size = (coef_buf_size << (1 + s->pshift)) + 512;
+
+ if (s->threads_num_buf != avctx->thread_count ||
+ s->thread_buf_size != coef_buf_size) {
+ s->threads_num_buf = avctx->thread_count;
+ s->thread_buf_size = coef_buf_size;
+ s->thread_buf = av_realloc_f(s->thread_buf, avctx->thread_count, s->thread_buf_size);
+ if (!s->thread_buf) {
+ av_log(s->avctx, AV_LOG_ERROR, "thread buffer allocation failure\n");
+ return AVERROR(ENOMEM);
+ }
+ }
+
align_get_bits(&s->gb);
/*[DIRAC_STD] 13.5.2 Slices. slice(sx,sy) */
buf = s->gb.buffer + get_bits_count(&s->gb)/8;
@@ -848,7 +938,7 @@ static int decode_lowdelay(DiracContext *s)
if (bytes <= bufsize/8)
bytes += buf[bytes] * s->highquality.size_scaler + 1;
}
- if (bytes >= INT_MAX) {
+ if (bytes >= INT_MAX || bytes*8 > bufsize) {
av_log(s->avctx, AV_LOG_ERROR, "too many bytes\n");
av_free(slices);
return AVERROR_INVALIDDATA;
@@ -867,6 +957,12 @@ static int decode_lowdelay(DiracContext *s)
bufsize = 0;
}
}
+
+ if (s->num_x*s->num_y != slice_num) {
+ av_log(s->avctx, AV_LOG_ERROR, "too few slices\n");
+ return AVERROR_INVALIDDATA;
+ }
+
avctx->execute2(avctx, decode_hq_slice_row, slices, NULL, s->num_y);
} else {
for (slice_y = 0; bufsize > 0 && slice_y < s->num_y; slice_y++) {
OpenPOWER on IntegriCloud