opusenc: implement a psychoacoustic system

This commit implements a psychoacoustic system for the native Opus encoder. It's unlike any other known psychoacoustic system, since it's capable of using a lookahead to make better choices on how to treat the current frame and how many bits to allocate for it (and future frames). Also, whilst the main bulk of the analysis function has to run in a single thread, the per-frame analysis functions do not modify the main psychoacoustic context, so in the future it will be fairly trivial to run those as slice threads. Signed-off-by: Rostislav Pehlivanov <atomnuker@gmail.com>
author: Rostislav Pehlivanov <atomnuker@gmail.com> 2017-09-23 00:38:37 +0100
committer: Rostislav Pehlivanov <atomnuker@gmail.com> 2017-09-23 08:27:44 +0100
commit: 2ad1768c7b48b6c77bbe9c42a4c2744f7b029182 (patch)
tree: cf97ad8da31d2e5695b4135f88ae92b90fe2f053 /libavcodec/opusenc_psy.c
parent: 1ef7752d64cbe9af2f27cc65aba3a2ca3831c128 (diff)
download: ffmpeg-streaming-2ad1768c7b48b6c77bbe9c42a4c2744f7b029182.zip
ffmpeg-streaming-2ad1768c7b48b6c77bbe9c42a4c2744f7b029182.tar.gz
1 files changed, 556 insertions, 0 deletions
diff --git a/libavcodec/opusenc_psy.c b/libavcodec/opusenc_psy.c
new file mode 100644
index 0000000..7c356fc
--- /dev/null
+++ b/libavcodec/opusenc_psy.c
@@ -0,0 +1,556 @@
+/*
+ * Opus encoder
+ * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "opusenc_psy.h"
+#include "opus_pvq.h"
+#include "opustab.h"
+#include "mdct15.h"
+#include "libavutil/qsort.h"
+
+/**
+ * Populate the metrics of one step without taking neighbouring steps into
+ * consideration.
+ *
+ * For every channel the queued frames around the step are copied into the
+ * scratch buffer, windowed and transformed; the resulting coefficients are
+ * then reduced to per-band energy, tone and (for multichannel input) stereo
+ * values, and the per-band excitation state in the context is advanced.
+ *
+ * @param s     psychoacoustic context
+ * @param index index of the step in s->steps; also used as the base position
+ *              when peeking frames from s->bufqueue
+ */
+static void step_collect_psy_metrics(OpusPsyContext *s, int index)
+{
+    int silence = 0, ch, i, j;
+    OpusPsyStep *st = s->steps[index];
+
+    st->index = index;
+
+    for (ch = 0; ch < s->avctx->channels; ch++) {
+        const int lap_size = (1 << s->bsize_analysis);
+
+        /* Frames queued before the current step */
+        for (i = 1; i <= FFMIN(lap_size, index); i++) {
+            const int offset = i*120;
+            AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index - i);
+            memcpy(&s->scratch[offset], cur->extended_data[ch], cur->nb_samples*sizeof(float));
+        }
+
+        /*
+         * The current step and the frames following it.
+         * NOTE(review): the offset adds lap_size (a step count) to a sample
+         * offset - confirm this is the intended scratch layout.
+         */
+        for (i = 0; i < lap_size; i++) {
+            const int offset = i*120 + lap_size;
+            AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index + i);
+            memcpy(&s->scratch[offset], cur->extended_data[ch], cur->nb_samples*sizeof(float));
+        }
+
+        /* Window in place, then transform into the step's coefficients */
+        s->dsp->vector_fmul(s->scratch, s->scratch, s->window[s->bsize_analysis],
+                            (OPUS_BLOCK_SIZE(s->bsize_analysis) << 1));
+
+        s->mdct[s->bsize_analysis]->mdct(s->mdct[s->bsize_analysis], st->coeffs[ch], s->scratch, 1);
+
+        /* Per-band pointers into the coefficient array */
+        for (i = 0; i < CELT_MAX_BANDS; i++)
+            st->bands[ch][i] = &st->coeffs[ch][ff_celt_freq_bands[i] << s->bsize_analysis];
+    }
+
+    for (ch = 0; ch < s->avctx->channels; ch++) {
+        for (i = 0; i < CELT_MAX_BANDS; i++) {
+            float avg_c_s, energy = 0.0f, dist_dev = 0.0f;
+            const int range = ff_celt_freq_range[i] << s->bsize_analysis;
+            const float *coeffs = st->bands[ch][i];
+            for (j = 0; j < range; j++)
+                energy += coeffs[j]*coeffs[j];
+
+            st->energy[ch][i] += sqrtf(energy);
+            /* Any band with non-zero energy makes the step non-silent */
+            silence |= !!st->energy[ch][i];
+            avg_c_s = energy / range;
+
+            /*
+             * Sum the squared deviation of every coefficient's energy from
+             * the band average. This has to accumulate: a plain assignment
+             * would leave only the last coefficient's deviation.
+             */
+            for (j = 0; j < range; j++) {
+                const float c_s = coeffs[j]*coeffs[j];
+                dist_dev += (avg_c_s - c_s)*(avg_c_s - c_s);
+            }
+
+            st->tone[ch][i] += sqrtf(dist_dev);
+        }
+    }
+
+    st->silence = !silence;
+
+    /* Per-band distance between the first two channels */
+    if (s->avctx->channels > 1) {
+        for (i = 0; i < CELT_MAX_BANDS; i++) {
+            float incompat = 0.0f;
+            const float *coeffs1 = st->bands[0][i];
+            const float *coeffs2 = st->bands[1][i];
+            const int range = ff_celt_freq_range[i] << s->bsize_analysis;
+            for (j = 0; j < range; j++)
+                incompat += (coeffs1[j] - coeffs2[j])*(coeffs1[j] - coeffs2[j]);
+            st->stereo[i] = sqrtf(incompat);
+        }
+    }
+
+    for (ch = 0; ch < s->avctx->channels; ch++) {
+        for (i = 0; i < CELT_MAX_BANDS; i++) {
+            OpusBandExcitation *ex = &s->ex[ch][i];
+            /* Run the band energy through both Bessel filters and square it */
+            float bp_e = bessel_filter(&s->bfilter_lo[ch][i], st->energy[ch][i]);
+            bp_e = bessel_filter(&s->bfilter_hi[ch][i], bp_e);
+            bp_e *= bp_e;
+            /* A value above the current excitation restarts the decay */
+            if (bp_e > ex->excitation) {
+                st->change_amp[ch][i] = bp_e - ex->excitation;
+                st->total_change += st->change_amp[ch][i];
+                ex->excitation = ex->excitation_init = bp_e;
+                ex->excitation_dist = 0.0f;
+            }
+            /* Decay the excitation, never letting it drop below zero */
+            if (ex->excitation > 0.0f) {
+                ex->excitation -= av_clipf((1/expf(ex->excitation_dist)), ex->excitation_init/20, ex->excitation_init/1.09);
+                ex->excitation = FFMAX(ex->excitation, 0.0f);
+                ex->excitation_dist += 1.0f;
+            }
+        }
+    }
+}
+
+/**
+ * Recursively record the steps at which the accumulated total_change of a
+ * range of steps first exceeds a target.
+ *
+ * The range [offset_s, offset_e) is scanned from its start; the first step
+ * whose running sum is above tgt_change is appended to s->inflection_points
+ * and both halves around it are searched again with half the target. The
+ * left half is visited before the point is stored, so points are appended
+ * in ascending step order.
+ *
+ * @param s          psychoacoustic context
+ * @param tgt_change accumulated change that has to be exceeded
+ * @param offset_s   first step of the range
+ * @param offset_e   one past the last step of the range
+ * @param resolution ranges no longer than this are not searched
+ * @param level      recursion depth, only passed along
+ */
+static void search_for_change_points(OpusPsyContext *s, float tgt_change,
+                                     int offset_s, int offset_e, int resolution,
+                                     int level)
+{
+    int split = offset_s;
+    float accumulated = 0.0f;
+
+    if ((offset_e - offset_s) <= resolution)
+        return;
+
+    while (split < offset_e) {
+        accumulated += s->steps[split]->total_change;
+        if (accumulated > tgt_change)
+            break;
+        split++;
+    }
+
+    /* The target was never exceeded inside this range */
+    if (split == offset_e)
+        return;
+
+    search_for_change_points(s, tgt_change / 2.0f, offset_s, split,
+                             resolution, level + 1);
+    s->inflection_points[s->inflection_points_count++] = split;
+    search_for_change_points(s, tgt_change / 2.0f, split + 1, offset_e,
+                             resolution, level + 1);
+}
+
+/**
+ * Try to describe the leading run of silent steps as one packet.
+ *
+ * Counts the silent steps at the front of s->steps, drops one from that
+ * count, and picks the largest frame size (above CELT_BLOCK_120) that still
+ * fits. On success s->p.frames and s->p.framesize are filled in.
+ *
+ * @param s psychoacoustic context
+ * @return 1 if a packet layout was chosen, 0 otherwise
+ */
+static int flush_silent_frames(OpusPsyContext *s)
+{
+    int fsize, run = 0;
+
+    while (run < s->buffered_steps && s->steps[run]->silence)
+        run++;
+
+    /* One step of the run is left out of the packet */
+    run -= 1;
+    if (run < 0)
+        return 0;
+
+    for (fsize = CELT_BLOCK_960; fsize > CELT_BLOCK_120; fsize--) {
+        const int steps_per_frame = 1 << fsize;
+
+        if (steps_per_frame <= run) {
+            /* Presumably caps the packet at 48 steps - TODO confirm */
+            s->p.frames    = FFMIN(run / steps_per_frame, 48 >> fsize);
+            s->p.framesize = fsize;
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Main function which decides the frame size and the number of frames for
+ * the current packet, storing the result in s->p.
+ *
+ * @param s psychoacoustic context
+ */
+static void psy_output_groups(OpusPsyContext *s)
+{
+    const int max_delay_samples = (s->options->max_delay_ms*s->avctx->sample_rate)/1000;
+    /* Largest block size allowed by the delay, never above CELT_BLOCK_960 */
+    const int max_bsize = FFMIN(OPUS_SAMPLES_TO_BLOCK_SIZE(max_delay_samples), CELT_BLOCK_960);
+
+    /* These don't change for now */
+    s->p.mode      = OPUS_MODE_CELT;
+    s->p.bandwidth = OPUS_BANDWIDTH_FULLBAND;
+
+    /* Flush silent frames ASAP */
+    if (s->steps[0]->silence) {
+        if (flush_silent_frames(s))
+            return;
+    }
+
+    /* max_bsize is already clamped above, a single frame is used */
+    s->p.frames    = 1;
+    s->p.framesize = max_bsize;
+}
+
+/**
+ * Account for one more buffered step and, once enough steps are available
+ * (or EOF was signalled), decide the layout of the next packet.
+ *
+ * @param s psychoacoustic context
+ * @param p filled with frames, framesize, mode and bandwidth when 0 is
+ *          returned; left untouched otherwise
+ * @return 1 while more steps have to be buffered, 0 once p has been filled
+ */
+int ff_opus_psy_process(OpusPsyContext *s, OpusPacketInfo *p)
+{
+    float total_energy_change = 0.0f;
+    int step;
+
+    if (!s->eof && s->buffered_steps < s->max_steps) {
+        const int awin = (1 << s->bsize_analysis);
+
+        /* Metrics are collected once every awin incoming steps */
+        s->steps_to_process++;
+        if (s->steps_to_process >= awin) {
+            step_collect_psy_metrics(s, s->buffered_steps - awin + 1);
+            s->steps_to_process = 0;
+        }
+
+        s->buffered_steps++;
+        if (s->buffered_steps < s->max_steps)
+            return 1;
+    }
+
+    for (step = 0; step < s->buffered_steps; step++)
+        total_energy_change += s->steps[step]->total_change;
+
+    search_for_change_points(s, total_energy_change / 2.0f, 0,
+                             s->buffered_steps, 1, 0);
+
+    psy_output_groups(s);
+
+    /* Hand the decision over to the caller */
+    p->mode      = s->p.mode;
+    p->bandwidth = s->p.bandwidth;
+    p->framesize = s->p.framesize;
+    p->frames    = s->p.frames;
+
+    return 0;
+}
+
+/**
+ * Initialize a CELT frame of the current packet with the decisions taken so
+ * far and with default coding parameters.
+ *
+ * A frame whose steps are all silent is flagged as such, gets zero framebits
+ * and is not initialized any further.
+ *
+ * @param s     psychoacoustic context
+ * @param f     frame to initialize
+ * @param index position of the frame inside the packet
+ */
+void ff_opus_psy_celt_frame_init(OpusPsyContext *s, CeltFrame *f, int index)
+{
+    int i, neighbouring_points = 0;
+    int radius = (1 << s->p.framesize), step_offset = radius*index;
+    int silence = 1;
+
+    f->start_band = (s->p.mode == OPUS_MODE_HYBRID) ? 17 : 0;
+    f->end_band   = ff_celt_band_end[s->p.bandwidth];
+    f->channels   = s->avctx->channels;
+    f->size       = s->p.framesize;
+
+    for (i = 0; i < (1 << f->size); i++)
+        silence &= s->steps[index*(1 << f->size) + i]->silence;
+
+    f->silence = silence;
+    if (f->silence) {
+        f->framebits = 0; /* Otherwise the silence flag eats up 16(!) bits */
+        return;
+    }
+
+    /*
+     * Count the inflection points that fall on the steps covered by this
+     * frame, i.e. inside [step_offset, step_offset + radius). Every point is
+     * tested against both ends of the range so that points belonging to
+     * earlier frames of the packet are never counted.
+     */
+    for (i = 0; i < s->inflection_points_count; i++) {
+        const int point = s->inflection_points[i];
+        if (point >= step_offset && point < (step_offset + radius))
+            neighbouring_points++;
+    }
+
+    /* Transient flagging */
+    f->transient = neighbouring_points > 0;
+    f->blocks = f->transient ? OPUS_BLOCK_SIZE(s->p.framesize)/CELT_OVERLAP : 1;
+
+    /* Some sane defaults */
+    f->pfilter   = 0;
+    f->pf_gain   = 0.5f;
+    f->pf_octave = 2;
+    f->pf_period = 1;
+    f->pf_tapset = 2;
+
+    /* More sane defaults */
+    f->tf_select = 0;
+    f->anticollapse = 1;
+    f->alloc_trim = 5;
+    f->skip_band_floor = f->end_band;
+    f->intensity_stereo = f->end_band;
+    f->dual_stereo = 0;
+    f->spread = CELT_SPREAD_NORMAL;
+    memset(f->tf_change, 0, sizeof(int)*CELT_MAX_BANDS);
+    memset(f->alloc_boost, 0, sizeof(int)*CELT_MAX_BANDS);
+}
+
+/**
+ * Derive the per-band allocation boost, the spread decision and the bit
+ * budget of a frame from the metrics of the steps it covers.
+ *
+ * @param s     psychoacoustic context
+ * @param start first step covered by the frame
+ * @param f_out frame whose alloc_boost, spread and framebits are written
+ */
+static void celt_gauge_psy_weight(OpusPsyContext *s, OpusPsyStep **start,
+                                  CeltFrame *f_out)
+{
+    int band, step, ch;
+    const int nb_steps   = 1 << s->p.framesize;
+    const int frame_size = OPUS_BLOCK_SIZE(s->p.framesize);
+    float rate, frame_bits = 0;
+
+    /* Used for the global ROTATE flag */
+    float tonal = 0.0f;
+
+    /* Pseudo-weights and the largest of them (never below 1.0) */
+    float band_score[CELT_MAX_BANDS] = { 0 };
+    float max_score = 1.0f;
+
+    /* Pass one - one loop around each band, computing unquant stuff */
+    for (band = 0; band < CELT_MAX_BANDS; band++) {
+        float weight = 0.0f;
+        float tonal_contrib = 0.0f;
+
+        for (step = 0; step < nb_steps; step++) {
+            OpusPsyStep *st = start[step];
+
+            /*
+             * NOTE(review): the weight is restarted for every step, so only
+             * the last step of the frame ends up in the band score - confirm
+             * whether accumulating over the steps was intended.
+             */
+            weight = st->stereo[band];
+            for (ch = 0; ch < s->avctx->channels; ch++) {
+                weight += st->change_amp[ch][band] + st->tone[ch][band] + st->energy[ch][band];
+                tonal_contrib += st->tone[ch][band];
+            }
+        }
+
+        tonal += tonal_contrib;
+        band_score[band] = weight;
+        if (weight > max_score)
+            max_score = weight;
+    }
+
+    tonal /= (float)CELT_MAX_BANDS;
+
+    /* Pass two - boosts relative to the strongest band, plus extra bits */
+    for (band = 0; band < CELT_MAX_BANDS; band++) {
+        f_out->alloc_boost[band] = (int)((band_score[band]/max_score)*3.0f);
+        frame_bits += band_score[band]*8.0f;
+    }
+
+    tonal /= 1333136.0f;
+    f_out->spread = av_clip(lrintf(tonal), 0, 3);
+
+    /* Target bitrate plus the extra bits, scaled by lambda, per frame */
+    rate = ((float)s->avctx->bit_rate) + frame_bits*frame_size*16;
+    rate *= s->lambda;
+    rate /= s->avctx->sample_rate/frame_size;
+
+    f_out->framebits = lrintf(rate);
+    f_out->framebits = FFMIN(f_out->framebits, OPUS_MAX_PACKET_SIZE*8);
+    f_out->framebits = FFALIGN(f_out->framebits, 8);
+}
+
+/**
+ * Sum the band costs of a frame under its current settings.
+ *
+ * The bit allocation is run against a throwaway range coder and the cost
+ * reported by the PVQ band_cost callback is accumulated over all bands.
+ *
+ * @param s          psychoacoustic context, provides lambda
+ * @param f          frame to evaluate
+ * @param total_dist receives the accumulated cost
+ * @return always 0
+ */
+static int bands_dist(OpusPsyContext *s, CeltFrame *f, float *total_dist)
+{
+    int i;
+    /* Kept as a float: an integer accumulator would truncate every band's
+     * contribution and could overflow on large values */
+    float tdist = 0.0f;
+    OpusRangeCoder dump;
+
+    ff_opus_rc_enc_init(&dump);
+    ff_celt_enc_bitalloc(&dump, f);
+
+    for (i = 0; i < CELT_MAX_BANDS; i++) {
+        float bits = 0.0f;
+        float dist = f->pvq->band_cost(f->pvq, f, &dump, i, &bits, s->lambda);
+        tdist += dist;
+    }
+
+    *total_dist = tdist;
+
+    return 0;
+}
+
+/**
+ * Decide whether the frame uses dual stereo by evaluating bands_dist() with
+ * the flag cleared and set, and keeping the cheaper setting.
+ *
+ * @param s psychoacoustic context; dual_stereo_used is incremented when dual
+ *          stereo is chosen
+ * @param f frame whose dual_stereo flag is written
+ */
+static void celt_search_for_dual_stereo(OpusPsyContext *s, CeltFrame *f)
+{
+    float dist_off, dist_on;
+    int use_dual;
+
+    f->dual_stereo = 0;
+    bands_dist(s, f, &dist_off);
+
+    f->dual_stereo = 1;
+    bands_dist(s, f, &dist_on);
+
+    /* Dual stereo only wins when it is strictly cheaper */
+    use_dual = dist_on < dist_off;
+
+    f->dual_stereo = use_dual;
+    s->dual_stereo_used += use_dual;
+}
+
+/**
+ * Pick the intensity stereo band by trying every candidate from the frame's
+ * end band down to band 0 and keeping the one with the lowest bands_dist().
+ *
+ * @param s psychoacoustic context; avg_is_band is averaged with the result
+ * @param f frame whose intensity_stereo field is written
+ */
+static void celt_search_for_intensity(OpusPsyContext *s, CeltFrame *f)
+{
+    /* TODO: fix, make some heuristic up here using the lambda value */
+    const int lowest_band = 0;
+
+    int candidate = f->end_band;
+    int best_band = CELT_MAX_BANDS - 1;
+    float best_dist = FLT_MAX;
+
+    while (candidate >= lowest_band) {
+        float dist;
+
+        f->intensity_stereo = candidate;
+        bands_dist(s, f, &dist);
+
+        /* On a tie the earlier (higher) band is kept */
+        if (dist < best_dist) {
+            best_dist = dist;
+            best_band = candidate;
+        }
+        candidate--;
+    }
+
+    f->intensity_stereo = best_band;
+    s->avg_is_band = (s->avg_is_band + f->intensity_stereo)/2.0f;
+}
+
+/**
+ * Choose tf_select and the per-band tf_change flags of a frame.
+ *
+ * For both tf_select candidates, every band is scored with the two entries
+ * of ff_celt_tf_select; the per-band flag records which score ends up
+ * further from 1.0 and the scores of the flagged entries are summed.
+ *
+ * @param s     psychoacoustic context
+ * @param start first step covered by the frame
+ * @param f     frame whose tf_select and tf_change are written
+ * @return always 0
+ */
+static int celt_search_for_tf(OpusPsyContext *s, OpusPsyStep **start, CeltFrame *f)
+{
+    int band, step, ch, cway, entry;
+    int config[2][CELT_MAX_BANDS] = { { 0 } };
+    float score[2] = { 0 };
+    const int nb_steps = 1 << f->size;
+
+    for (cway = 0; cway < 2; cway++) {
+        const int base = f->transient ? 120 : 960;
+        int mag[2];
+
+        /* Scale the base by the (signed) shift of each table entry */
+        for (entry = 0; entry < 2; entry++) {
+            const int c = ff_celt_tf_select[f->size][f->transient][cway][entry];
+            mag[entry] = c < 0 ? base >> FFABS(c) : base << FFABS(c);
+        }
+
+        for (band = 0; band < CELT_MAX_BANDS; band++) {
+            float iscore0 = 0.0f;
+            float iscore1 = 0.0f;
+
+            for (step = 0; step < nb_steps; step++) {
+                for (ch = 0; ch < s->avctx->channels; ch++) {
+                    iscore0 += start[step]->tone[ch][band]*start[step]->change_amp[ch][band]/mag[0];
+                    iscore1 += start[step]->tone[ch][band]*start[step]->change_amp[ch][band]/mag[1];
+                }
+            }
+
+            config[cway][band] = FFABS(iscore0 - 1.0f) < FFABS(iscore1 - 1.0f);
+            if (config[cway][band])
+                score[cway] += iscore1;
+            else
+                score[cway] += iscore0;
+        }
+    }
+
+    f->tf_select = score[0] < score[1];
+    memcpy(f->tf_change, config[f->tf_select], sizeof(int)*CELT_MAX_BANDS);
+
+    return 0;
+}
+
+/**
+ * Run the per-frame searches (bit budget, intensity stereo, dual stereo and
+ * TF) on a frame of the current packet. Silent frames are left untouched.
+ *
+ * @param s     psychoacoustic context
+ * @param f     frame to process
+ * @param index position of the frame inside the packet
+ * @return 1 if the transient flag changed during processing, in which case
+ *         s->redo_analysis is set as well; 0 otherwise
+ */
+int ff_opus_psy_celt_frame_process(OpusPsyContext *s, CeltFrame *f, int index)
+{
+    const int transient_before = f->transient;
+    OpusPsyStep **start = &s->steps[index * (1 << s->p.framesize)];
+
+    if (f->silence)
+        return 0;
+
+    celt_gauge_psy_weight(s, start, f);
+    celt_search_for_intensity(s, f);
+    celt_search_for_dual_stereo(s, f);
+    celt_search_for_tf(s, start, f);
+
+    if (f->transient == transient_before) {
+        s->redo_analysis = 0;
+        return 0;
+    }
+
+    /* The block count depends on the transient flag, so refresh it */
+    f->blocks = f->transient ? OPUS_BLOCK_SIZE(s->p.framesize)/CELT_OVERLAP : 1;
+    s->redo_analysis = 1;
+
+    return 1;
+}
+
+/**
+ * Update the psychoacoustic state after a packet has been encoded.
+ *
+ * The steps consumed by the packet are cleared and rotated to the end of
+ * s->steps, lambda is adapted to the bits used by the packet's frames and
+ * the per-packet counters are reset.
+ *
+ * @param s  psychoacoustic context
+ * @param f  array of the s->p.frames frames of the packet
+ * @param rc range coder of the packet, currently not used here
+ */
+void ff_opus_psy_postencode_update(OpusPsyContext *s, CeltFrame *f, OpusRangeCoder *rc)
+{
+    int i, frame_size = OPUS_BLOCK_SIZE(s->p.framesize);
+    int steps_out = s->p.frames*(frame_size/120);
+    void *tmp[FF_BUFQUEUE_SIZE];
+    float ideal_fbits;
+
+    /* Wipe the steps that went out with the packet */
+    for (i = 0; i < steps_out; i++)
+        memset(s->steps[i], 0, sizeof(OpusPsyStep));
+
+    /* Rotate the step pointers left by steps_out */
+    for (i = 0; i < s->max_steps; i++)
+        tmp[i] = s->steps[i];
+
+    for (i = 0; i < s->max_steps; i++) {
+        const int i_new = i - steps_out;
+        s->steps[i_new < 0 ? s->max_steps + i_new : i_new] = tmp[i];
+    }
+
+    for (i = steps_out; i < s->buffered_steps; i++)
+        s->steps[i]->index -= steps_out;
+
+    ideal_fbits = s->avctx->bit_rate/(s->avctx->sample_rate/frame_size);
+
+    for (i = 0; i < s->p.frames; i++) {
+        s->avg_is_band += f[i].intensity_stereo;
+        /*
+         * Silent frames are given zero framebits by
+         * ff_opus_psy_celt_frame_init(); dividing by that would turn lambda
+         * into inf/NaN for good, so such frames leave lambda untouched.
+         */
+        if (f[i].framebits > 0)
+            s->lambda *= ideal_fbits / f[i].framebits;
+    }
+
+    s->avg_is_band /= (s->p.frames + 1);
+
+    s->cs_num = 0;
+    s->steps_to_process = 0;
+    s->buffered_steps -= steps_out;
+    s->total_packets_out += s->p.frames;
+    s->inflection_points_count = 0;
+}
+
+/**
+ * Initialize a psychoacoustic context.
+ *
+ * Allocates the inflection point list, the float DSP context, one step per
+ * buffered position and, for every block size, a sine window and an MDCT.
+ * Everything allocated so far is released again if any step fails.
+ *
+ * @param s        context to initialize
+ * @param avctx    codec context, stored in s
+ * @param bufqueue frame queue, stored in s
+ * @param options  encoder options, stored in s; max_delay_ms determines the
+ *                 number of steps
+ * @return 0 on success, a negative AVERROR code on failure
+ */
+av_cold int ff_opus_psy_init(OpusPsyContext *s, AVCodecContext *avctx,
+                             struct FFBufQueue *bufqueue, OpusEncOptions *options)
+{
+    int i, ch, ret;
+
+    s->avctx    = avctx;
+    s->bufqueue = bufqueue;
+    s->options  = options;
+
+    s->lambda                  = 1.0f;
+    s->redo_analysis           = 0;
+    s->inflection_points_count = 0;
+    s->bsize_analysis          = CELT_BLOCK_960;
+    s->avg_is_band             = CELT_MAX_BANDS - 1;
+    /* One step per 2.5 ms of allowed delay, rounded up */
+    s->max_steps               = ceilf(s->options->max_delay_ms/2.5f);
+
+    s->inflection_points = av_mallocz(sizeof(*s->inflection_points)*s->max_steps);
+    if (!s->inflection_points) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    s->dsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
+    if (!s->dsp) {
+        ret = AVERROR(ENOMEM);
+        goto fail;
+    }
+
+    /* One pair of Bessel filters per channel and band */
+    for (ch = 0; ch < s->avctx->channels; ch++) {
+        for (i = 0; i < CELT_MAX_BANDS; i++) {
+            bessel_init(&s->bfilter_hi[ch][i], 1.0f, 19.0f, 100.0f, 1);
+            bessel_init(&s->bfilter_lo[ch][i], 1.0f, 20.0f, 100.0f, 0);
+        }
+    }
+
+    for (i = 0; i < s->max_steps; i++) {
+        s->steps[i] = av_mallocz(sizeof(OpusPsyStep));
+        if (!s->steps[i]) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+    }
+
+    /* A sine window and an MDCT for every block size */
+    for (i = 0; i < CELT_BLOCK_NB; i++) {
+        const int win_len = 2*OPUS_BLOCK_SIZE(i);
+        float overlap;
+
+        s->window[i] = av_malloc(win_len*sizeof(float));
+        if (!s->window[i]) {
+            ret = AVERROR(ENOMEM);
+            goto fail;
+        }
+        ff_generate_window_func(s->window[i], win_len, WFUNC_SINE, &overlap);
+
+        ret = ff_mdct15_init(&s->mdct[i], 0, i + 3, 68 << (CELT_BLOCK_NB - 1 - i));
+        if (ret)
+            goto fail;
+    }
+
+    return 0;
+
+fail:
+    av_freep(&s->inflection_points);
+    av_freep(&s->dsp);
+
+    for (i = 0; i < CELT_BLOCK_NB; i++) {
+        ff_mdct15_uninit(&s->mdct[i]);
+        av_freep(&s->window[i]);
+    }
+
+    for (i = 0; i < s->max_steps; i++)
+        av_freep(&s->steps[i]);
+
+    return ret;
+}
+
+/* Mark the end of input: sets s->eof, which ff_opus_psy_process() checks
+ * before deciding to keep buffering steps */
+void ff_opus_psy_signal_eof(OpusPsyContext *s)
+{
+    s->eof = 1;
+}
+
+/**
+ * Free everything owned by a psychoacoustic context and log the encoding
+ * statistics gathered in it.
+ *
+ * @param s context to tear down
+ * @return always 0
+ */
+av_cold int ff_opus_psy_end(OpusPsyContext *s)
+{
+    int i;
+    float dual_stereo_pct = 0.0f;
+
+    av_freep(&s->inflection_points);
+    av_freep(&s->dsp);
+
+    for (i = 0; i < CELT_BLOCK_NB; i++) {
+        ff_mdct15_uninit(&s->mdct[i]);
+        av_freep(&s->window[i]);
+    }
+
+    for (i = 0; i < s->max_steps; i++)
+        av_freep(&s->steps[i]);
+
+    /* No packets may have been written at all; report 0% instead of the NaN
+     * a division by zero would give */
+    if (s->total_packets_out)
+        dual_stereo_pct = ((float)s->dual_stereo_used/s->total_packets_out)*100.0f;
+
+    av_log(s->avctx, AV_LOG_INFO, "Average Intensity Stereo band: %0.1f\n", s->avg_is_band);
+    av_log(s->avctx, AV_LOG_INFO, "Dual Stereo used: %0.2f%%\n", dual_stereo_pct);
+
+    return 0;
+}
author	Rostislav Pehlivanov <atomnuker@gmail.com>	2017-09-23 00:38:37 +0100
committer	Rostislav Pehlivanov <atomnuker@gmail.com>	2017-09-23 08:27:44 +0100
commit	2ad1768c7b48b6c77bbe9c42a4c2744f7b029182 (patch)
tree	cf97ad8da31d2e5695b4135f88ae92b90fe2f053 /libavcodec/opusenc_psy.c
parent	1ef7752d64cbe9af2f27cc65aba3a2ca3831c128 (diff)
download	ffmpeg-streaming-2ad1768c7b48b6c77bbe9c42a4c2744f7b029182.zip ffmpeg-streaming-2ad1768c7b48b6c77bbe9c42a4c2744f7b029182.tar.gz