author    | Ruiling Song <ruiling.song@intel.com> | 2018-06-19 09:57:31 +0800
committer | Mark Thompson <sw@jkqxz.net>          | 2018-06-21 01:19:18 +0100
commit    | 8b8b0e2cd26cf1f522c630859fcbcc62b6493fb9 (patch)
tree      | 317b7360eeb1df6f7e7c5bb935f70006963e7ae4 /libavfilter/opencl
parent    | 714da1fd898f83c7bef38fe427af3692917cbcb2 (diff)
lavfi: add opencl tonemap filter
This filter does HDR (HDR10/HLG) to SDR conversion with tone-mapping.
An example command to use this filter with vaapi codecs:
ffmpeg -init_hw_device vaapi=va:/dev/dri/renderD128 -init_hw_device \
opencl=ocl@va -hwaccel vaapi -hwaccel_device va -hwaccel_output_format \
vaapi -i INPUT -filter_hw_device ocl -filter_complex \
'[0:v]hwmap,tonemap_opencl=t=bt2020:tonemap=linear:format=p010[x1]; \
[x1]hwmap=derive_device=vaapi:reverse=1' -c:v hevc_vaapi -profile 2 OUTPUT
Signed-off-by: Ruiling Song <ruiling.song@intel.com>
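
[Editor's note] For comparison, a software-frame invocation of the same filter might look like the sketch below (not part of this commit): decoded frames are uploaded to the OpenCL device, tone-mapped, and downloaded again for a software encoder. The device name ocl, the hable curve and the nv12 output format are illustrative choices; only the option names come from the filter added here, and whether a given platform can upload p010 frames depends on its OpenCL image-format support.

ffmpeg -init_hw_device opencl=ocl -filter_hw_device ocl -i INPUT \
-vf 'format=p010,hwupload,tonemap_opencl=tonemap=hable:format=nv12, \
hwdownload,format=nv12' -c:v libx264 OUTPUT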
Diffstat (limited to 'libavfilter/opencl')
-rw-r--r-- | libavfilter/opencl/colorspace_common.cl | 220
-rw-r--r-- | libavfilter/opencl/tonemap.cl           | 272
2 files changed, 492 insertions, 0 deletions
diff --git a/libavfilter/opencl/colorspace_common.cl b/libavfilter/opencl/colorspace_common.cl
new file mode 100644
index 0000000..94a4dd0
--- /dev/null
+++ b/libavfilter/opencl/colorspace_common.cl
@@ -0,0 +1,220 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define ST2084_MAX_LUMINANCE 10000.0f
+#define REFERENCE_WHITE 100.0f
+
+#if chroma_loc == 1
+    #define chroma_sample(a,b,c,d) (((a) + (c)) * 0.5f)
+#elif chroma_loc == 3
+    #define chroma_sample(a,b,c,d) (a)
+#elif chroma_loc == 4
+    #define chroma_sample(a,b,c,d) (((a) + (b)) * 0.5f)
+#elif chroma_loc == 5
+    #define chroma_sample(a,b,c,d) (c)
+#elif chroma_loc == 6
+    #define chroma_sample(a,b,c,d) (((c) + (d)) * 0.5f)
+#else
+    #define chroma_sample(a,b,c,d) (((a) + (b) + (c) + (d)) * 0.25f)
+#endif
+
+constant const float ST2084_M1 = 0.1593017578125f;
+constant const float ST2084_M2 = 78.84375f;
+constant const float ST2084_C1 = 0.8359375f;
+constant const float ST2084_C2 = 18.8515625f;
+constant const float ST2084_C3 = 18.6875f;
+
+__constant float yuv2rgb_bt2020[] = {
+    1.0f, 0.0f, 1.4746f,
+    1.0f, -0.16455f, -0.57135f,
+    1.0f, 1.8814f, 0.0f
+};
+
+__constant float yuv2rgb_bt709[] = {
+    1.0f, 0.0f, 1.5748f,
+    1.0f, -0.18732f, -0.46812f,
+    1.0f, 1.8556f, 0.0f
+};
+
+__constant float rgb2yuv_bt709[] = {
+    0.2126f, 0.7152f, 0.0722f,
+    -0.11457f, -0.38543f, 0.5f,
+    0.5f, -0.45415f, -0.04585f
+};
+
+__constant float rgb2yuv_bt2020[] = {
+    0.2627f, 0.678f, 0.0593f,
+    -0.1396f, -0.36037f, 0.5f,
+    0.5f, -0.4598f, -0.0402f,
+};
+
+
+float get_luma_dst(float3 c) {
+    return luma_dst.x * c.x + luma_dst.y * c.y + luma_dst.z * c.z;
+}
+
+float get_luma_src(float3 c) {
+    return luma_src.x * c.x + luma_src.y * c.y + luma_src.z * c.z;
+}
+
+float3 get_chroma_sample(float3 a, float3 b, float3 c, float3 d) {
+    return chroma_sample(a, b, c, d);
+}
+
+float eotf_st2084(float x) {
+    float p = powr(x, 1.0f / ST2084_M2);
+    float a = max(p - ST2084_C1, 0.0f);
+    float b = max(ST2084_C2 - ST2084_C3 * p, 1e-6f);
+    float c = powr(a / b, 1.0f / ST2084_M1);
+    return x > 0.0f ? c * ST2084_MAX_LUMINANCE / REFERENCE_WHITE : 0.0f;
+}
+
+__constant const float HLG_A = 0.17883277f;
+__constant const float HLG_B = 0.28466892f;
+__constant const float HLG_C = 0.55991073f;
+
+// linearizer for HLG
+float inverse_oetf_hlg(float x) {
+    float a = 4.0f * x * x;
+    float b = exp((x - HLG_C) / HLG_A) + HLG_B;
+    return x < 0.5f ? a : b;
+}
+
+// delinearizer for HLG
+float oetf_hlg(float x) {
+    float a = 0.5f * sqrt(x);
+    float b = HLG_A * log(x - HLG_B) + HLG_C;
+    return x <= 1.0f ? a : b;
+}
+
+float3 ootf_hlg(float3 c, float peak) {
+    float luma = get_luma_src(c);
+    float gamma = 1.2f + 0.42f * log10(peak * REFERENCE_WHITE / 1000.0f);
+    gamma = max(1.0f, gamma);
+    float factor = peak * powr(luma, gamma - 1.0f) / powr(12.0f, gamma);
+    return c * factor;
+}
+
+float3 inverse_ootf_hlg(float3 c, float peak) {
+    float gamma = 1.2f + 0.42f * log10(peak * REFERENCE_WHITE / 1000.0f);
+    c *= powr(12.0f, gamma) / peak;
+    c /= powr(get_luma_dst(c), (gamma - 1.0f) / gamma);
+    return c;
+}
+
+float inverse_eotf_bt1886(float c) {
+    return c < 0.0f ? 0.0f : powr(c, 1.0f / 2.4f);
+}
+
+float oetf_bt709(float c) {
+    c = c < 0.0f ? 0.0f : c;
+    float r1 = 4.5f * c;
+    float r2 = 1.099f * powr(c, 0.45f) - 0.099f;
+    return c < 0.018f ? r1 : r2;
+}
+float inverse_oetf_bt709(float c) {
+    float r1 = c / 4.5f;
+    float r2 = powr((c + 0.099f) / 1.099f, 1.0f / 0.45f);
+    return c < 0.081f ? r1 : r2;
+}
+
+float3 yuv2rgb(float y, float u, float v) {
+#ifdef FULL_RANGE_IN
+    u -= 0.5f; v -= 0.5f;
+#else
+    y = (y * 255.0f - 16.0f) / 219.0f;
+    u = (u * 255.0f - 128.0f) / 224.0f;
+    v = (v * 255.0f - 128.0f) / 224.0f;
+#endif
+    float r = y * rgb_matrix[0] + u * rgb_matrix[1] + v * rgb_matrix[2];
+    float g = y * rgb_matrix[3] + u * rgb_matrix[4] + v * rgb_matrix[5];
+    float b = y * rgb_matrix[6] + u * rgb_matrix[7] + v * rgb_matrix[8];
+    return (float3)(r, g, b);
+}
+
+float3 yuv2lrgb(float3 yuv) {
+    float3 rgb = yuv2rgb(yuv.x, yuv.y, yuv.z);
+    float r = linearize(rgb.x);
+    float g = linearize(rgb.y);
+    float b = linearize(rgb.z);
+    return (float3)(r, g, b);
+}
+
+float3 rgb2yuv(float r, float g, float b) {
+    float y = r*yuv_matrix[0] + g*yuv_matrix[1] + b*yuv_matrix[2];
+    float u = r*yuv_matrix[3] + g*yuv_matrix[4] + b*yuv_matrix[5];
+    float v = r*yuv_matrix[6] + g*yuv_matrix[7] + b*yuv_matrix[8];
+#ifdef FULL_RANGE_OUT
+    u += 0.5f; v += 0.5f;
+#else
+    y = (219.0f * y + 16.0f) / 255.0f;
+    u = (224.0f * u + 128.0f) / 255.0f;
+    v = (224.0f * v + 128.0f) / 255.0f;
+#endif
+    return (float3)(y, u, v);
+}
+
+float rgb2y(float r, float g, float b) {
+    float y = r*yuv_matrix[0] + g*yuv_matrix[1] + b*yuv_matrix[2];
+    y = (219.0f * y + 16.0f) / 255.0f;
+    return y;
+}
+
+float3 lrgb2yuv(float3 c) {
+    float r = delinearize(c.x);
+    float g = delinearize(c.y);
+    float b = delinearize(c.z);
+
+    return rgb2yuv(r, g, b);
+}
+
+float lrgb2y(float3 c) {
+    float r = delinearize(c.x);
+    float g = delinearize(c.y);
+    float b = delinearize(c.z);
+
+    return rgb2y(r, g, b);
+}
+
+float3 lrgb2lrgb(float3 c) {
+#ifdef RGB2RGB_PASSTHROUGH
+    return c;
+#else
+    float r = c.x, g = c.y, b = c.z;
+    float rr = rgb2rgb[0] * r + rgb2rgb[1] * g + rgb2rgb[2] * b;
+    float gg = rgb2rgb[3] * r + rgb2rgb[4] * g + rgb2rgb[5] * b;
+    float bb = rgb2rgb[6] * r + rgb2rgb[7] * g + rgb2rgb[8] * b;
+    return (float3)(rr, gg, bb);
+#endif
+}
+
+float3 ootf(float3 c, float peak) {
+#ifdef ootf_impl
+    return ootf_impl(c, peak);
+#else
+    return c;
+#endif
+}
+
+float3 inverse_ootf(float3 c, float peak) {
+#ifdef inverse_ootf_impl
+    return inverse_ootf_impl(c, peak);
+#else
+    return c;
+#endif
+}
diff --git a/libavfilter/opencl/tonemap.cl b/libavfilter/opencl/tonemap.cl
new file mode 100644
index 0000000..9448ba4
--- /dev/null
+++ b/libavfilter/opencl/tonemap.cl
@@ -0,0 +1,272 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#define REFERENCE_WHITE 100.0f
+extern float3 lrgb2yuv(float3);
+extern float lrgb2y(float3);
+extern float3 yuv2lrgb(float3);
+extern float3 lrgb2lrgb(float3);
+extern float get_luma_src(float3);
+extern float get_luma_dst(float3);
+extern float3 ootf(float3 c, float peak);
+extern float3 inverse_ootf(float3 c, float peak);
+extern float3 get_chroma_sample(float3, float3, float3, float3);
+
+struct detection_result {
+    float peak;
+    float average;
+};
+
+float hable_f(float in) {
+    float a = 0.15f, b = 0.50f, c = 0.10f, d = 0.20f, e = 0.02f, f = 0.30f;
+    return (in * (in * a + b * c) + d * e) / (in * (in * a + b) + d * f) - e / f;
+}
+
+float direct(float s, float peak) {
+    return s;
+}
+
+float linear(float s, float peak) {
+    return s * tone_param / peak;
+}
+
+float gamma(float s, float peak) {
+    float p = s > 0.05f ? s / peak : 0.05f / peak;
+    float v = powr(p, 1.0f / tone_param);
+    return s > 0.05f ? v : (s * v / 0.05f);
+}
+
+float clip(float s, float peak) {
+    return clamp(s * tone_param, 0.0f, 1.0f);
+}
+
+float reinhard(float s, float peak) {
+    return s / (s + tone_param) * (peak + tone_param) / peak;
+}
+
+float hable(float s, float peak) {
+    return hable_f(s) / hable_f(peak);
+}
+
+float mobius(float s, float peak) {
+    float j = tone_param;
+    float a, b;
+
+    if (s <= j)
+        return s;
+
+    a = -j * j * (peak - 1.0f) / (j * j - 2.0f * j + peak);
+    b = (j * j - 2.0f * j * peak + peak) / max(peak - 1.0f, 1e-6f);
+
+    return (b * b + 2.0f * b * j + j * j) / (b - a) * (s + a) / (s + b);
+}
+
+// detect peak/average signal of a frame, the algorithm was ported from:
+// libplacebo (https://github.com/haasn/libplacebo)
+struct detection_result
+detect_peak_avg(global uint *util_buf, __local uint *sum_wg,
+                float signal, float peak) {
+// layout of the util buffer
+//
+// Name:             : Size (units of 4-bytes)
+// average buffer    : detection_frames + 1
+// peak buffer       : detection_frames + 1
+// workgroup counter : 1
+// total of peak     : 1
+// total of average  : 1
+// frame index       : 1
+// frame number      : 1
+    global uint *avg_buf = util_buf;
+    global uint *peak_buf = avg_buf + DETECTION_FRAMES + 1;
+    global uint *counter_wg_p = peak_buf + DETECTION_FRAMES + 1;
+    global uint *max_total_p = counter_wg_p + 1;
+    global uint *avg_total_p = max_total_p + 1;
+    global uint *frame_idx_p = avg_total_p + 1;
+    global uint *scene_frame_num_p = frame_idx_p + 1;
+
+    uint frame_idx = *frame_idx_p;
+    uint scene_frame_num = *scene_frame_num_p;
+
+    size_t lidx = get_local_id(0);
+    size_t lidy = get_local_id(1);
+    size_t lsizex = get_local_size(0);
+    size_t lsizey = get_local_size(1);
+    uint num_wg = get_num_groups(0) * get_num_groups(1);
+    size_t group_idx = get_group_id(0);
+    size_t group_idy = get_group_id(1);
+    struct detection_result r = {peak, sdr_avg};
+    if (lidx == 0 && lidy == 0)
+        *sum_wg = 0;
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // update workgroup sum
+    atomic_add(sum_wg, (uint)(signal * REFERENCE_WHITE));
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // update frame peak/avg using work-group-average.
+    if (lidx == 0 && lidy == 0) {
+        uint avg_wg = *sum_wg / (lsizex * lsizey);
+        atomic_max(&peak_buf[frame_idx], avg_wg);
+        atomic_add(&avg_buf[frame_idx], avg_wg);
+    }
+
+    if (scene_frame_num > 0) {
+        float peak = (float)*max_total_p / (REFERENCE_WHITE * scene_frame_num);
+        float avg = (float)*avg_total_p / (REFERENCE_WHITE * scene_frame_num);
+        r.peak = max(1.0f, peak);
+        r.average = max(0.25f, avg);
+    }
+
+    if (lidx == 0 && lidy == 0 && atomic_add(counter_wg_p, 1) == num_wg - 1) {
+        *counter_wg_p = 0;
+        avg_buf[frame_idx] /= num_wg;
+
+        if (scene_threshold > 0.0f) {
+            uint cur_max = peak_buf[frame_idx];
+            uint cur_avg = avg_buf[frame_idx];
+            int diff = (int)(scene_frame_num * cur_avg) - (int)*avg_total_p;
+
+            if (abs(diff) > scene_frame_num * scene_threshold * REFERENCE_WHITE) {
+                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
+                    avg_buf[i] = 0;
+                for (uint i = 0; i < DETECTION_FRAMES + 1; i++)
+                    peak_buf[i] = 0;
+                *avg_total_p = *max_total_p = 0;
+                *scene_frame_num_p = 0;
+                avg_buf[frame_idx] = cur_avg;
+                peak_buf[frame_idx] = cur_max;
+            }
+        }
+        uint next = (frame_idx + 1) % (DETECTION_FRAMES + 1);
+        // add current frame, subtract next frame
+        *max_total_p += peak_buf[frame_idx] - peak_buf[next];
+        *avg_total_p += avg_buf[frame_idx] - avg_buf[next];
+        // reset next frame
+        peak_buf[next] = avg_buf[next] = 0;
+        *frame_idx_p = next;
+        *scene_frame_num_p = min(*scene_frame_num_p + 1,
+                                 (uint)DETECTION_FRAMES);
+    }
+    return r;
+}
+
+float3 map_one_pixel_rgb(float3 rgb, float peak, float average) {
+    float sig = max(max(rgb.x, max(rgb.y, rgb.z)), 1e-6f);
+
+    // Rescale the variables in order to bring it into a representation where
+    // 1.0 represents the dst_peak. This is because all of the tone mapping
+    // algorithms are defined in such a way that they map to the range [0.0, 1.0].
+    if (target_peak > 1.0f) {
+        sig *= 1.0f / target_peak;
+        peak *= 1.0f / target_peak;
+    }
+
+    float sig_old = sig;
+
+    // Scale the signal to compensate for differences in the average brightness
+    float slope = min(1.0f, sdr_avg / average);
+    sig *= slope;
+    peak *= slope;
+
+    // Desaturate the color using a coefficient dependent on the signal level
+    if (desat_param > 0.0f) {
+        float luma = get_luma_dst(rgb);
+        float coeff = max(sig - 0.18f, 1e-6f) / max(sig, 1e-6f);
+        coeff = native_powr(coeff, 10.0f / desat_param);
+        rgb = mix(rgb, (float3)luma, (float3)coeff);
+        sig = mix(sig, luma * slope, coeff);
+    }
+
+    sig = TONE_FUNC(sig, peak);
+
+    sig = min(sig, 1.0f);
+    rgb *= (sig / sig_old);
+    return rgb;
+}
+// map from source space YUV to destination space RGB
+float3 map_to_dst_space_from_yuv(float3 yuv, float peak) {
+    float3 c = yuv2lrgb(yuv);
+    c = ootf(c, peak);
+    c = lrgb2lrgb(c);
+    return c;
+}
+
+__kernel void tonemap(__write_only image2d_t dst1,
+                      __read_only  image2d_t src1,
+                      __write_only image2d_t dst2,
+                      __read_only  image2d_t src2,
+                      global uint *util_buf,
+                      float peak
+                      )
+{
+    __local uint sum_wg;
+    const sampler_t sampler = (CLK_NORMALIZED_COORDS_FALSE |
+                               CLK_ADDRESS_CLAMP_TO_EDGE   |
+                               CLK_FILTER_NEAREST);
+    int xi = get_global_id(0);
+    int yi = get_global_id(1);
+    // each work item process four pixels
+    int x = 2 * xi;
+    int y = 2 * yi;
+
+    float y0 = read_imagef(src1, sampler, (int2)(x, y)).x;
+    float y1 = read_imagef(src1, sampler, (int2)(x + 1, y)).x;
+    float y2 = read_imagef(src1, sampler, (int2)(x, y + 1)).x;
+    float y3 = read_imagef(src1, sampler, (int2)(x + 1, y + 1)).x;
+    float2 uv = read_imagef(src2, sampler, (int2)(xi, yi)).xy;
+
+    float3 c0 = map_to_dst_space_from_yuv((float3)(y0, uv.x, uv.y), peak);
+    float3 c1 = map_to_dst_space_from_yuv((float3)(y1, uv.x, uv.y), peak);
+    float3 c2 = map_to_dst_space_from_yuv((float3)(y2, uv.x, uv.y), peak);
+    float3 c3 = map_to_dst_space_from_yuv((float3)(y3, uv.x, uv.y), peak);
+
+    float sig0 = max(c0.x, max(c0.y, c0.z));
+    float sig1 = max(c1.x, max(c1.y, c1.z));
+    float sig2 = max(c2.x, max(c2.y, c2.z));
+    float sig3 = max(c3.x, max(c3.y, c3.z));
+    float sig = max(sig0, max(sig1, max(sig2, sig3)));
+
+    struct detection_result r = detect_peak_avg(util_buf, &sum_wg, sig, peak);
+
+    float3 c0_old = c0, c1_old = c1, c2_old = c2;
+    c0 = map_one_pixel_rgb(c0, r.peak, r.average);
+    c1 = map_one_pixel_rgb(c1, r.peak, r.average);
+    c2 = map_one_pixel_rgb(c2, r.peak, r.average);
+    c3 = map_one_pixel_rgb(c3, r.peak, r.average);
+
+    c0 = inverse_ootf(c0, target_peak);
+    c1 = inverse_ootf(c1, target_peak);
+    c2 = inverse_ootf(c2, target_peak);
+    c3 = inverse_ootf(c3, target_peak);
+
+    y0 = lrgb2y(c0);
+    y1 = lrgb2y(c1);
+    y2 = lrgb2y(c2);
+    y3 = lrgb2y(c3);
+    float3 chroma_c = get_chroma_sample(c0, c1, c2, c3);
+    float3 chroma = lrgb2yuv(chroma_c);
+
+    if (xi < get_image_width(dst2) && yi < get_image_height(dst2)) {
+        write_imagef(dst1, (int2)(x, y), (float4)(y0, 0.0f, 0.0f, 1.0f));
+        write_imagef(dst1, (int2)(x + 1, y), (float4)(y1, 0.0f, 0.0f, 1.0f));
+        write_imagef(dst1, (int2)(x, y + 1), (float4)(y2, 0.0f, 0.0f, 1.0f));
+        write_imagef(dst1, (int2)(x + 1, y + 1), (float4)(y3, 0.0f, 0.0f, 1.0f));
+        write_imagef(dst2, (int2)(xi, yi),
+                     (float4)(chroma.y, chroma.z, 0.0f, 1.0f));
+    }
+}