summaryrefslogtreecommitdiffstats
path: root/libavfilter
diff options
context:
space:
mode:
authorClément Bœsch <u@pkh.me>2018-05-06 10:38:30 +0200
committerClément Bœsch <u@pkh.me>2018-05-08 10:28:06 +0200
commit26f02c51ce9797e392f956a268c373b353b21178 (patch)
treee9f3b55d59a24a45125a285ab52767f4426de50b /libavfilter
parentf1248b7795857e14ee31a15ad79583bdda7c4a90 (diff)
downloadffmpeg-streaming-26f02c51ce9797e392f956a268c373b353b21178.zip
ffmpeg-streaming-26f02c51ce9797e392f956a268c373b353b21178.tar.gz
lavfi/nlmeans: add SIMD-friendly assumptions for compute_safe_ssd_integral_image
SIMD code will not have to deal with padding itself. Overwriting in that function may have been possible but involve large overreading of the sources. Instead, we simply make sure the width to process is always a multiple of 16. Additionally, there must be some actual area to process so the SIMD code can have its boundary checks after processing the first pixels.
Diffstat (limited to 'libavfilter')
-rw-r--r--libavfilter/vf_nlmeans.c25
1 files changed, 18 insertions, 7 deletions
diff --git a/libavfilter/vf_nlmeans.c b/libavfilter/vf_nlmeans.c
index d222d39..3f0a43e 100644
--- a/libavfilter/vf_nlmeans.c
+++ b/libavfilter/vf_nlmeans.c
@@ -157,6 +157,9 @@ static void compute_safe_ssd_integral_image_c(uint32_t *dst, int dst_linesize_32
{
int x, y;
+ /* SIMD-friendly assumptions allowed here */
+ av_assert2(!(w & 0xf) && w >= 16 && h >= 1);
+
for (y = 0; y < h; y++) {
uint32_t acc = dst[-1] - dst[-dst_linesize_32 - 1];
@@ -257,9 +260,16 @@ static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32,
// to compare the 2 sources pixels
const int startx_safe = FFMAX(s1x, s2x);
const int starty_safe = FFMAX(s1y, s2y);
- const int endx_safe = FFMIN(s1x + w, s2x + w);
+ const int u_endx_safe = FFMIN(s1x + w, s2x + w); // unaligned
const int endy_safe = FFMIN(s1y + h, s2y + h);
+ // deduce the safe area width and height
+ const int safe_pw = (u_endx_safe - startx_safe) & ~0xf;
+ const int safe_ph = endy_safe - starty_safe;
+
+ // adjusted end x position of the safe area after width of the safe area gets aligned
+ const int endx_safe = startx_safe + safe_pw;
+
// top part where only one of s1 and s2 is still readable, or none at all
compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
0, 0,
@@ -273,24 +283,25 @@ static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32,
0, starty_safe,
src, linesize,
offx, offy, e, w, h,
- startx_safe, endy_safe - starty_safe);
+ startx_safe, safe_ph);
// main and safe part of the integral
av_assert1(startx_safe - s1x >= 0); av_assert1(startx_safe - s1x < w);
av_assert1(starty_safe - s1y >= 0); av_assert1(starty_safe - s1y < h);
av_assert1(startx_safe - s2x >= 0); av_assert1(startx_safe - s2x < w);
av_assert1(starty_safe - s2y >= 0); av_assert1(starty_safe - s2y < h);
- compute_safe_ssd_integral_image_c(ii + starty_safe*ii_linesize_32 + startx_safe, ii_linesize_32,
- src + (starty_safe - s1y) * linesize + (startx_safe - s1x), linesize,
- src + (starty_safe - s2y) * linesize + (startx_safe - s2x), linesize,
- endx_safe - startx_safe, endy_safe - starty_safe);
+ if (safe_pw && safe_ph)
+ compute_safe_ssd_integral_image_c(ii + starty_safe*ii_linesize_32 + startx_safe, ii_linesize_32,
+ src + (starty_safe - s1y) * linesize + (startx_safe - s1x), linesize,
+ src + (starty_safe - s2y) * linesize + (startx_safe - s2x), linesize,
+ safe_pw, safe_ph);
// right part of the integral
compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
endx_safe, starty_safe,
src, linesize,
offx, offy, e, w, h,
- ii_w - endx_safe, endy_safe - starty_safe);
+ ii_w - endx_safe, safe_ph);
// bottom part where only one of s1 and s2 is still readable, or none at all
compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
OpenPOWER on IntegriCloud