platform/external/ffmpeg.git - Unnamed repository; edit this file 'description' to name the repository.

1 /*
2  * Copyright (c) 2016 Clément Bœsch <u pkh me>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 /**
22  * @todo
23  * - SIMD for compute_safe_ssd_integral_image
24  * - SIMD for final weighted averaging
25  * - better automatic defaults? see "Parameters" @ http://www.ipol.im/pub/art/2011/bcm_nlm/
26  * - temporal support (probably doesn't need any displacement according to
27  *   "Denoising image sequences does not require motion estimation")
28  * - Bayer pixel format support for at least raw photos? (DNG support would be
29  *   handy here)
30  * - FATE test (probably needs visual threshold test mechanism due to the use
31  *   of floats)
32  */
33
34 #include "libavutil/avassert.h"
35 #include "libavutil/opt.h"
36 #include "libavutil/pixdesc.h"
37 #include "avfilter.h"
38 #include "formats.h"
39 #include "internal.h"
40 #include "video.h"
41
/* Accumulator used to build the weighted average of one output pixel. */
struct weighted_avg {
    double total_weight;    /* sum of the weights accumulated so far */
    double sum;             /* sum of the pixel values multiplied by their weight */
};
46
/* The weight lookup table holds 2^WEIGHT_LUT_NBITS entries. */
#define WEIGHT_LUT_NBITS 9
#define WEIGHT_LUT_SIZE  (1 << WEIGHT_LUT_NBITS)
49
50 typedef struct {
51     const AVClass *class;
52     int nb_planes;
53     int chroma_w, chroma_h;
54     double pdiff_scale;                         // invert of the filtering parameter (sigma*10) squared
55     double sigma;                               // denoising strength
56     int patch_size,    patch_hsize;             // patch size and half size
57     int patch_size_uv, patch_hsize_uv;          // patch size and half size for chroma planes
58     int research_size,    research_hsize;       // research size and half size
59     int research_size_uv, research_hsize_uv;    // research size and half size for chroma planes
60     uint32_t *ii_orig;                          // integral image
61     uint32_t *ii;                               // integral image starting after the 0-line and 0-column
62     int ii_w, ii_h;                             // width and height of the integral image
63     int ii_lz_32;                               // linesize in 32-bit units of the integral image
64     struct weighted_avg *wa;                    // weighted average of every pixel
65     int wa_linesize;                            // linesize for wa in struct size unit
66     double weight_lut[WEIGHT_LUT_SIZE];         // lookup table mapping (scaled) patch differences to their associated weights
67     double pdiff_lut_scale;                     // scale factor for patch differences before looking into the LUT
68     int max_meaningful_diff;                    // maximum difference considered (if the patch difference is too high we ignore the pixel)
69 } NLMeansContext;
70
71 #define OFFSET(x) offsetof(NLMeansContext, x)
72 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
73 static const AVOption nlmeans_options[] = {
74     { "s",  "denoising strength", OFFSET(sigma), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 30.0, FLAGS },
75     { "p",  "patch size",                   OFFSET(patch_size),    AV_OPT_TYPE_INT, { .i64 = 3*2+1 }, 0, 99, FLAGS },
76     { "pc", "patch size for chroma planes", OFFSET(patch_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },
77     { "r",  "research window",                   OFFSET(research_size),    AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },
78     { "rc", "research window for chroma planes", OFFSET(research_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 },     0, 99, FLAGS },
79     { NULL }
80 };
81
82 AVFILTER_DEFINE_CLASS(nlmeans);
83
84 static int query_formats(AVFilterContext *ctx)
85 {
86     static const enum AVPixelFormat pix_fmts[] = {
87         AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
88         AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
89         AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
90         AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ440P,
91         AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ420P,
92         AV_PIX_FMT_YUVJ411P,
93         AV_PIX_FMT_GRAY8, AV_PIX_FMT_GBRP,
94         AV_PIX_FMT_NONE
95     };
96
97     AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
98     if (!fmts_list)
99         return AVERROR(ENOMEM);
100     return ff_set_common_formats(ctx, fmts_list);
101 }
102
103 /*
104  * M is a discrete map where every entry contains the sum of all the entries
105  * in the rectangle from the top-left origin of M to its coordinate. In the
106  * following schema, "i" contains the sum of the whole map:
107  *
108  * M = +----------+-----------------+----+
109  *     |          |                 |    |
110  *     |          |                 |    |
111  *     |         a|                b|   c|
112  *     +----------+-----------------+----+
113  *     |          |                 |    |
114  *     |          |                 |    |
115  *     |          |        X        |    |
116  *     |          |                 |    |
117  *     |         d|                e|   f|
118  *     +----------+-----------------+----+
119  *     |          |                 |    |
120  *     |         g|                h|   i|
121  *     +----------+-----------------+----+
122  *
123  * The sum of the X box can be calculated with:
124  *    X = e-d-b+a
125  *
126  * See https://en.wikipedia.org/wiki/Summed_area_table
127  *
128  * The compute*_ssd functions compute the integral image M where every entry
129  * contains the sum of the squared difference of every corresponding pixels of
130  * two input planes of the same size as M.
131  */
/**
 * Get the sum of the integral image entries covered by the square patch of
 * half size p centered on (x,y), using the 4 corners formula X = e-d-b+a.
 *
 * The corners are combined with unsigned (modulo 2^32) arithmetic: the
 * integral image entries are uint32_t accumulators, so going through signed
 * integers would be undefined behaviour as soon as an entry does not fit in
 * an int, while the modular difference remains exact.
 *
 * @param ii        integral image; the entries one line above and one column
 *                  to the left of the patch are read
 * @param ii_lz_32  integral image linesize (in 32-bit integers unit)
 * @param x         patch center x position
 * @param y         patch center y position
 * @param p         patch half size
 * @return the patch sum converted to int
 */
static inline int get_integral_patch_value(const uint32_t *ii, int ii_lz_32, int x, int y, int p)
{
    const uint32_t e = ii[(y + p    ) * ii_lz_32 + (x + p    )];
    const uint32_t d = ii[(y + p    ) * ii_lz_32 + (x - p - 1)];
    const uint32_t b = ii[(y - p - 1) * ii_lz_32 + (x + p    )];
    const uint32_t a = ii[(y - p - 1) * ii_lz_32 + (x - p - 1)];

    return (int)(e - d - b + a);
}
140
141 /**
142  * Compute squared difference of the safe area (the zone where s1 and s2
143  * overlap). It is likely the largest integral zone, so it is interesting to do
144  * as little checks as possible; contrary to the unsafe version of this
145  * function, we do not need any clipping here.
146  *
147  * The line above dst and the column to its left are always readable.
148  *
149  * This C version computes the SSD integral image using a scalar accumulator,
150  * while for SIMD implementation it is likely more interesting to use the
151  * two-loops algorithm variant.
152  */
/*
 * Fill a w x h zone of the integral image with the running sum of the squared
 * differences between s1 and s2. No clipping is done on the sources; the
 * line above dst and the column to its left are read.
 */
static void compute_safe_ssd_integral_image_c(uint32_t *dst, int dst_linesize_32,
                                              const uint8_t *s1, int linesize1,
                                              const uint8_t *s2, int linesize2,
                                              int w, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        const uint32_t *above = dst - dst_linesize_32;
        /* part of the current line sum located on the left of the zone */
        uint32_t line_sum = dst[-1] - above[-1];
        int col;

        for (col = 0; col < w; col++) {
            const int diff = s1[col] - s2[col];

            line_sum += diff * diff;
            dst[col]  = above[col] + line_sum;
        }

        dst += dst_linesize_32;
        s1  += linesize1;
        s2  += linesize2;
    }
}
173
174 /**
175  * Compute squared difference of an unsafe area (a zone where s1 and/or s2
176  * may not be readable).
177  *
178  * On the other hand, the line above dst and the column to its left are always
179  * readable.
180  *
181  * There is little point in having this function SIMDified as it is likely too
182  * complex and only handle small portions of the image.
183  *
184  * @param dst               integral image
185  * @param dst_linesize_32   integral image linesize (in 32-bit integers unit)
186  * @param startx            integral starting x position
187  * @param starty            integral starting y position
188  * @param src               source plane buffer
189  * @param linesize          source plane linesize
190  * @param offx              source offsetting in x
191  * @param offy              source offsetting in y
192  * @param r                 absolute maximum source offsetting
193  * @param sw                source width
194  * @param sh                source height
195  * @param w                 width to compute
196  * @param h                 height to compute
197  */
/*
 * Fill the w x h zone starting at (startx,starty) of the integral image.
 * Both compared pixels are fetched from src at coordinates clipped to the
 * sw x sh plane: the first one is shifted by r, the second one by r plus the
 * (offx,offy) offset.
 */
static inline void compute_unsafe_ssd_integral_image(uint32_t *dst, int dst_linesize_32,
                                                     int startx, int starty,
                                                     const uint8_t *src, int linesize,
                                                     int offx, int offy, int r, int sw, int sh,
                                                     int w, int h)
{
    const int endx = startx + w;
    const int endy = starty + h;
    int x, y;

    for (y = starty; y < endy; y++) {
        uint32_t       *cur   = dst +  y      * dst_linesize_32;
        const uint32_t *above = dst + (y - 1) * dst_linesize_32;
        /* source lines used by this integral line, clipped to the plane */
        const int row1 = av_clip(y -  r,         0, sh - 1);
        const int row2 = av_clip(y - (r + offy), 0, sh - 1);
        /* part of the current line sum located on the left of the zone */
        uint32_t line_sum = cur[startx - 1] - above[startx - 1];

        for (x = startx; x < endx; x++) {
            const int col1 = av_clip(x -  r,         0, sw - 1);
            const int col2 = av_clip(x - (r + offx), 0, sw - 1);
            const uint8_t pix1 = src[row1*linesize + col1];
            const uint8_t pix2 = src[row2*linesize + col2];
            const int diff = pix1 - pix2;

            line_sum += diff * diff;
            cur[x]    = above[x] + line_sum;
        }
    }
}
222
223 /*
224  * Compute the sum of squared difference integral image
225  * http://www.ipol.im/pub/art/2014/57/
226  * Integral Images for Block Matching - Gabriele Facciolo, Nicolas Limare, Enric Meinhardt-Llopis
227  *
228  * @param ii                integral image of dimension (w+e*2) x (h+e*2) with
229  *                          an additional zeroed top line and column already
230  *                          "applied" to the pointer value
231  * @param ii_linesize_32    integral image linesize (in 32-bit integers unit)
232  * @param src               source plane buffer
233  * @param linesize          source plane linesize
234  * @param offx              x-offsetting ranging in [-e;e]
235  * @param offy              y-offsetting ranging in [-e;e]
236  * @param w                 source width
237  * @param h                 source height
238  * @param e                 research padding edge
239  */
/*
 * Build the whole integral image for one (offx,offy) offset. The zones are
 * filled in order (top, left, center, right, bottom) because every zone reads
 * the entries located above and on the left of it.
 */
static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32,
                                       const uint8_t *src, int linesize, int offx, int offy,
                                       int e, int w, int h)
{
    /* the integral image is the source plane with a padding of thickness
     * "e" on every side */
    const int padded_w = w + e*2;
    const int padded_h = h + e*2;

    /* the 1st source is centered, the 2nd one is the same frame shifted by
     * the offset */
    const int ref_x = e;
    const int ref_y = e;
    const int cmp_x = e + offx;
    const int cmp_y = e + offy;

    /* rectangle where both sources overlap, so where their pixels can be
     * compared without any clipping */
    const int safe_x0 = FFMAX(ref_x, cmp_x);
    const int safe_y0 = FFMAX(ref_y, cmp_y);
    const int safe_x1 = FFMIN(ref_x + w, cmp_x + w);
    const int safe_y1 = FFMIN(ref_y + h, cmp_y + h);
    const int safe_w  = safe_x1 - safe_x0;
    const int safe_h  = safe_y1 - safe_y0;

    /* top: full width band above the overlapping rectangle */
    compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
                                      0, 0,
                                      src, linesize,
                                      offx, offy, e, w, h,
                                      padded_w, safe_y0);

    /* left: column needed by the central zone */
    compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
                                      0, safe_y0,
                                      src, linesize,
                                      offx, offy, e, w, h,
                                      safe_x0, safe_h);

    /* center: overlapping rectangle, no clipping required */
    av_assert1(safe_x0 - ref_x >= 0); av_assert1(safe_x0 - ref_x < w);
    av_assert1(safe_y0 - ref_y >= 0); av_assert1(safe_y0 - ref_y < h);
    av_assert1(safe_x0 - cmp_x >= 0); av_assert1(safe_x0 - cmp_x < w);
    av_assert1(safe_y0 - cmp_y >= 0); av_assert1(safe_y0 - cmp_y < h);
    compute_safe_ssd_integral_image_c(ii + safe_y0*ii_linesize_32 + safe_x0, ii_linesize_32,
                                      src + (safe_y0 - ref_y) * linesize + (safe_x0 - ref_x), linesize,
                                      src + (safe_y0 - cmp_y) * linesize + (safe_x0 - cmp_x), linesize,
                                      safe_w, safe_h);

    /* right: column following the central zone */
    compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
                                      safe_x1, safe_y0,
                                      src, linesize,
                                      offx, offy, e, w, h,
                                      padded_w - safe_x1, safe_h);

    /* bottom: full width band below the overlapping rectangle */
    compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
                                      0, safe_y1,
                                      src, linesize,
                                      offx, offy, e, w, h,
                                      padded_w, padded_h - safe_y1);
}
302
303 static int config_input(AVFilterLink *inlink)
304 {
305     AVFilterContext *ctx = inlink->dst;
306     NLMeansContext *s = ctx->priv;
307     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
308     const int e = FFMAX(s->research_hsize, s->research_hsize_uv)
309                 + FFMAX(s->patch_hsize,    s->patch_hsize_uv);
310
311     s->chroma_w = FF_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
312     s->chroma_h = FF_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
313     s->nb_planes = av_pix_fmt_count_planes(inlink->format);
314
315     /* Allocate the integral image with extra edges of thickness "e"
316      *
317      *   +_+-------------------------------+
318      *   |0|0000000000000000000000000000000|
319      *   +-x-------------------------------+
320      *   |0|\    ^                         |
321      *   |0| ii  | e                       |
322      *   |0|     v                         |
323      *   |0|   +-----------------------+   |
324      *   |0|   |                       |   |
325      *   |0|<->|                       |   |
326      *   |0| e |                       |   |
327      *   |0|   |                       |   |
328      *   |0|   +-----------------------+   |
329      *   |0|                               |
330      *   |0|                               |
331      *   |0|                               |
332      *   +-+-------------------------------+
333      */
334     s->ii_w = inlink->w + e*2;
335     s->ii_h = inlink->h + e*2;
336
337     // align to 4 the linesize, "+1" is for the space of the left 0-column
338     s->ii_lz_32 = FFALIGN(s->ii_w + 1, 4);
339
340     // "+1" is for the space of the top 0-line
341     s->ii_orig = av_mallocz_array(s->ii_h + 1, s->ii_lz_32 * sizeof(*s->ii_orig));
342     if (!s->ii_orig)
343         return AVERROR(ENOMEM);
344
345     // skip top 0-line and left 0-column
346     s->ii = s->ii_orig + s->ii_lz_32 + 1;
347
348     // allocate weighted average for every pixel
349     s->wa_linesize = inlink->w;
350     s->wa = av_malloc_array(s->wa_linesize, inlink->h * sizeof(*s->wa));
351     if (!s->wa)
352         return AVERROR(ENOMEM);
353
354     return 0;
355 }
356
/* Parameters handed to nlmeans_slice() for one research window offset. */
struct thread_data {
    const uint8_t *src;         /* source plane pointer, already shifted by the offset */
    int src_linesize;           /* source plane linesize */
    int startx, starty;         /* first column and line to process */
    int endx, endy;             /* column and line at which processing stops (excluded) */
    const uint32_t *ii_start;   /* integral image pointer the patch values are read from */
    int p;                      /* patch half size */
};
365
366 static int nlmeans_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
367 {
368     int x, y;
369     NLMeansContext *s = ctx->priv;
370     const struct thread_data *td = arg;
371     const uint8_t *src = td->src;
372     const int src_linesize = td->src_linesize;
373     const int process_h = td->endy - td->starty;
374     const int slice_start = (process_h *  jobnr   ) / nb_jobs;
375     const int slice_end   = (process_h * (jobnr+1)) / nb_jobs;
376     const int starty = td->starty + slice_start;
377     const int endy   = td->starty + slice_end;
378
379     for (y = starty; y < endy; y++) {
380         for (x = td->startx; x < td->endx; x++) {
381             const int patch_diff_sq = get_integral_patch_value(td->ii_start, s->ii_lz_32, x, y, td->p);
382             if (patch_diff_sq < s->max_meaningful_diff) {
383                 struct weighted_avg *wa = &s->wa[y*s->wa_linesize + x];
384                 const int weight_lut_idx = patch_diff_sq * s->pdiff_lut_scale;
385                 const double weight = s->weight_lut[weight_lut_idx]; // exp(-patch_diff_sq * s->pdiff_scale)
386                 wa->total_weight += weight;
387                 wa->sum += weight * src[y*src_linesize + x];
388             }
389         }
390     }
391     return 0;
392 }
393
394 static int nlmeans_plane(AVFilterContext *ctx, int w, int h, int p, int r,
395                          uint8_t *dst, int dst_linesize,
396                          const uint8_t *src, int src_linesize)
397 {
398     int x, y;
399     int offx, offy;
400     NLMeansContext *s = ctx->priv;
401     /* patches center points cover the whole research window so the patches
402      * themselves overflow the research window */
403     const int e = r + p;
404     /* focus an integral pointer on the centered image (s1) */
405     const uint32_t *centered_ii = s->ii + e*s->ii_lz_32 + e;
406
407     memset(s->wa, 0, s->wa_linesize * h * sizeof(*s->wa));
408
409     for (offy = -r; offy <= r; offy++) {
410         for (offx = -r; offx <= r; offx++) {
411             if (offx || offy) {
412                 struct thread_data td = {
413                     .src          = src + offy*src_linesize + offx,
414                     .src_linesize = src_linesize,
415                     .startx       = FFMAX(0, -offx),
416                     .starty       = FFMAX(0, -offy),
417                     .endx         = FFMIN(w, w - offx),
418                     .endy         = FFMIN(h, h - offy),
419                     .ii_start     = centered_ii + offy*s->ii_lz_32 + offx,
420                     .p            = p,
421                 };
422
423                 compute_ssd_integral_image(s->ii, s->ii_lz_32,
424                                            src, src_linesize,
425                                            offx, offy, e, w, h);
426                 ctx->internal->execute(ctx, nlmeans_slice, &td, NULL,
427                                        FFMIN(td.endy - td.starty, ff_filter_get_nb_threads(ctx)));
428             }
429         }
430     }
431     for (y = 0; y < h; y++) {
432         for (x = 0; x < w; x++) {
433             struct weighted_avg *wa = &s->wa[y*s->wa_linesize + x];
434
435             // Also weight the centered pixel
436             wa->total_weight += 1.0;
437             wa->sum += 1.0 * src[y*src_linesize + x];
438
439             dst[y*dst_linesize + x] = av_clip_uint8(wa->sum / wa->total_weight);
440         }
441     }
442     return 0;
443 }
444
445 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
446 {
447     int i;
448     AVFilterContext *ctx = inlink->dst;
449     NLMeansContext *s = ctx->priv;
450     AVFilterLink *outlink = ctx->outputs[0];
451
452     AVFrame *out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
453     if (!out) {
454         av_frame_free(&in);
455         return AVERROR(ENOMEM);
456     }
457     av_frame_copy_props(out, in);
458
459     for (i = 0; i < s->nb_planes; i++) {
460         const int w = i ? s->chroma_w          : inlink->w;
461         const int h = i ? s->chroma_h          : inlink->h;
462         const int p = i ? s->patch_hsize_uv    : s->patch_hsize;
463         const int r = i ? s->research_hsize_uv : s->research_hsize;
464         nlmeans_plane(ctx, w, h, p, r,
465                       out->data[i], out->linesize[i],
466                       in->data[i],  in->linesize[i]);
467     }
468
469     av_frame_free(&in);
470     return ff_filter_frame(outlink, out);
471 }
472
/* Force the context field to an odd value, warning the user when it had to be
 * changed. Expects "s" (the NLMeansContext) and "ctx" to be in scope. */
#define CHECK_ODD_FIELD(field, name) do {                                   \
    if ((s->field & 1) == 0) {                                              \
        s->field |= 1;                                                      \
        av_log(ctx, AV_LOG_WARNING,                                         \
               name " size must be odd, setting it to %d\n", s->field);     \
    }                                                                       \
} while (0)
480
481 static av_cold int init(AVFilterContext *ctx)
482 {
483     int i;
484     NLMeansContext *s = ctx->priv;
485     const double h = s->sigma * 10.;
486
487     s->pdiff_scale = 1. / (h * h);
488     s->max_meaningful_diff = -log(1/255.) / s->pdiff_scale;
489     s->pdiff_lut_scale = 1./s->max_meaningful_diff * WEIGHT_LUT_SIZE;
490     av_assert0((s->max_meaningful_diff - 1) * s->pdiff_lut_scale < FF_ARRAY_ELEMS(s->weight_lut));
491     for (i = 0; i < WEIGHT_LUT_SIZE; i++)
492         s->weight_lut[i] = exp(-i / s->pdiff_lut_scale * s->pdiff_scale);
493
494     CHECK_ODD_FIELD(research_size,   "Luma research window");
495     CHECK_ODD_FIELD(patch_size,      "Luma patch");
496
497     if (!s->research_size_uv) s->research_size_uv = s->research_size;
498     if (!s->patch_size_uv)    s->patch_size_uv    = s->patch_size;
499
500     CHECK_ODD_FIELD(research_size_uv, "Chroma research window");
501     CHECK_ODD_FIELD(patch_size_uv,    "Chroma patch");
502
503     s->research_hsize    = s->research_size    / 2;
504     s->research_hsize_uv = s->research_size_uv / 2;
505     s->patch_hsize       = s->patch_size       / 2;
506     s->patch_hsize_uv    = s->patch_size_uv    / 2;
507
508     av_log(ctx, AV_LOG_INFO, "Research window: %dx%d / %dx%d, patch size: %dx%d / %dx%d\n",
509            s->research_size, s->research_size, s->research_size_uv, s->research_size_uv,
510            s->patch_size,    s->patch_size,    s->patch_size_uv,    s->patch_size_uv);
511
512     return 0;
513 }
514
515 static av_cold void uninit(AVFilterContext *ctx)
516 {
517     NLMeansContext *s = ctx->priv;
518     av_freep(&s->ii_orig);
519     av_freep(&s->wa);
520 }
521
522 static const AVFilterPad nlmeans_inputs[] = {
523     {
524         .name         = "default",
525         .type         = AVMEDIA_TYPE_VIDEO,
526         .config_props = config_input,
527         .filter_frame = filter_frame,
528     },
529     { NULL }
530 };
531
532 static const AVFilterPad nlmeans_outputs[] = {
533     {
534         .name = "default",
535         .type = AVMEDIA_TYPE_VIDEO,
536     },
537     { NULL }
538 };
539
540 AVFilter ff_vf_nlmeans = {
541     .name          = "nlmeans",
542     .description   = NULL_IF_CONFIG_SMALL("Non-local means denoiser."),
543     .priv_size     = sizeof(NLMeansContext),
544     .init          = init,
545     .uninit        = uninit,
546     .query_formats = query_formats,
547     .inputs        = nlmeans_inputs,
548     .outputs       = nlmeans_outputs,
549     .priv_class    = &nlmeans_class,
550     .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC | AVFILTER_FLAG_SLICE_THREADS,
551 };
552
1	/*
2	* Copyright (c) 2016 Clément Bœsch <u pkh me>
3	*
4	* This file is part of FFmpeg.
5	*
6	* FFmpeg is free software; you can redistribute it and/or
7	* modify it under the terms of the GNU Lesser General Public
8	* License as published by the Free Software Foundation; either
9	* version 2.1 of the License, or (at your option) any later version.
10	*
11	* FFmpeg is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	* Lesser General Public License for more details.
15	*
16	* You should have received a copy of the GNU Lesser General Public
17	* License along with FFmpeg; if not, write to the Free Software
18	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19	*/
20
21	/**
22	* @todo
23	* - SIMD for compute_safe_ssd_integral_image
24	* - SIMD for final weighted averaging
25	* - better automatic defaults? see "Parameters" @ http://www.ipol.im/pub/art/2011/bcm_nlm/
26	* - temporal support (probably doesn't need any displacement according to
27	* "Denoising image sequences does not require motion estimation")
28	* - Bayer pixel format support for at least raw photos? (DNG support would be
29	* handy here)
30	* - FATE test (probably needs visual threshold test mechanism due to the use
31	* of floats)
32	*/
33
34	#include "libavutil/avassert.h"
35	#include "libavutil/opt.h"
36	#include "libavutil/pixdesc.h"
37	#include "avfilter.h"
38	#include "formats.h"
39	#include "internal.h"
40	#include "video.h"
41
/* Accumulator used to build the weighted average of one output pixel. */
struct weighted_avg {
    double total_weight;    /* sum of the weights accumulated so far */
    double sum;             /* sum of the pixel values multiplied by their weight */
};
46
/* The weight lookup table holds 2^WEIGHT_LUT_NBITS entries. */
#define WEIGHT_LUT_NBITS 9
#define WEIGHT_LUT_SIZE  (1 << WEIGHT_LUT_NBITS)
49
50	typedef struct {
51	const AVClass *class;
52	int nb_planes;
53	int chroma_w, chroma_h;
54	double pdiff_scale; // invert of the filtering parameter (sigma*10) squared
55	double sigma; // denoising strength
56	int patch_size, patch_hsize; // patch size and half size
57	int patch_size_uv, patch_hsize_uv; // patch size and half size for chroma planes
58	int research_size, research_hsize; // research size and half size
59	int research_size_uv, research_hsize_uv; // research size and half size for chroma planes
60	uint32_t *ii_orig; // integral image
61	uint32_t *ii; // integral image starting after the 0-line and 0-column
62	int ii_w, ii_h; // width and height of the integral image
63	int ii_lz_32; // linesize in 32-bit units of the integral image
64	struct weighted_avg *wa; // weighted average of every pixel
65	int wa_linesize; // linesize for wa in struct size unit
66	double weight_lut[WEIGHT_LUT_SIZE]; // lookup table mapping (scaled) patch differences to their associated weights
67	double pdiff_lut_scale; // scale factor for patch differences before looking into the LUT
68	int max_meaningful_diff; // maximum difference considered (if the patch difference is too high we ignore the pixel)
69	} NLMeansContext;
70
71	#define OFFSET(x) offsetof(NLMeansContext, x)
72	#define FLAGS AV_OPT_FLAG_FILTERING_PARAM\|AV_OPT_FLAG_VIDEO_PARAM
73	static const AVOption nlmeans_options[] = {
74	{ "s", "denoising strength", OFFSET(sigma), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 1.0, 30.0, FLAGS },
75	{ "p", "patch size", OFFSET(patch_size), AV_OPT_TYPE_INT, { .i64 = 3*2+1 }, 0, 99, FLAGS },
76	{ "pc", "patch size for chroma planes", OFFSET(patch_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
77	{ "r", "research window", OFFSET(research_size), AV_OPT_TYPE_INT, { .i64 = 7*2+1 }, 0, 99, FLAGS },
78	{ "rc", "research window for chroma planes", OFFSET(research_size_uv), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, 99, FLAGS },
79	{ NULL }
80	};
81
82	AVFILTER_DEFINE_CLASS(nlmeans);
83
84	static int query_formats(AVFilterContext *ctx)
85	{
86	static const enum AVPixelFormat pix_fmts[] = {
87	AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
88	AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
89	AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
90	AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ440P,
91	AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ420P,
92	AV_PIX_FMT_YUVJ411P,
93	AV_PIX_FMT_GRAY8, AV_PIX_FMT_GBRP,
94	AV_PIX_FMT_NONE
95	};
96
97	AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
98	if (!fmts_list)
99	return AVERROR(ENOMEM);
100	return ff_set_common_formats(ctx, fmts_list);
101	}
102
103	/*
104	* M is a discrete map where every entry contains the sum of all the entries
105	* in the rectangle from the top-left origin of M to its coordinate. In the
106	* following schema, "i" contains the sum of the whole map:
107	*
108	* M = +----------+-----------------+----+
109	*     |          |                 |    |
110	*     |          |                 |    |
111	*     |         a|                b|   c|
112	*     +----------+-----------------+----+
113	*     |          |                 |    |
114	*     |          |                 |    |
115	*     |          |        X        |    |
116	*     |          |                 |    |
117	*     |         d|                e|   f|
118	*     +----------+-----------------+----+
119	*     |          |                 |    |
120	*     |         g|                h|   i|
121	* +----------+-----------------+----+
122	*
123	* The sum of the X box can be calculated with:
124	* X = e-d-b+a
125	*
126	* See https://en.wikipedia.org/wiki/Summed_area_table
127	*
128	* The compute*_ssd functions compute the integral image M where every entry
129	* contains the sum of the squared difference of every corresponding pixels of
130	* two input planes of the same size as M.
131	*/
/**
 * Get the sum of the integral image entries covered by the square patch of
 * half size p centered on (x,y), using the 4 corners formula X = e-d-b+a.
 *
 * The corners are combined with unsigned (modulo 2^32) arithmetic: the
 * integral image entries are uint32_t accumulators, so going through signed
 * integers would be undefined behaviour as soon as an entry does not fit in
 * an int, while the modular difference remains exact.
 *
 * @param ii        integral image; the entries one line above and one column
 *                  to the left of the patch are read
 * @param ii_lz_32  integral image linesize (in 32-bit integers unit)
 * @param x         patch center x position
 * @param y         patch center y position
 * @param p         patch half size
 * @return the patch sum converted to int
 */
static inline int get_integral_patch_value(const uint32_t *ii, int ii_lz_32, int x, int y, int p)
{
    const uint32_t e = ii[(y + p    ) * ii_lz_32 + (x + p    )];
    const uint32_t d = ii[(y + p    ) * ii_lz_32 + (x - p - 1)];
    const uint32_t b = ii[(y - p - 1) * ii_lz_32 + (x + p    )];
    const uint32_t a = ii[(y - p - 1) * ii_lz_32 + (x - p - 1)];

    return (int)(e - d - b + a);
}
140
141	/**
142	* Compute squared difference of the safe area (the zone where s1 and s2
143	* overlap). It is likely the largest integral zone, so it is interesting to do
144	* as little checks as possible; contrary to the unsafe version of this
145	* function, we do not need any clipping here.
146	*
147	* The line above dst and the column to its left are always readable.
148	*
149	* This C version computes the SSD integral image using a scalar accumulator,
150	* while for SIMD implementation it is likely more interesting to use the
151	* two-loops algorithm variant.
152	*/
/*
 * Fill a w x h zone of the integral image with the running sum of the squared
 * differences between s1 and s2. No clipping is done on the sources; the
 * line above dst and the column to its left are read.
 */
static void compute_safe_ssd_integral_image_c(uint32_t *dst, int dst_linesize_32,
                                              const uint8_t *s1, int linesize1,
                                              const uint8_t *s2, int linesize2,
                                              int w, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        const uint32_t *above = dst - dst_linesize_32;
        /* part of the current line sum located on the left of the zone */
        uint32_t line_sum = dst[-1] - above[-1];
        int col;

        for (col = 0; col < w; col++) {
            const int diff = s1[col] - s2[col];

            line_sum += diff * diff;
            dst[col]  = above[col] + line_sum;
        }

        dst += dst_linesize_32;
        s1  += linesize1;
        s2  += linesize2;
    }
}
173
/**
 * Compute squared difference of an unsafe area (a zone where s1, s2, or both
 * may not be readable).
 *
 * Source coordinates are clipped to the plane, so out-of-plane positions read
 * the nearest edge pixel.
 *
 * On the other hand, the line above dst and the column to its left are always
 * readable.
 *
 * There is little point in having this function SIMDified as it is likely too
 * complex and only handle small portions of the image.
 *
 * @param dst             integral image
 * @param dst_linesize_32 integral image linesize (in 32-bit integers unit)
 * @param startx          integral starting x position
 * @param starty          integral starting y position
 * @param src             source plane buffer
 * @param linesize        source plane linesize
 * @param offx            source offsetting in x
 * @param offy            source offsetting in y
 * @param r               absolute maximum source offsetting
 * @param sw              source width
 * @param sh              source height
 * @param w               width to compute
 * @param h               height to compute
 */
static inline void compute_unsafe_ssd_integral_image(uint32_t *dst, int dst_linesize_32,
                                                     int startx, int starty,
                                                     const uint8_t *src, int linesize,
                                                     int offx, int offy, int r, int sw, int sh,
                                                     int w, int h)
{
    int x, y;

    for (y = starty; y < starty + h; y++) {
        // running sum of the current line: what is already accumulated on the
        // left of startx, minus the contribution of the lines above
        uint32_t acc = dst[y*dst_linesize_32 + startx - 1] - dst[(y-1)*dst_linesize_32 + startx - 1];
        const int s1y = av_clip(y -  r,         0, sh - 1);
        const int s2y = av_clip(y - (r + offy), 0, sh - 1);

        for (x = startx; x < startx + w; x++) {
            const int s1x = av_clip(x -  r,         0, sw - 1);
            const int s2x = av_clip(x - (r + offx), 0, sw - 1);
            const uint8_t v1 = src[s1y*linesize + s1x];
            const uint8_t v2 = src[s2y*linesize + s2x];
            const int d = v1 - v2;
            acc += d * d;
            dst[y*dst_linesize_32 + x] = dst[(y-1)*dst_linesize_32 + x] + acc;
        }
    }
}
222
/*
 * Compute the sum of squared difference integral image
 * http://www.ipol.im/pub/art/2014/57/
 * Integral Images for Block Matching - Gabriele Facciolo, Nicolas Limare, Enric Meinhardt-Llopis
 *
 * The integral is filled in five passes: top band, left column, central
 * overlapping (safe) rectangle, right column and bottom band. Only the
 * central rectangle uses the clipping-free routine.
 *
 * @param ii              integral image of dimension (w+e*2) x (h+e*2) with
 *                        an additional zeroed top line and column already
 *                        "applied" to the pointer value
 * @param ii_linesize_32  integral image linesize (in 32-bit integers unit)
 * @param src             source plane buffer
 * @param linesize        source plane linesize
 * @param offx            x-offsetting ranging in [-e;e]
 * @param offy            y-offsetting ranging in [-e;e]
 * @param w               source width
 * @param h               source height
 * @param e               research padding edge
 */
static void compute_ssd_integral_image(uint32_t *ii, int ii_linesize_32,
                                       const uint8_t *src, int linesize, int offx, int offy,
                                       int e, int w, int h)
{
    // dimensions of the integral, padding of thickness "e" included
    const int padded_w = w + e*2;
    const int padded_h = h + e*2;

    // position of the first (centered) source inside the integral
    const int ref_x = e;
    const int ref_y = e;

    // position of the second (offsetted) source inside the integral
    const int cmp_x = e + offx;
    const int cmp_y = e + offy;

    // rectangle shared by both sources: pixels can be compared there without
    // any clipping
    const int safe_x0 = FFMAX(ref_x, cmp_x);
    const int safe_y0 = FFMAX(ref_y, cmp_y);
    const int safe_x1 = FFMIN(ref_x + w, cmp_x + w);
    const int safe_y1 = FFMIN(ref_y + h, cmp_y + h);
    const int safe_w  = safe_x1 - safe_x0;
    const int safe_h  = safe_y1 - safe_y0;

    // top band: at most one of the two sources is readable
    compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
                                      0, 0,
                                      src, linesize,
                                      offx, offy, e, w, h,
                                      padded_w, safe_y0);

    // left column: needed before the central rectangle can be accumulated
    compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
                                      0, safe_y0,
                                      src, linesize,
                                      offx, offy, e, w, h,
                                      safe_x0, safe_h);

    // central rectangle: the top-left safe corner must map inside both sources
    av_assert1(safe_x0 - ref_x >= 0); av_assert1(safe_x0 - ref_x < w);
    av_assert1(safe_y0 - ref_y >= 0); av_assert1(safe_y0 - ref_y < h);
    av_assert1(safe_x0 - cmp_x >= 0); av_assert1(safe_x0 - cmp_x < w);
    av_assert1(safe_y0 - cmp_y >= 0); av_assert1(safe_y0 - cmp_y < h);
    compute_safe_ssd_integral_image_c(ii + safe_y0*ii_linesize_32 + safe_x0, ii_linesize_32,
                                      src + (safe_y0 - ref_y) * linesize + (safe_x0 - ref_x), linesize,
                                      src + (safe_y0 - cmp_y) * linesize + (safe_x0 - cmp_x), linesize,
                                      safe_w, safe_h);

    // right column
    compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
                                      safe_x1, safe_y0,
                                      src, linesize,
                                      offx, offy, e, w, h,
                                      padded_w - safe_x1, safe_h);

    // bottom band: at most one of the two sources is readable
    compute_unsafe_ssd_integral_image(ii, ii_linesize_32,
                                      0, safe_y1,
                                      src, linesize,
                                      offx, offy, e, w, h,
                                      padded_w, padded_h - safe_y1);
}
302
303	static int config_input(AVFilterLink *inlink)
304	{
305	AVFilterContext *ctx = inlink->dst;
306	NLMeansContext *s = ctx->priv;
307	const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
308	const int e = FFMAX(s->research_hsize, s->research_hsize_uv)
309	+ FFMAX(s->patch_hsize, s->patch_hsize_uv);
310
311	s->chroma_w = FF_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
312	s->chroma_h = FF_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
313	s->nb_planes = av_pix_fmt_count_planes(inlink->format);
314
315	/* Allocate the integral image with extra edges of thickness "e"
316	*
317	* +_+-------------------------------+
318	* \|0\|0000000000000000000000000000000\|
319	* +-x-------------------------------+
320	* \|0\|\ ^ \|
321	* \|0\| ii \| e \|
322	* \|0\| v \|
323	* \|0\| +-----------------------+ \|
324	* \|0\| \| \| \|
325	* \|0\|<->\| \| \|
326	* \|0\| e \| \| \|
327	* \|0\| \| \| \|
328	* \|0\| +-----------------------+ \|
329	* \|0\| \|
330	* \|0\| \|
331	* \|0\| \|
332	* +-+-------------------------------+
333	*/
334	s->ii_w = inlink->w + e*2;
335	s->ii_h = inlink->h + e*2;
336
337	// align to 4 the linesize, "+1" is for the space of the left 0-column
338	s->ii_lz_32 = FFALIGN(s->ii_w + 1, 4);
339
340	// "+1" is for the space of the top 0-line
341	s->ii_orig = av_mallocz_array(s->ii_h + 1, s->ii_lz_32 * sizeof(*s->ii_orig));
342	if (!s->ii_orig)
343	return AVERROR(ENOMEM);
344
345	// skip top 0-line and left 0-column
346	s->ii = s->ii_orig + s->ii_lz_32 + 1;
347
348	// allocate weighted average for every pixel
349	s->wa_linesize = inlink->w;
350	s->wa = av_malloc_array(s->wa_linesize, inlink->h * sizeof(*s->wa));
351	if (!s->wa)
352	return AVERROR(ENOMEM);
353
354	return 0;
355	}
356
/* Job description shared by all the nlmeans_slice() jobs of one offset */
struct thread_data {
    const uint8_t *src;         // source plane pointer read by the slices
    int src_linesize;           // linesize of src
    int startx;                 // first column to process (inclusive)
    int starty;                 // first line to process (inclusive)
    int endx;                   // last column to process (exclusive)
    int endy;                   // last line to process (exclusive)
    const uint32_t *ii_start;   // integral image pointer used for patch lookups
    int p;                      // patch half size passed to the patch lookup
};
365
366	static int nlmeans_slice(AVFilterContext ctx, void arg, int jobnr, int nb_jobs)
367	{
368	int x, y;
369	NLMeansContext *s = ctx->priv;
370	const struct thread_data *td = arg;
371	const uint8_t *src = td->src;
372	const int src_linesize = td->src_linesize;
373	const int process_h = td->endy - td->starty;
374	const int slice_start = (process_h * jobnr ) / nb_jobs;
375	const int slice_end = (process_h * (jobnr+1)) / nb_jobs;
376	const int starty = td->starty + slice_start;
377	const int endy = td->starty + slice_end;
378
379	for (y = starty; y < endy; y++) {
380	for (x = td->startx; x < td->endx; x++) {
381	const int patch_diff_sq = get_integral_patch_value(td->ii_start, s->ii_lz_32, x, y, td->p);
382	if (patch_diff_sq < s->max_meaningful_diff) {
383	struct weighted_avg wa = &s->wa[ys->wa_linesize + x];
384	const int weight_lut_idx = patch_diff_sq * s->pdiff_lut_scale;
385	const double weight = s->weight_lut[weight_lut_idx]; // exp(-patch_diff_sq * s->pdiff_scale)
386	wa->total_weight += weight;
387	wa->sum += weight * src[y*src_linesize + x];
388	}
389	}
390	}
391	return 0;
392	}
393
394	static int nlmeans_plane(AVFilterContext *ctx, int w, int h, int p, int r,
395	uint8_t *dst, int dst_linesize,
396	const uint8_t *src, int src_linesize)
397	{
398	int x, y;
399	int offx, offy;
400	NLMeansContext *s = ctx->priv;
401	/* patches center points cover the whole research window so the patches
402	* themselves overflow the research window */
403	const int e = r + p;
404	/* focus an integral pointer on the centered image (s1) */
405	const uint32_t centered_ii = s->ii + es->ii_lz_32 + e;
406
407	memset(s->wa, 0, s->wa_linesize * h * sizeof(*s->wa));
408
409	for (offy = -r; offy <= r; offy++) {
410	for (offx = -r; offx <= r; offx++) {
411	if (offx \|\| offy) {
412	struct thread_data td = {
413	.src = src + offy*src_linesize + offx,
414	.src_linesize = src_linesize,
415	.startx = FFMAX(0, -offx),
416	.starty = FFMAX(0, -offy),
417	.endx = FFMIN(w, w - offx),
418	.endy = FFMIN(h, h - offy),
419	.ii_start = centered_ii + offy*s->ii_lz_32 + offx,
420	.p = p,
421	};
422
423	compute_ssd_integral_image(s->ii, s->ii_lz_32,
424	src, src_linesize,
425	offx, offy, e, w, h);
426	ctx->internal->execute(ctx, nlmeans_slice, &td, NULL,
427	FFMIN(td.endy - td.starty, ff_filter_get_nb_threads(ctx)));
428	}
429	}
430	}
431	for (y = 0; y < h; y++) {
432	for (x = 0; x < w; x++) {
433	struct weighted_avg wa = &s->wa[ys->wa_linesize + x];
434
435	// Also weight the centered pixel
436	wa->total_weight += 1.0;
437	wa->sum += 1.0 * src[y*src_linesize + x];
438
439	dst[y*dst_linesize + x] = av_clip_uint8(wa->sum / wa->total_weight);
440	}
441	}
442	return 0;
443	}
444
445	static int filter_frame(AVFilterLink inlink, AVFrame in)
446	{
447	int i;
448	AVFilterContext *ctx = inlink->dst;
449	NLMeansContext *s = ctx->priv;
450	AVFilterLink *outlink = ctx->outputs[0];
451
452	AVFrame *out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
453	if (!out) {
454	av_frame_free(&in);
455	return AVERROR(ENOMEM);
456	}
457	av_frame_copy_props(out, in);
458
459	for (i = 0; i < s->nb_planes; i++) {
460	const int w = i ? s->chroma_w : inlink->w;
461	const int h = i ? s->chroma_h : inlink->h;
462	const int p = i ? s->patch_hsize_uv : s->patch_hsize;
463	const int r = i ? s->research_hsize_uv : s->research_hsize;
464	nlmeans_plane(ctx, w, h, p, r,
465	out->data[i], out->linesize[i],
466	in->data[i], in->linesize[i]);
467	}
468
469	av_frame_free(&in);
470	return ff_filter_frame(outlink, out);
471	}
472
/* Force s->field to be odd (by setting its lowest bit) and warn about the
 * adjustment; expects "s" and "ctx" to be in scope at the expansion site */
#define CHECK_ODD_FIELD(field, name) do {                       \
    if (!(s->field & 1)) {                                      \
        s->field |= 1;                                          \
        av_log(ctx, AV_LOG_WARNING, name " size must be odd, "  \
               "setting it to %d\n", s->field);                 \
    }                                                           \
} while (0)
480
481	static av_cold int init(AVFilterContext *ctx)
482	{
483	int i;
484	NLMeansContext *s = ctx->priv;
485	const double h = s->sigma * 10.;
486
487	s->pdiff_scale = 1. / (h * h);
488	s->max_meaningful_diff = -log(1/255.) / s->pdiff_scale;
489	s->pdiff_lut_scale = 1./s->max_meaningful_diff * WEIGHT_LUT_SIZE;
490	av_assert0((s->max_meaningful_diff - 1) * s->pdiff_lut_scale < FF_ARRAY_ELEMS(s->weight_lut));
491	for (i = 0; i < WEIGHT_LUT_SIZE; i++)
492	s->weight_lut[i] = exp(-i / s->pdiff_lut_scale * s->pdiff_scale);
493
494	CHECK_ODD_FIELD(research_size, "Luma research window");
495	CHECK_ODD_FIELD(patch_size, "Luma patch");
496
497	if (!s->research_size_uv) s->research_size_uv = s->research_size;
498	if (!s->patch_size_uv) s->patch_size_uv = s->patch_size;
499
500	CHECK_ODD_FIELD(research_size_uv, "Chroma research window");
501	CHECK_ODD_FIELD(patch_size_uv, "Chroma patch");
502
503	s->research_hsize = s->research_size / 2;
504	s->research_hsize_uv = s->research_size_uv / 2;
505	s->patch_hsize = s->patch_size / 2;
506	s->patch_hsize_uv = s->patch_size_uv / 2;
507
508	av_log(ctx, AV_LOG_INFO, "Research window: %dx%d / %dx%d, patch size: %dx%d / %dx%d\n",
509	s->research_size, s->research_size, s->research_size_uv, s->research_size_uv,
510	s->patch_size, s->patch_size, s->patch_size_uv, s->patch_size_uv);
511
512	return 0;
513	}
514
515	static av_cold void uninit(AVFilterContext *ctx)
516	{
517	NLMeansContext *s = ctx->priv;
518	av_freep(&s->ii_orig);
519	av_freep(&s->wa);
520	}
521
522	static const AVFilterPad nlmeans_inputs[] = {
523	{
524	.name = "default",
525	.type = AVMEDIA_TYPE_VIDEO,
526	.config_props = config_input,
527	.filter_frame = filter_frame,
528	},
529	{ NULL }
530	};
531
532	static const AVFilterPad nlmeans_outputs[] = {
533	{
534	.name = "default",
535	.type = AVMEDIA_TYPE_VIDEO,
536	},
537	{ NULL }
538	};
539
540	AVFilter ff_vf_nlmeans = {
541	.name = "nlmeans",
542	.description = NULL_IF_CONFIG_SMALL("Non-local means denoiser."),
543	.priv_size = sizeof(NLMeansContext),
544	.init = init,
545	.uninit = uninit,
546	.query_formats = query_formats,
547	.inputs = nlmeans_inputs,
548	.outputs = nlmeans_outputs,
549	.priv_class = &nlmeans_class,
550	.flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC \| AVFILTER_FLAG_SLICE_THREADS,
551	};
552