blob: 9d91c76047616a7545fba06aecf2ca6d052c9b73
1 | /* |
2 | * Copyright (c) 2016 Kyle Swanson <k@ylo.ph>. |
3 | * |
4 | * This file is part of FFmpeg. |
5 | * |
6 | * FFmpeg is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Lesser General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2.1 of the License, or (at your option) any later version. |
10 | * |
11 | * FFmpeg is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Lesser General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Lesser General Public |
17 | * License along with FFmpeg; if not, write to the Free Software |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 | */ |
20 | |
21 | /* http://k.ylo.ph/2016/04/04/loudnorm.html */ |
22 | |
23 | #include "libavutil/opt.h" |
24 | #include "avfilter.h" |
25 | #include "internal.h" |
26 | #include "audio.h" |
27 | #include "ebur128.h" |
28 | |
/*
 * Processing mode stored in LoudNormContext.frame_type and dispatched on
 * by filter_frame().
 */
enum FrameType {
    FIRST_FRAME = 0, /* set by config_input() unless linear mode is chosen */
    INNER_FRAME,     /* entered once the FIRST_FRAME case has run */
    FINAL_FRAME,     /* set by request_frame() when the input reports EOF */
    LINEAR_MODE,     /* samples are only multiplied by a constant gain */
    FRAME_NB         /* number of frame types; keep last */
};
36 | |
/* States of the state machine driven by true_peak_limiter(). */
enum LimiterState {
    OUT = 0,  /* no gain reduction active; scanning for the next peak */
    ATTACK,   /* ramping gain from gain_reduction[0] to gain_reduction[1] */
    SUSTAIN,  /* holding gain_reduction[1] */
    RELEASE,  /* ramping gain back towards 1.0 */
    STATE_NB  /* number of states; keep last */
};
44 | |
/* Format of the statistics logged by uninit(). */
enum PrintFormat {
    NONE = 0, /* log nothing */
    JSON,
    SUMMARY,
    PF_NB     /* number of formats; keep last */
};
51 | |
52 | typedef struct LoudNormContext { |
53 | const AVClass *class; |
54 | double target_i; |
55 | double target_lra; |
56 | double target_tp; |
57 | double measured_i; |
58 | double measured_lra; |
59 | double measured_tp; |
60 | double measured_thresh; |
61 | double offset; |
62 | int linear; |
63 | int dual_mono; |
64 | enum PrintFormat print_format; |
65 | |
66 | double *buf; |
67 | int buf_size; |
68 | int buf_index; |
69 | int prev_buf_index; |
70 | |
71 | double delta[30]; |
72 | double weights[21]; |
73 | double prev_delta; |
74 | int index; |
75 | |
76 | double gain_reduction[2]; |
77 | double *limiter_buf; |
78 | double *prev_smp; |
79 | int limiter_buf_index; |
80 | int limiter_buf_size; |
81 | enum LimiterState limiter_state; |
82 | int peak_index; |
83 | int env_index; |
84 | int env_cnt; |
85 | int attack_length; |
86 | int release_length; |
87 | |
88 | int64_t pts; |
89 | enum FrameType frame_type; |
90 | int above_threshold; |
91 | int prev_nb_samples; |
92 | int channels; |
93 | |
94 | FFEBUR128State *r128_in; |
95 | FFEBUR128State *r128_out; |
96 | } LoudNormContext; |
97 | |
98 | #define OFFSET(x) offsetof(LoudNormContext, x) |
99 | #define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM |
100 | |
101 | static const AVOption loudnorm_options[] = { |
102 | { "I", "set integrated loudness target", OFFSET(target_i), AV_OPT_TYPE_DOUBLE, {.dbl = -24.}, -70., -5., FLAGS }, |
103 | { "i", "set integrated loudness target", OFFSET(target_i), AV_OPT_TYPE_DOUBLE, {.dbl = -24.}, -70., -5., FLAGS }, |
104 | { "LRA", "set loudness range target", OFFSET(target_lra), AV_OPT_TYPE_DOUBLE, {.dbl = 7.}, 1., 20., FLAGS }, |
105 | { "lra", "set loudness range target", OFFSET(target_lra), AV_OPT_TYPE_DOUBLE, {.dbl = 7.}, 1., 20., FLAGS }, |
106 | { "TP", "set maximum true peak", OFFSET(target_tp), AV_OPT_TYPE_DOUBLE, {.dbl = -2.}, -9., 0., FLAGS }, |
107 | { "tp", "set maximum true peak", OFFSET(target_tp), AV_OPT_TYPE_DOUBLE, {.dbl = -2.}, -9., 0., FLAGS }, |
108 | { "measured_I", "measured IL of input file", OFFSET(measured_i), AV_OPT_TYPE_DOUBLE, {.dbl = 0.}, -99., 0., FLAGS }, |
109 | { "measured_i", "measured IL of input file", OFFSET(measured_i), AV_OPT_TYPE_DOUBLE, {.dbl = 0.}, -99., 0., FLAGS }, |
110 | { "measured_LRA", "measured LRA of input file", OFFSET(measured_lra), AV_OPT_TYPE_DOUBLE, {.dbl = 0.}, 0., 99., FLAGS }, |
111 | { "measured_lra", "measured LRA of input file", OFFSET(measured_lra), AV_OPT_TYPE_DOUBLE, {.dbl = 0.}, 0., 99., FLAGS }, |
112 | { "measured_TP", "measured true peak of input file", OFFSET(measured_tp), AV_OPT_TYPE_DOUBLE, {.dbl = 99.}, -99., 99., FLAGS }, |
113 | { "measured_tp", "measured true peak of input file", OFFSET(measured_tp), AV_OPT_TYPE_DOUBLE, {.dbl = 99.}, -99., 99., FLAGS }, |
114 | { "measured_thresh", "measured threshold of input file", OFFSET(measured_thresh), AV_OPT_TYPE_DOUBLE, {.dbl = -70.}, -99., 0., FLAGS }, |
115 | { "offset", "set offset gain", OFFSET(offset), AV_OPT_TYPE_DOUBLE, {.dbl = 0.}, -99., 99., FLAGS }, |
116 | { "linear", "normalize linearly if possible", OFFSET(linear), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS }, |
117 | { "dual_mono", "treat mono input as dual-mono", OFFSET(dual_mono), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, FLAGS }, |
118 | { "print_format", "set print format for stats", OFFSET(print_format), AV_OPT_TYPE_INT, {.i64 = NONE}, NONE, PF_NB -1, FLAGS, "print_format" }, |
119 | { "none", 0, 0, AV_OPT_TYPE_CONST, {.i64 = NONE}, 0, 0, FLAGS, "print_format" }, |
120 | { "json", 0, 0, AV_OPT_TYPE_CONST, {.i64 = JSON}, 0, 0, FLAGS, "print_format" }, |
121 | { "summary", 0, 0, AV_OPT_TYPE_CONST, {.i64 = SUMMARY}, 0, 0, FLAGS, "print_format" }, |
122 | { NULL } |
123 | }; |
124 | |
125 | AVFILTER_DEFINE_CLASS(loudnorm); |
126 | |
127 | static inline int frame_size(int sample_rate, int frame_len_msec) |
128 | { |
129 | const int frame_size = round((double)sample_rate * (frame_len_msec / 1000.0)); |
130 | return frame_size + (frame_size % 2); |
131 | } |
132 | |
133 | static void init_gaussian_filter(LoudNormContext *s) |
134 | { |
135 | double total_weight = 0.0; |
136 | const double sigma = 3.5; |
137 | double adjust; |
138 | int i; |
139 | |
140 | const int offset = 21 / 2; |
141 | const double c1 = 1.0 / (sigma * sqrt(2.0 * M_PI)); |
142 | const double c2 = 2.0 * pow(sigma, 2.0); |
143 | |
144 | for (i = 0; i < 21; i++) { |
145 | const int x = i - offset; |
146 | s->weights[i] = c1 * exp(-(pow(x, 2.0) / c2)); |
147 | total_weight += s->weights[i]; |
148 | } |
149 | |
150 | adjust = 1.0 / total_weight; |
151 | for (i = 0; i < 21; i++) |
152 | s->weights[i] *= adjust; |
153 | } |
154 | |
155 | static double gaussian_filter(LoudNormContext *s, int index) |
156 | { |
157 | double result = 0.; |
158 | int i; |
159 | |
160 | index = index - 10 > 0 ? index - 10 : index + 20; |
161 | for (i = 0; i < 21; i++) |
162 | result += s->delta[((index + i) < 30) ? (index + i) : (index + i - 30)] * s->weights[i]; |
163 | |
164 | return result; |
165 | } |
166 | |
167 | static void detect_peak(LoudNormContext *s, int offset, int nb_samples, int channels, int *peak_delta, double *peak_value) |
168 | { |
169 | int n, c, i, index; |
170 | double ceiling; |
171 | double *buf; |
172 | |
173 | *peak_delta = -1; |
174 | buf = s->limiter_buf; |
175 | ceiling = s->target_tp; |
176 | |
177 | index = s->limiter_buf_index + (offset * channels) + (1920 * channels); |
178 | if (index >= s->limiter_buf_size) |
179 | index -= s->limiter_buf_size; |
180 | |
181 | if (s->frame_type == FIRST_FRAME) { |
182 | for (c = 0; c < channels; c++) |
183 | s->prev_smp[c] = fabs(buf[index + c - channels]); |
184 | } |
185 | |
186 | for (n = 0; n < nb_samples; n++) { |
187 | for (c = 0; c < channels; c++) { |
188 | double this, next, max_peak; |
189 | |
190 | this = fabs(buf[(index + c) < s->limiter_buf_size ? (index + c) : (index + c - s->limiter_buf_size)]); |
191 | next = fabs(buf[(index + c + channels) < s->limiter_buf_size ? (index + c + channels) : (index + c + channels - s->limiter_buf_size)]); |
192 | |
193 | if ((s->prev_smp[c] <= this) && (next <= this) && (this > ceiling) && (n > 0)) { |
194 | int detected; |
195 | |
196 | detected = 1; |
197 | for (i = 2; i < 12; i++) { |
198 | next = fabs(buf[(index + c + (i * channels)) < s->limiter_buf_size ? (index + c + (i * channels)) : (index + c + (i * channels) - s->limiter_buf_size)]); |
199 | if (next > this) { |
200 | detected = 0; |
201 | break; |
202 | } |
203 | } |
204 | |
205 | if (!detected) |
206 | continue; |
207 | |
208 | for (c = 0; c < channels; c++) { |
209 | if (c == 0 || fabs(buf[index + c]) > max_peak) |
210 | max_peak = fabs(buf[index + c]); |
211 | |
212 | s->prev_smp[c] = fabs(buf[(index + c) < s->limiter_buf_size ? (index + c) : (index + c - s->limiter_buf_size)]); |
213 | } |
214 | |
215 | *peak_delta = n; |
216 | s->peak_index = index; |
217 | *peak_value = max_peak; |
218 | return; |
219 | } |
220 | |
221 | s->prev_smp[c] = this; |
222 | } |
223 | |
224 | index += channels; |
225 | if (index >= s->limiter_buf_size) |
226 | index -= s->limiter_buf_size; |
227 | } |
228 | } |
229 | |
230 | static void true_peak_limiter(LoudNormContext *s, double *out, int nb_samples, int channels) |
231 | { |
232 | int n, c, index, peak_delta, smp_cnt; |
233 | double ceiling, peak_value; |
234 | double *buf; |
235 | |
236 | buf = s->limiter_buf; |
237 | ceiling = s->target_tp; |
238 | index = s->limiter_buf_index; |
239 | smp_cnt = 0; |
240 | |
241 | if (s->frame_type == FIRST_FRAME) { |
242 | double max; |
243 | |
244 | max = 0.; |
245 | for (n = 0; n < 1920; n++) { |
246 | for (c = 0; c < channels; c++) { |
247 | max = fabs(buf[c]) > max ? fabs(buf[c]) : max; |
248 | } |
249 | buf += channels; |
250 | } |
251 | |
252 | if (max > ceiling) { |
253 | s->gain_reduction[1] = ceiling / max; |
254 | s->limiter_state = SUSTAIN; |
255 | buf = s->limiter_buf; |
256 | |
257 | for (n = 0; n < 1920; n++) { |
258 | for (c = 0; c < channels; c++) { |
259 | double env; |
260 | env = s->gain_reduction[1]; |
261 | buf[c] *= env; |
262 | } |
263 | buf += channels; |
264 | } |
265 | } |
266 | |
267 | buf = s->limiter_buf; |
268 | } |
269 | |
270 | do { |
271 | |
272 | switch(s->limiter_state) { |
273 | case OUT: |
274 | detect_peak(s, smp_cnt, nb_samples - smp_cnt, channels, &peak_delta, &peak_value); |
275 | if (peak_delta != -1) { |
276 | s->env_cnt = 0; |
277 | smp_cnt += (peak_delta - s->attack_length); |
278 | s->gain_reduction[0] = 1.; |
279 | s->gain_reduction[1] = ceiling / peak_value; |
280 | s->limiter_state = ATTACK; |
281 | |
282 | s->env_index = s->peak_index - (s->attack_length * channels); |
283 | if (s->env_index < 0) |
284 | s->env_index += s->limiter_buf_size; |
285 | |
286 | s->env_index += (s->env_cnt * channels); |
287 | if (s->env_index > s->limiter_buf_size) |
288 | s->env_index -= s->limiter_buf_size; |
289 | |
290 | } else { |
291 | smp_cnt = nb_samples; |
292 | } |
293 | break; |
294 | |
295 | case ATTACK: |
296 | for (; s->env_cnt < s->attack_length; s->env_cnt++) { |
297 | for (c = 0; c < channels; c++) { |
298 | double env; |
299 | env = s->gain_reduction[0] - ((double) s->env_cnt / (s->attack_length - 1) * (s->gain_reduction[0] - s->gain_reduction[1])); |
300 | buf[s->env_index + c] *= env; |
301 | } |
302 | |
303 | s->env_index += channels; |
304 | if (s->env_index >= s->limiter_buf_size) |
305 | s->env_index -= s->limiter_buf_size; |
306 | |
307 | smp_cnt++; |
308 | if (smp_cnt >= nb_samples) { |
309 | s->env_cnt++; |
310 | break; |
311 | } |
312 | } |
313 | |
314 | if (smp_cnt < nb_samples) { |
315 | s->env_cnt = 0; |
316 | s->attack_length = 1920; |
317 | s->limiter_state = SUSTAIN; |
318 | } |
319 | break; |
320 | |
321 | case SUSTAIN: |
322 | detect_peak(s, smp_cnt, nb_samples, channels, &peak_delta, &peak_value); |
323 | if (peak_delta == -1) { |
324 | s->limiter_state = RELEASE; |
325 | s->gain_reduction[0] = s->gain_reduction[1]; |
326 | s->gain_reduction[1] = 1.; |
327 | s->env_cnt = 0; |
328 | break; |
329 | } else { |
330 | double gain_reduction; |
331 | gain_reduction = ceiling / peak_value; |
332 | |
333 | if (gain_reduction < s->gain_reduction[1]) { |
334 | s->limiter_state = ATTACK; |
335 | |
336 | s->attack_length = peak_delta; |
337 | if (s->attack_length <= 1) |
338 | s->attack_length = 2; |
339 | |
340 | s->gain_reduction[0] = s->gain_reduction[1]; |
341 | s->gain_reduction[1] = gain_reduction; |
342 | s->env_cnt = 0; |
343 | break; |
344 | } |
345 | |
346 | for (s->env_cnt = 0; s->env_cnt < peak_delta; s->env_cnt++) { |
347 | for (c = 0; c < channels; c++) { |
348 | double env; |
349 | env = s->gain_reduction[1]; |
350 | buf[s->env_index + c] *= env; |
351 | } |
352 | |
353 | s->env_index += channels; |
354 | if (s->env_index >= s->limiter_buf_size) |
355 | s->env_index -= s->limiter_buf_size; |
356 | |
357 | smp_cnt++; |
358 | if (smp_cnt >= nb_samples) { |
359 | s->env_cnt++; |
360 | break; |
361 | } |
362 | } |
363 | } |
364 | break; |
365 | |
366 | case RELEASE: |
367 | for (; s->env_cnt < s->release_length; s->env_cnt++) { |
368 | for (c = 0; c < channels; c++) { |
369 | double env; |
370 | env = s->gain_reduction[0] + (((double) s->env_cnt / (s->release_length - 1)) * (s->gain_reduction[1] - s->gain_reduction[0])); |
371 | buf[s->env_index + c] *= env; |
372 | } |
373 | |
374 | s->env_index += channels; |
375 | if (s->env_index >= s->limiter_buf_size) |
376 | s->env_index -= s->limiter_buf_size; |
377 | |
378 | smp_cnt++; |
379 | if (smp_cnt >= nb_samples) { |
380 | s->env_cnt++; |
381 | break; |
382 | } |
383 | } |
384 | |
385 | if (smp_cnt < nb_samples) { |
386 | s->env_cnt = 0; |
387 | s->limiter_state = OUT; |
388 | } |
389 | |
390 | break; |
391 | } |
392 | |
393 | } while (smp_cnt < nb_samples); |
394 | |
395 | for (n = 0; n < nb_samples; n++) { |
396 | for (c = 0; c < channels; c++) { |
397 | out[c] = buf[index + c]; |
398 | if (fabs(out[c]) > ceiling) { |
399 | out[c] = ceiling * (out[c] < 0 ? -1 : 1); |
400 | } |
401 | } |
402 | out += channels; |
403 | index += channels; |
404 | if (index >= s->limiter_buf_size) |
405 | index -= s->limiter_buf_size; |
406 | } |
407 | } |
408 | |
409 | static int filter_frame(AVFilterLink *inlink, AVFrame *in) |
410 | { |
411 | AVFilterContext *ctx = inlink->dst; |
412 | LoudNormContext *s = ctx->priv; |
413 | AVFilterLink *outlink = ctx->outputs[0]; |
414 | AVFrame *out; |
415 | const double *src; |
416 | double *dst; |
417 | double *buf; |
418 | double *limiter_buf; |
419 | int i, n, c, subframe_length, src_index; |
420 | double gain, gain_next, env_global, env_shortterm, |
421 | global, shortterm, lra, relative_threshold; |
422 | |
423 | if (av_frame_is_writable(in)) { |
424 | out = in; |
425 | } else { |
426 | out = ff_get_audio_buffer(inlink, in->nb_samples); |
427 | if (!out) { |
428 | av_frame_free(&in); |
429 | return AVERROR(ENOMEM); |
430 | } |
431 | av_frame_copy_props(out, in); |
432 | } |
433 | |
434 | out->pts = s->pts; |
435 | src = (const double *)in->data[0]; |
436 | dst = (double *)out->data[0]; |
437 | buf = s->buf; |
438 | limiter_buf = s->limiter_buf; |
439 | |
440 | ff_ebur128_add_frames_double(s->r128_in, src, in->nb_samples); |
441 | |
442 | if (s->frame_type == FIRST_FRAME && in->nb_samples < frame_size(inlink->sample_rate, 3000)) { |
443 | double offset, offset_tp, true_peak; |
444 | |
445 | ff_ebur128_loudness_global(s->r128_in, &global); |
446 | for (c = 0; c < inlink->channels; c++) { |
447 | double tmp; |
448 | ff_ebur128_sample_peak(s->r128_in, c, &tmp); |
449 | if (c == 0 || tmp > true_peak) |
450 | true_peak = tmp; |
451 | } |
452 | |
453 | offset = s->target_i - global; |
454 | offset_tp = true_peak + offset; |
455 | s->offset = offset_tp < s->target_tp ? offset : s->target_tp - true_peak; |
456 | s->offset = pow(10., s->offset / 20.); |
457 | s->frame_type = LINEAR_MODE; |
458 | } |
459 | |
460 | switch (s->frame_type) { |
461 | case FIRST_FRAME: |
462 | for (n = 0; n < in->nb_samples; n++) { |
463 | for (c = 0; c < inlink->channels; c++) { |
464 | buf[s->buf_index + c] = src[c]; |
465 | } |
466 | src += inlink->channels; |
467 | s->buf_index += inlink->channels; |
468 | } |
469 | |
470 | ff_ebur128_loudness_shortterm(s->r128_in, &shortterm); |
471 | |
472 | if (shortterm < s->measured_thresh) { |
473 | s->above_threshold = 0; |
474 | env_shortterm = shortterm <= -70. ? 0. : s->target_i - s->measured_i; |
475 | } else { |
476 | s->above_threshold = 1; |
477 | env_shortterm = shortterm <= -70. ? 0. : s->target_i - shortterm; |
478 | } |
479 | |
480 | for (n = 0; n < 30; n++) |
481 | s->delta[n] = pow(10., env_shortterm / 20.); |
482 | s->prev_delta = s->delta[s->index]; |
483 | |
484 | s->buf_index = |
485 | s->limiter_buf_index = 0; |
486 | |
487 | for (n = 0; n < (s->limiter_buf_size / inlink->channels); n++) { |
488 | for (c = 0; c < inlink->channels; c++) { |
489 | limiter_buf[s->limiter_buf_index + c] = buf[s->buf_index + c] * s->delta[s->index] * s->offset; |
490 | } |
491 | s->limiter_buf_index += inlink->channels; |
492 | if (s->limiter_buf_index >= s->limiter_buf_size) |
493 | s->limiter_buf_index -= s->limiter_buf_size; |
494 | |
495 | s->buf_index += inlink->channels; |
496 | } |
497 | |
498 | subframe_length = frame_size(inlink->sample_rate, 100); |
499 | true_peak_limiter(s, dst, subframe_length, inlink->channels); |
500 | ff_ebur128_add_frames_double(s->r128_out, dst, subframe_length); |
501 | |
502 | s->pts += |
503 | out->nb_samples = |
504 | inlink->min_samples = |
505 | inlink->max_samples = |
506 | inlink->partial_buf_size = subframe_length; |
507 | |
508 | s->frame_type = INNER_FRAME; |
509 | break; |
510 | |
511 | case INNER_FRAME: |
512 | gain = gaussian_filter(s, s->index + 10 < 30 ? s->index + 10 : s->index + 10 - 30); |
513 | gain_next = gaussian_filter(s, s->index + 11 < 30 ? s->index + 11 : s->index + 11 - 30); |
514 | |
515 | for (n = 0; n < in->nb_samples; n++) { |
516 | for (c = 0; c < inlink->channels; c++) { |
517 | buf[s->prev_buf_index + c] = src[c]; |
518 | limiter_buf[s->limiter_buf_index + c] = buf[s->buf_index + c] * (gain + (((double) n / in->nb_samples) * (gain_next - gain))) * s->offset; |
519 | } |
520 | src += inlink->channels; |
521 | |
522 | s->limiter_buf_index += inlink->channels; |
523 | if (s->limiter_buf_index >= s->limiter_buf_size) |
524 | s->limiter_buf_index -= s->limiter_buf_size; |
525 | |
526 | s->prev_buf_index += inlink->channels; |
527 | if (s->prev_buf_index >= s->buf_size) |
528 | s->prev_buf_index -= s->buf_size; |
529 | |
530 | s->buf_index += inlink->channels; |
531 | if (s->buf_index >= s->buf_size) |
532 | s->buf_index -= s->buf_size; |
533 | } |
534 | |
535 | subframe_length = (frame_size(inlink->sample_rate, 100) - in->nb_samples) * inlink->channels; |
536 | s->limiter_buf_index = s->limiter_buf_index + subframe_length < s->limiter_buf_size ? s->limiter_buf_index + subframe_length : s->limiter_buf_index + subframe_length - s->limiter_buf_size; |
537 | |
538 | true_peak_limiter(s, dst, in->nb_samples, inlink->channels); |
539 | ff_ebur128_add_frames_double(s->r128_out, dst, in->nb_samples); |
540 | |
541 | ff_ebur128_loudness_range(s->r128_in, &lra); |
542 | ff_ebur128_loudness_global(s->r128_in, &global); |
543 | ff_ebur128_loudness_shortterm(s->r128_in, &shortterm); |
544 | ff_ebur128_relative_threshold(s->r128_in, &relative_threshold); |
545 | |
546 | if (s->above_threshold == 0) { |
547 | double shortterm_out; |
548 | |
549 | if (shortterm > s->measured_thresh) |
550 | s->prev_delta *= 1.0058; |
551 | |
552 | ff_ebur128_loudness_shortterm(s->r128_out, &shortterm_out); |
553 | if (shortterm_out >= s->target_i) |
554 | s->above_threshold = 1; |
555 | } |
556 | |
557 | if (shortterm < relative_threshold || shortterm <= -70. || s->above_threshold == 0) { |
558 | s->delta[s->index] = s->prev_delta; |
559 | } else { |
560 | env_global = fabs(shortterm - global) < (s->target_lra / 2.) ? shortterm - global : (s->target_lra / 2.) * ((shortterm - global) < 0 ? -1 : 1); |
561 | env_shortterm = s->target_i - shortterm; |
562 | s->delta[s->index] = pow(10., (env_global + env_shortterm) / 20.); |
563 | } |
564 | |
565 | s->prev_delta = s->delta[s->index]; |
566 | s->index++; |
567 | if (s->index >= 30) |
568 | s->index -= 30; |
569 | s->prev_nb_samples = in->nb_samples; |
570 | s->pts += in->nb_samples; |
571 | break; |
572 | |
573 | case FINAL_FRAME: |
574 | gain = gaussian_filter(s, s->index + 10 < 30 ? s->index + 10 : s->index + 10 - 30); |
575 | s->limiter_buf_index = 0; |
576 | src_index = 0; |
577 | |
578 | for (n = 0; n < s->limiter_buf_size / inlink->channels; n++) { |
579 | for (c = 0; c < inlink->channels; c++) { |
580 | s->limiter_buf[s->limiter_buf_index + c] = src[src_index + c] * gain * s->offset; |
581 | } |
582 | src_index += inlink->channels; |
583 | |
584 | s->limiter_buf_index += inlink->channels; |
585 | if (s->limiter_buf_index >= s->limiter_buf_size) |
586 | s->limiter_buf_index -= s->limiter_buf_size; |
587 | } |
588 | |
589 | subframe_length = frame_size(inlink->sample_rate, 100); |
590 | for (i = 0; i < in->nb_samples / subframe_length; i++) { |
591 | true_peak_limiter(s, dst, subframe_length, inlink->channels); |
592 | |
593 | for (n = 0; n < subframe_length; n++) { |
594 | for (c = 0; c < inlink->channels; c++) { |
595 | if (src_index < (in->nb_samples * inlink->channels)) { |
596 | limiter_buf[s->limiter_buf_index + c] = src[src_index + c] * gain * s->offset; |
597 | } else { |
598 | limiter_buf[s->limiter_buf_index + c] = 0.; |
599 | } |
600 | } |
601 | |
602 | if (src_index < (in->nb_samples * inlink->channels)) |
603 | src_index += inlink->channels; |
604 | |
605 | s->limiter_buf_index += inlink->channels; |
606 | if (s->limiter_buf_index >= s->limiter_buf_size) |
607 | s->limiter_buf_index -= s->limiter_buf_size; |
608 | } |
609 | |
610 | dst += (subframe_length * inlink->channels); |
611 | } |
612 | |
613 | dst = (double *)out->data[0]; |
614 | ff_ebur128_add_frames_double(s->r128_out, dst, in->nb_samples); |
615 | break; |
616 | |
617 | case LINEAR_MODE: |
618 | for (n = 0; n < in->nb_samples; n++) { |
619 | for (c = 0; c < inlink->channels; c++) { |
620 | dst[c] = src[c] * s->offset; |
621 | } |
622 | src += inlink->channels; |
623 | dst += inlink->channels; |
624 | } |
625 | |
626 | dst = (double *)out->data[0]; |
627 | ff_ebur128_add_frames_double(s->r128_out, dst, in->nb_samples); |
628 | s->pts += in->nb_samples; |
629 | break; |
630 | } |
631 | |
632 | if (in != out) |
633 | av_frame_free(&in); |
634 | |
635 | return ff_filter_frame(outlink, out); |
636 | } |
637 | |
638 | static int request_frame(AVFilterLink *outlink) |
639 | { |
640 | int ret; |
641 | AVFilterContext *ctx = outlink->src; |
642 | AVFilterLink *inlink = ctx->inputs[0]; |
643 | LoudNormContext *s = ctx->priv; |
644 | |
645 | ret = ff_request_frame(inlink); |
646 | if (ret == AVERROR_EOF && s->frame_type == INNER_FRAME) { |
647 | double *src; |
648 | double *buf; |
649 | int nb_samples, n, c, offset; |
650 | AVFrame *frame; |
651 | |
652 | nb_samples = (s->buf_size / inlink->channels) - s->prev_nb_samples; |
653 | nb_samples -= (frame_size(inlink->sample_rate, 100) - s->prev_nb_samples); |
654 | |
655 | frame = ff_get_audio_buffer(outlink, nb_samples); |
656 | if (!frame) |
657 | return AVERROR(ENOMEM); |
658 | frame->nb_samples = nb_samples; |
659 | |
660 | buf = s->buf; |
661 | src = (double *)frame->data[0]; |
662 | |
663 | offset = ((s->limiter_buf_size / inlink->channels) - s->prev_nb_samples) * inlink->channels; |
664 | offset -= (frame_size(inlink->sample_rate, 100) - s->prev_nb_samples) * inlink->channels; |
665 | s->buf_index = s->buf_index - offset < 0 ? s->buf_index - offset + s->buf_size : s->buf_index - offset; |
666 | |
667 | for (n = 0; n < nb_samples; n++) { |
668 | for (c = 0; c < inlink->channels; c++) { |
669 | src[c] = buf[s->buf_index + c]; |
670 | } |
671 | src += inlink->channels; |
672 | s->buf_index += inlink->channels; |
673 | if (s->buf_index >= s->buf_size) |
674 | s->buf_index -= s->buf_size; |
675 | } |
676 | |
677 | s->frame_type = FINAL_FRAME; |
678 | ret = filter_frame(inlink, frame); |
679 | } |
680 | return ret; |
681 | } |
682 | |
683 | static int query_formats(AVFilterContext *ctx) |
684 | { |
685 | AVFilterFormats *formats; |
686 | AVFilterChannelLayouts *layouts; |
687 | AVFilterLink *inlink = ctx->inputs[0]; |
688 | AVFilterLink *outlink = ctx->outputs[0]; |
689 | static const int input_srate[] = {192000, -1}; |
690 | static const enum AVSampleFormat sample_fmts[] = { |
691 | AV_SAMPLE_FMT_DBL, |
692 | AV_SAMPLE_FMT_NONE |
693 | }; |
694 | int ret; |
695 | |
696 | layouts = ff_all_channel_counts(); |
697 | if (!layouts) |
698 | return AVERROR(ENOMEM); |
699 | ret = ff_set_common_channel_layouts(ctx, layouts); |
700 | if (ret < 0) |
701 | return ret; |
702 | |
703 | formats = ff_make_format_list(sample_fmts); |
704 | if (!formats) |
705 | return AVERROR(ENOMEM); |
706 | ret = ff_set_common_formats(ctx, formats); |
707 | if (ret < 0) |
708 | return ret; |
709 | |
710 | formats = ff_make_format_list(input_srate); |
711 | if (!formats) |
712 | return AVERROR(ENOMEM); |
713 | ret = ff_formats_ref(formats, &inlink->out_samplerates); |
714 | if (ret < 0) |
715 | return ret; |
716 | ret = ff_formats_ref(formats, &outlink->in_samplerates); |
717 | if (ret < 0) |
718 | return ret; |
719 | |
720 | return 0; |
721 | } |
722 | |
723 | static int config_input(AVFilterLink *inlink) |
724 | { |
725 | AVFilterContext *ctx = inlink->dst; |
726 | LoudNormContext *s = ctx->priv; |
727 | |
728 | s->r128_in = ff_ebur128_init(inlink->channels, inlink->sample_rate, 0, FF_EBUR128_MODE_I | FF_EBUR128_MODE_S | FF_EBUR128_MODE_LRA | FF_EBUR128_MODE_SAMPLE_PEAK); |
729 | if (!s->r128_in) |
730 | return AVERROR(ENOMEM); |
731 | |
732 | s->r128_out = ff_ebur128_init(inlink->channels, inlink->sample_rate, 0, FF_EBUR128_MODE_I | FF_EBUR128_MODE_S | FF_EBUR128_MODE_LRA | FF_EBUR128_MODE_SAMPLE_PEAK); |
733 | if (!s->r128_out) |
734 | return AVERROR(ENOMEM); |
735 | |
736 | if (inlink->channels == 1 && s->dual_mono) { |
737 | ff_ebur128_set_channel(s->r128_in, 0, FF_EBUR128_DUAL_MONO); |
738 | ff_ebur128_set_channel(s->r128_out, 0, FF_EBUR128_DUAL_MONO); |
739 | } |
740 | |
741 | s->buf_size = frame_size(inlink->sample_rate, 3000) * inlink->channels; |
742 | s->buf = av_malloc_array(s->buf_size, sizeof(*s->buf)); |
743 | if (!s->buf) |
744 | return AVERROR(ENOMEM); |
745 | |
746 | s->limiter_buf_size = frame_size(inlink->sample_rate, 210) * inlink->channels; |
747 | s->limiter_buf = av_malloc_array(s->buf_size, sizeof(*s->limiter_buf)); |
748 | if (!s->limiter_buf) |
749 | return AVERROR(ENOMEM); |
750 | |
751 | s->prev_smp = av_malloc_array(inlink->channels, sizeof(*s->prev_smp)); |
752 | if (!s->prev_smp) |
753 | return AVERROR(ENOMEM); |
754 | |
755 | init_gaussian_filter(s); |
756 | |
757 | s->frame_type = FIRST_FRAME; |
758 | |
759 | if (s->linear) { |
760 | double offset, offset_tp; |
761 | offset = s->target_i - s->measured_i; |
762 | offset_tp = s->measured_tp + offset; |
763 | |
764 | if (s->measured_tp != 99 && s->measured_thresh != -70 && s->measured_lra != 0 && s->measured_i != 0) { |
765 | if ((offset_tp <= s->target_tp) && (s->measured_lra <= s->target_lra)) { |
766 | s->frame_type = LINEAR_MODE; |
767 | s->offset = offset; |
768 | } |
769 | } |
770 | } |
771 | |
772 | if (s->frame_type != LINEAR_MODE) { |
773 | inlink->min_samples = |
774 | inlink->max_samples = |
775 | inlink->partial_buf_size = frame_size(inlink->sample_rate, 3000); |
776 | } |
777 | |
778 | s->pts = |
779 | s->buf_index = |
780 | s->prev_buf_index = |
781 | s->limiter_buf_index = 0; |
782 | s->channels = inlink->channels; |
783 | s->index = 1; |
784 | s->limiter_state = OUT; |
785 | s->offset = pow(10., s->offset / 20.); |
786 | s->target_tp = pow(10., s->target_tp / 20.); |
787 | s->attack_length = frame_size(inlink->sample_rate, 10); |
788 | s->release_length = frame_size(inlink->sample_rate, 100); |
789 | |
790 | return 0; |
791 | } |
792 | |
793 | static av_cold void uninit(AVFilterContext *ctx) |
794 | { |
795 | LoudNormContext *s = ctx->priv; |
796 | double i_in, i_out, lra_in, lra_out, thresh_in, thresh_out, tp_in, tp_out; |
797 | int c; |
798 | |
799 | if (!s->r128_in || !s->r128_out) |
800 | goto end; |
801 | |
802 | ff_ebur128_loudness_range(s->r128_in, &lra_in); |
803 | ff_ebur128_loudness_global(s->r128_in, &i_in); |
804 | ff_ebur128_relative_threshold(s->r128_in, &thresh_in); |
805 | for (c = 0; c < s->channels; c++) { |
806 | double tmp; |
807 | ff_ebur128_sample_peak(s->r128_in, c, &tmp); |
808 | if ((c == 0) || (tmp > tp_in)) |
809 | tp_in = tmp; |
810 | } |
811 | |
812 | ff_ebur128_loudness_range(s->r128_out, &lra_out); |
813 | ff_ebur128_loudness_global(s->r128_out, &i_out); |
814 | ff_ebur128_relative_threshold(s->r128_out, &thresh_out); |
815 | for (c = 0; c < s->channels; c++) { |
816 | double tmp; |
817 | ff_ebur128_sample_peak(s->r128_out, c, &tmp); |
818 | if ((c == 0) || (tmp > tp_out)) |
819 | tp_out = tmp; |
820 | } |
821 | |
822 | switch(s->print_format) { |
823 | case NONE: |
824 | break; |
825 | |
826 | case JSON: |
827 | av_log(ctx, AV_LOG_INFO, |
828 | "\n{\n" |
829 | "\t\"input_i\" : \"%.2f\",\n" |
830 | "\t\"input_tp\" : \"%.2f\",\n" |
831 | "\t\"input_lra\" : \"%.2f\",\n" |
832 | "\t\"input_thresh\" : \"%.2f\",\n" |
833 | "\t\"output_i\" : \"%.2f\",\n" |
834 | "\t\"output_tp\" : \"%+.2f\",\n" |
835 | "\t\"output_lra\" : \"%.2f\",\n" |
836 | "\t\"output_thresh\" : \"%.2f\",\n" |
837 | "\t\"normalization_type\" : \"%s\",\n" |
838 | "\t\"target_offset\" : \"%.2f\"\n" |
839 | "}\n", |
840 | i_in, |
841 | 20. * log10(tp_in), |
842 | lra_in, |
843 | thresh_in, |
844 | i_out, |
845 | 20. * log10(tp_out), |
846 | lra_out, |
847 | thresh_out, |
848 | s->frame_type == LINEAR_MODE ? "linear" : "dynamic", |
849 | s->target_i - i_out |
850 | ); |
851 | break; |
852 | |
853 | case SUMMARY: |
854 | av_log(ctx, AV_LOG_INFO, |
855 | "\n" |
856 | "Input Integrated: %+6.1f LUFS\n" |
857 | "Input True Peak: %+6.1f dBTP\n" |
858 | "Input LRA: %6.1f LU\n" |
859 | "Input Threshold: %+6.1f LUFS\n" |
860 | "\n" |
861 | "Output Integrated: %+6.1f LUFS\n" |
862 | "Output True Peak: %+6.1f dBTP\n" |
863 | "Output LRA: %6.1f LU\n" |
864 | "Output Threshold: %+6.1f LUFS\n" |
865 | "\n" |
866 | "Normalization Type: %s\n" |
867 | "Target Offset: %+6.1f LU\n", |
868 | i_in, |
869 | 20. * log10(tp_in), |
870 | lra_in, |
871 | thresh_in, |
872 | i_out, |
873 | 20. * log10(tp_out), |
874 | lra_out, |
875 | thresh_out, |
876 | s->frame_type == LINEAR_MODE ? "Linear" : "Dynamic", |
877 | s->target_i - i_out |
878 | ); |
879 | break; |
880 | } |
881 | |
882 | end: |
883 | if (s->r128_in) |
884 | ff_ebur128_destroy(&s->r128_in); |
885 | if (s->r128_out) |
886 | ff_ebur128_destroy(&s->r128_out); |
887 | av_freep(&s->limiter_buf); |
888 | av_freep(&s->prev_smp); |
889 | av_freep(&s->buf); |
890 | } |
891 | |
892 | static const AVFilterPad avfilter_af_loudnorm_inputs[] = { |
893 | { |
894 | .name = "default", |
895 | .type = AVMEDIA_TYPE_AUDIO, |
896 | .config_props = config_input, |
897 | .filter_frame = filter_frame, |
898 | }, |
899 | { NULL } |
900 | }; |
901 | |
902 | static const AVFilterPad avfilter_af_loudnorm_outputs[] = { |
903 | { |
904 | .name = "default", |
905 | .request_frame = request_frame, |
906 | .type = AVMEDIA_TYPE_AUDIO, |
907 | }, |
908 | { NULL } |
909 | }; |
910 | |
911 | AVFilter ff_af_loudnorm = { |
912 | .name = "loudnorm", |
913 | .description = NULL_IF_CONFIG_SMALL("EBU R128 loudness normalization"), |
914 | .priv_size = sizeof(LoudNormContext), |
915 | .priv_class = &loudnorm_class, |
916 | .query_formats = query_formats, |
917 | .uninit = uninit, |
918 | .inputs = avfilter_af_loudnorm_inputs, |
919 | .outputs = avfilter_af_loudnorm_outputs, |
920 | }; |
921 |