blob: f156d1883d5d7a010ad0bb845e92a5062fb3f300
1 | /* |
2 | * Copyright (c) 2001 Heikki Leinonen |
3 | * Copyright (c) 2001 Chris Bagwell |
4 | * Copyright (c) 2003 Donnie Smith |
5 | * Copyright (c) 2014 Paul B Mahol |
6 | * |
7 | * This file is part of FFmpeg. |
8 | * |
9 | * FFmpeg is free software; you can redistribute it and/or |
10 | * modify it under the terms of the GNU Lesser General Public |
11 | * License as published by the Free Software Foundation; either |
12 | * version 2.1 of the License, or (at your option) any later version. |
13 | * |
14 | * FFmpeg is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
17 | * Lesser General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU Lesser General Public |
20 | * License along with FFmpeg; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | */ |
23 | |
24 | #include <float.h> /* DBL_MAX */ |
25 | |
26 | #include "libavutil/opt.h" |
27 | #include "libavutil/timestamp.h" |
28 | #include "audio.h" |
29 | #include "formats.h" |
30 | #include "avfilter.h" |
31 | #include "internal.h" |
32 | |
/**
 * States of the per-frame processing state machine driven by filter_frame().
 */
enum SilenceMode {
    SILENCE_TRIM       = 0, /* scanning input, parking candidate samples in start_holdoff */
    SILENCE_TRIM_FLUSH = 1, /* emitting the samples parked in start_holdoff */
    SILENCE_COPY       = 2, /* copying input to output, parking quiet samples in stop_holdoff */
    SILENCE_COPY_FLUSH = 3, /* emitting the samples parked in stop_holdoff */
    SILENCE_STOP       = 4  /* terminal state: all further input is discarded */
};
40 | |
41 | typedef struct SilenceRemoveContext { |
42 | const AVClass *class; |
43 | |
44 | enum SilenceMode mode; |
45 | |
46 | int start_periods; |
47 | int64_t start_duration; |
48 | double start_threshold; |
49 | |
50 | int stop_periods; |
51 | int64_t stop_duration; |
52 | double stop_threshold; |
53 | |
54 | double *start_holdoff; |
55 | size_t start_holdoff_offset; |
56 | size_t start_holdoff_end; |
57 | int start_found_periods; |
58 | |
59 | double *stop_holdoff; |
60 | size_t stop_holdoff_offset; |
61 | size_t stop_holdoff_end; |
62 | int stop_found_periods; |
63 | |
64 | double window_ratio; |
65 | double *window; |
66 | double *window_current; |
67 | double *window_end; |
68 | int window_size; |
69 | double sum; |
70 | |
71 | int leave_silence; |
72 | int restart; |
73 | int64_t next_pts; |
74 | |
75 | int detection; |
76 | void (*update)(struct SilenceRemoveContext *s, double sample); |
77 | double(*compute)(struct SilenceRemoveContext *s, double sample); |
78 | } SilenceRemoveContext; |
79 | |
80 | #define OFFSET(x) offsetof(SilenceRemoveContext, x) |
81 | #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_AUDIO_PARAM |
82 | static const AVOption silenceremove_options[] = { |
83 | { "start_periods", NULL, OFFSET(start_periods), AV_OPT_TYPE_INT, {.i64=0}, 0, 9000, FLAGS }, |
84 | { "start_duration", NULL, OFFSET(start_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS }, |
85 | { "start_threshold", NULL, OFFSET(start_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS }, |
86 | { "stop_periods", NULL, OFFSET(stop_periods), AV_OPT_TYPE_INT, {.i64=0}, -9000, 9000, FLAGS }, |
87 | { "stop_duration", NULL, OFFSET(stop_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS }, |
88 | { "stop_threshold", NULL, OFFSET(stop_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS }, |
89 | { "leave_silence", NULL, OFFSET(leave_silence), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS }, |
90 | { "detection", NULL, OFFSET(detection), AV_OPT_TYPE_INT, {.i64=1}, 0, 1, FLAGS, "detection" }, |
91 | { "peak", 0, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "detection" }, |
92 | { "rms", 0, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "detection" }, |
93 | { "window", NULL, OFFSET(window_ratio), AV_OPT_TYPE_DOUBLE, {.dbl=0.02}, 0, 10, FLAGS }, |
94 | { NULL } |
95 | }; |
96 | |
97 | AVFILTER_DEFINE_CLASS(silenceremove); |
98 | |
99 | static double compute_peak(SilenceRemoveContext *s, double sample) |
100 | { |
101 | double new_sum; |
102 | |
103 | new_sum = s->sum; |
104 | new_sum -= *s->window_current; |
105 | new_sum += fabs(sample); |
106 | |
107 | return new_sum / s->window_size; |
108 | } |
109 | |
110 | static void update_peak(SilenceRemoveContext *s, double sample) |
111 | { |
112 | s->sum -= *s->window_current; |
113 | *s->window_current = fabs(sample); |
114 | s->sum += *s->window_current; |
115 | |
116 | s->window_current++; |
117 | if (s->window_current >= s->window_end) |
118 | s->window_current = s->window; |
119 | } |
120 | |
121 | static double compute_rms(SilenceRemoveContext *s, double sample) |
122 | { |
123 | double new_sum; |
124 | |
125 | new_sum = s->sum; |
126 | new_sum -= *s->window_current; |
127 | new_sum += sample * sample; |
128 | |
129 | return sqrt(new_sum / s->window_size); |
130 | } |
131 | |
132 | static void update_rms(SilenceRemoveContext *s, double sample) |
133 | { |
134 | s->sum -= *s->window_current; |
135 | *s->window_current = sample * sample; |
136 | s->sum += *s->window_current; |
137 | |
138 | s->window_current++; |
139 | if (s->window_current >= s->window_end) |
140 | s->window_current = s->window; |
141 | } |
142 | |
143 | static av_cold int init(AVFilterContext *ctx) |
144 | { |
145 | SilenceRemoveContext *s = ctx->priv; |
146 | |
147 | if (s->stop_periods < 0) { |
148 | s->stop_periods = -s->stop_periods; |
149 | s->restart = 1; |
150 | } |
151 | |
152 | switch (s->detection) { |
153 | case 0: |
154 | s->update = update_peak; |
155 | s->compute = compute_peak; |
156 | break; |
157 | case 1: |
158 | s->update = update_rms; |
159 | s->compute = compute_rms; |
160 | break; |
161 | }; |
162 | |
163 | return 0; |
164 | } |
165 | |
166 | static void clear_window(SilenceRemoveContext *s) |
167 | { |
168 | memset(s->window, 0, s->window_size * sizeof(*s->window)); |
169 | |
170 | s->window_current = s->window; |
171 | s->window_end = s->window + s->window_size; |
172 | s->sum = 0; |
173 | } |
174 | |
175 | static int config_input(AVFilterLink *inlink) |
176 | { |
177 | AVFilterContext *ctx = inlink->dst; |
178 | SilenceRemoveContext *s = ctx->priv; |
179 | |
180 | s->window_size = FFMAX((inlink->sample_rate * s->window_ratio), 1) * inlink->channels; |
181 | s->window = av_malloc_array(s->window_size, sizeof(*s->window)); |
182 | if (!s->window) |
183 | return AVERROR(ENOMEM); |
184 | |
185 | clear_window(s); |
186 | |
187 | s->start_duration = av_rescale(s->start_duration, inlink->sample_rate, |
188 | AV_TIME_BASE); |
189 | s->stop_duration = av_rescale(s->stop_duration, inlink->sample_rate, |
190 | AV_TIME_BASE); |
191 | |
192 | s->start_holdoff = av_malloc_array(FFMAX(s->start_duration, 1), |
193 | sizeof(*s->start_holdoff) * |
194 | inlink->channels); |
195 | if (!s->start_holdoff) |
196 | return AVERROR(ENOMEM); |
197 | |
198 | s->start_holdoff_offset = 0; |
199 | s->start_holdoff_end = 0; |
200 | s->start_found_periods = 0; |
201 | |
202 | s->stop_holdoff = av_malloc_array(FFMAX(s->stop_duration, 1), |
203 | sizeof(*s->stop_holdoff) * |
204 | inlink->channels); |
205 | if (!s->stop_holdoff) |
206 | return AVERROR(ENOMEM); |
207 | |
208 | s->stop_holdoff_offset = 0; |
209 | s->stop_holdoff_end = 0; |
210 | s->stop_found_periods = 0; |
211 | |
212 | if (s->start_periods) |
213 | s->mode = SILENCE_TRIM; |
214 | else |
215 | s->mode = SILENCE_COPY; |
216 | |
217 | return 0; |
218 | } |
219 | |
220 | static void flush(AVFrame *out, AVFilterLink *outlink, |
221 | int *nb_samples_written, int *ret) |
222 | { |
223 | if (*nb_samples_written) { |
224 | out->nb_samples = *nb_samples_written / outlink->channels; |
225 | *ret = ff_filter_frame(outlink, out); |
226 | *nb_samples_written = 0; |
227 | } else { |
228 | av_frame_free(&out); |
229 | } |
230 | } |
231 | |
232 | static int filter_frame(AVFilterLink *inlink, AVFrame *in) |
233 | { |
234 | AVFilterContext *ctx = inlink->dst; |
235 | AVFilterLink *outlink = ctx->outputs[0]; |
236 | SilenceRemoveContext *s = ctx->priv; |
237 | int i, j, threshold, ret = 0; |
238 | int nbs, nb_samples_read, nb_samples_written; |
239 | double *obuf, *ibuf = (double *)in->data[0]; |
240 | AVFrame *out; |
241 | |
242 | nb_samples_read = nb_samples_written = 0; |
243 | |
244 | switch (s->mode) { |
245 | case SILENCE_TRIM: |
246 | silence_trim: |
247 | nbs = in->nb_samples - nb_samples_read / inlink->channels; |
248 | if (!nbs) |
249 | break; |
250 | |
251 | for (i = 0; i < nbs; i++) { |
252 | threshold = 0; |
253 | for (j = 0; j < inlink->channels; j++) { |
254 | threshold |= s->compute(s, ibuf[j]) > s->start_threshold; |
255 | } |
256 | |
257 | if (threshold) { |
258 | for (j = 0; j < inlink->channels; j++) { |
259 | s->update(s, *ibuf); |
260 | s->start_holdoff[s->start_holdoff_end++] = *ibuf++; |
261 | } |
262 | nb_samples_read += inlink->channels; |
263 | |
264 | if (s->start_holdoff_end >= s->start_duration * inlink->channels) { |
265 | if (++s->start_found_periods >= s->start_periods) { |
266 | s->mode = SILENCE_TRIM_FLUSH; |
267 | goto silence_trim_flush; |
268 | } |
269 | |
270 | s->start_holdoff_offset = 0; |
271 | s->start_holdoff_end = 0; |
272 | } |
273 | } else { |
274 | s->start_holdoff_end = 0; |
275 | |
276 | for (j = 0; j < inlink->channels; j++) |
277 | s->update(s, ibuf[j]); |
278 | |
279 | ibuf += inlink->channels; |
280 | nb_samples_read += inlink->channels; |
281 | } |
282 | } |
283 | break; |
284 | |
285 | case SILENCE_TRIM_FLUSH: |
286 | silence_trim_flush: |
287 | nbs = s->start_holdoff_end - s->start_holdoff_offset; |
288 | nbs -= nbs % inlink->channels; |
289 | if (!nbs) |
290 | break; |
291 | |
292 | out = ff_get_audio_buffer(inlink, nbs / inlink->channels); |
293 | if (!out) { |
294 | av_frame_free(&in); |
295 | return AVERROR(ENOMEM); |
296 | } |
297 | |
298 | memcpy(out->data[0], &s->start_holdoff[s->start_holdoff_offset], |
299 | nbs * sizeof(double)); |
300 | s->start_holdoff_offset += nbs; |
301 | |
302 | ret = ff_filter_frame(outlink, out); |
303 | |
304 | if (s->start_holdoff_offset == s->start_holdoff_end) { |
305 | s->start_holdoff_offset = 0; |
306 | s->start_holdoff_end = 0; |
307 | s->mode = SILENCE_COPY; |
308 | goto silence_copy; |
309 | } |
310 | break; |
311 | |
312 | case SILENCE_COPY: |
313 | silence_copy: |
314 | nbs = in->nb_samples - nb_samples_read / inlink->channels; |
315 | if (!nbs) |
316 | break; |
317 | |
318 | out = ff_get_audio_buffer(inlink, nbs); |
319 | if (!out) { |
320 | av_frame_free(&in); |
321 | return AVERROR(ENOMEM); |
322 | } |
323 | obuf = (double *)out->data[0]; |
324 | |
325 | if (s->stop_periods) { |
326 | for (i = 0; i < nbs; i++) { |
327 | threshold = 1; |
328 | for (j = 0; j < inlink->channels; j++) |
329 | threshold &= s->compute(s, ibuf[j]) > s->stop_threshold; |
330 | |
331 | if (threshold && s->stop_holdoff_end && !s->leave_silence) { |
332 | s->mode = SILENCE_COPY_FLUSH; |
333 | flush(out, outlink, &nb_samples_written, &ret); |
334 | goto silence_copy_flush; |
335 | } else if (threshold) { |
336 | for (j = 0; j < inlink->channels; j++) { |
337 | s->update(s, *ibuf); |
338 | *obuf++ = *ibuf++; |
339 | } |
340 | nb_samples_read += inlink->channels; |
341 | nb_samples_written += inlink->channels; |
342 | } else if (!threshold) { |
343 | for (j = 0; j < inlink->channels; j++) { |
344 | s->update(s, *ibuf); |
345 | if (s->leave_silence) { |
346 | *obuf++ = *ibuf; |
347 | nb_samples_written++; |
348 | } |
349 | |
350 | s->stop_holdoff[s->stop_holdoff_end++] = *ibuf++; |
351 | } |
352 | nb_samples_read += inlink->channels; |
353 | |
354 | if (s->stop_holdoff_end >= s->stop_duration * inlink->channels) { |
355 | if (++s->stop_found_periods >= s->stop_periods) { |
356 | s->stop_holdoff_offset = 0; |
357 | s->stop_holdoff_end = 0; |
358 | |
359 | if (!s->restart) { |
360 | s->mode = SILENCE_STOP; |
361 | flush(out, outlink, &nb_samples_written, &ret); |
362 | goto silence_stop; |
363 | } else { |
364 | s->stop_found_periods = 0; |
365 | s->start_found_periods = 0; |
366 | s->start_holdoff_offset = 0; |
367 | s->start_holdoff_end = 0; |
368 | clear_window(s); |
369 | s->mode = SILENCE_TRIM; |
370 | flush(out, outlink, &nb_samples_written, &ret); |
371 | goto silence_trim; |
372 | } |
373 | } |
374 | s->mode = SILENCE_COPY_FLUSH; |
375 | flush(out, outlink, &nb_samples_written, &ret); |
376 | goto silence_copy_flush; |
377 | } |
378 | } |
379 | } |
380 | flush(out, outlink, &nb_samples_written, &ret); |
381 | } else { |
382 | memcpy(obuf, ibuf, sizeof(double) * nbs * inlink->channels); |
383 | ret = ff_filter_frame(outlink, out); |
384 | } |
385 | break; |
386 | |
387 | case SILENCE_COPY_FLUSH: |
388 | silence_copy_flush: |
389 | nbs = s->stop_holdoff_end - s->stop_holdoff_offset; |
390 | nbs -= nbs % inlink->channels; |
391 | if (!nbs) |
392 | break; |
393 | |
394 | out = ff_get_audio_buffer(inlink, nbs / inlink->channels); |
395 | if (!out) { |
396 | av_frame_free(&in); |
397 | return AVERROR(ENOMEM); |
398 | } |
399 | |
400 | memcpy(out->data[0], &s->stop_holdoff[s->stop_holdoff_offset], |
401 | nbs * sizeof(double)); |
402 | s->stop_holdoff_offset += nbs; |
403 | |
404 | ret = ff_filter_frame(outlink, out); |
405 | |
406 | if (s->stop_holdoff_offset == s->stop_holdoff_end) { |
407 | s->stop_holdoff_offset = 0; |
408 | s->stop_holdoff_end = 0; |
409 | s->mode = SILENCE_COPY; |
410 | goto silence_copy; |
411 | } |
412 | break; |
413 | case SILENCE_STOP: |
414 | silence_stop: |
415 | break; |
416 | } |
417 | |
418 | av_frame_free(&in); |
419 | |
420 | return ret; |
421 | } |
422 | |
423 | static int request_frame(AVFilterLink *outlink) |
424 | { |
425 | AVFilterContext *ctx = outlink->src; |
426 | SilenceRemoveContext *s = ctx->priv; |
427 | int ret; |
428 | |
429 | ret = ff_request_frame(ctx->inputs[0]); |
430 | if (ret == AVERROR_EOF && (s->mode == SILENCE_COPY_FLUSH || |
431 | s->mode == SILENCE_COPY)) { |
432 | int nbs = s->stop_holdoff_end - s->stop_holdoff_offset; |
433 | if (nbs) { |
434 | AVFrame *frame; |
435 | |
436 | frame = ff_get_audio_buffer(outlink, nbs / outlink->channels); |
437 | if (!frame) |
438 | return AVERROR(ENOMEM); |
439 | |
440 | memcpy(frame->data[0], &s->stop_holdoff[s->stop_holdoff_offset], |
441 | nbs * sizeof(double)); |
442 | ret = ff_filter_frame(ctx->inputs[0], frame); |
443 | } |
444 | s->mode = SILENCE_STOP; |
445 | } |
446 | return ret; |
447 | } |
448 | |
449 | static int query_formats(AVFilterContext *ctx) |
450 | { |
451 | AVFilterFormats *formats = NULL; |
452 | AVFilterChannelLayouts *layouts = NULL; |
453 | static const enum AVSampleFormat sample_fmts[] = { |
454 | AV_SAMPLE_FMT_DBL, AV_SAMPLE_FMT_NONE |
455 | }; |
456 | int ret; |
457 | |
458 | layouts = ff_all_channel_counts(); |
459 | if (!layouts) |
460 | return AVERROR(ENOMEM); |
461 | ret = ff_set_common_channel_layouts(ctx, layouts); |
462 | if (ret < 0) |
463 | return ret; |
464 | |
465 | formats = ff_make_format_list(sample_fmts); |
466 | if (!formats) |
467 | return AVERROR(ENOMEM); |
468 | ret = ff_set_common_formats(ctx, formats); |
469 | if (ret < 0) |
470 | return ret; |
471 | |
472 | formats = ff_all_samplerates(); |
473 | if (!formats) |
474 | return AVERROR(ENOMEM); |
475 | return ff_set_common_samplerates(ctx, formats); |
476 | } |
477 | |
478 | static av_cold void uninit(AVFilterContext *ctx) |
479 | { |
480 | SilenceRemoveContext *s = ctx->priv; |
481 | |
482 | av_freep(&s->start_holdoff); |
483 | av_freep(&s->stop_holdoff); |
484 | av_freep(&s->window); |
485 | } |
486 | |
487 | static const AVFilterPad silenceremove_inputs[] = { |
488 | { |
489 | .name = "default", |
490 | .type = AVMEDIA_TYPE_AUDIO, |
491 | .config_props = config_input, |
492 | .filter_frame = filter_frame, |
493 | }, |
494 | { NULL } |
495 | }; |
496 | |
497 | static const AVFilterPad silenceremove_outputs[] = { |
498 | { |
499 | .name = "default", |
500 | .type = AVMEDIA_TYPE_AUDIO, |
501 | .request_frame = request_frame, |
502 | }, |
503 | { NULL } |
504 | }; |
505 | |
506 | AVFilter ff_af_silenceremove = { |
507 | .name = "silenceremove", |
508 | .description = NULL_IF_CONFIG_SMALL("Remove silence."), |
509 | .priv_size = sizeof(SilenceRemoveContext), |
510 | .priv_class = &silenceremove_class, |
511 | .init = init, |
512 | .uninit = uninit, |
513 | .query_formats = query_formats, |
514 | .inputs = silenceremove_inputs, |
515 | .outputs = silenceremove_outputs, |
516 | }; |
517 |