blob: eb626564cd9c2e31f9fb6827f5bc06cc220b176c
1 | /* |
2 | * Copyright (c) 2012 Pavel Koshevoy <pkoshevoy at gmail dot com> |
3 | * |
4 | * This file is part of FFmpeg. |
5 | * |
6 | * FFmpeg is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Lesser General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2.1 of the License, or (at your option) any later version. |
10 | * |
11 | * FFmpeg is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Lesser General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Lesser General Public |
17 | * License along with FFmpeg; if not, write to the Free Software |
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 | */ |
20 | |
21 | /** |
22 | * @file |
23 | * tempo scaling audio filter -- an implementation of WSOLA algorithm |
24 | * |
25 | * Based on MIT licensed yaeAudioTempoFilter.h and yaeAudioFragment.h |
26 | * from Apprentice Video player by Pavel Koshevoy. |
27 | * https://sourceforge.net/projects/apprenticevideo/ |
28 | * |
29 | * An explanation of SOLA algorithm is available at |
30 | * http://www.surina.net/article/time-and-pitch-scaling.html |
31 | * |
 * WSOLA is very similar to SOLA, only one major difference exists between
 * these algorithms. SOLA shifts audio fragments along the output stream,
 * whereas WSOLA shifts audio fragments along the input stream.
35 | * |
36 | * The advantage of WSOLA algorithm is that the overlap region size is |
37 | * always the same, therefore the blending function is constant and |
38 | * can be precomputed. |
39 | */ |
40 | |
41 | #include <float.h> |
42 | #include "libavcodec/avfft.h" |
43 | #include "libavutil/avassert.h" |
44 | #include "libavutil/avstring.h" |
45 | #include "libavutil/channel_layout.h" |
46 | #include "libavutil/eval.h" |
47 | #include "libavutil/opt.h" |
48 | #include "libavutil/samplefmt.h" |
49 | #include "avfilter.h" |
50 | #include "audio.h" |
51 | #include "internal.h" |
52 | |
/**
 * A fragment of audio waveform.
 *
 * The filter state keeps two of these (current and previous). The data
 * and xdat buffers are allocated by yae_reset() and released by
 * yae_release_buffers().
 */
typedef struct {
    // index of the first sample of this fragment in the overall waveform;
    // 0: input sample position
    // 1: output sample position
    int64_t position[2];

    // original packed multi-channel samples
    // (yae_reset() sizes this buffer to window * stride bytes):
    uint8_t *data;

    // number of samples in this fragment:
    int nsamples;

    // rDFT transform of the down-mixed mono fragment, used for
    // fast waveform alignment via correlation in frequency domain
    // (yae_reset() sizes this buffer to window FFTComplex elements):
    FFTSample *xdat;
} AudioFragment;
72 | |
/**
 * States of the filter state machine driven by yae_apply() and yae_flush().
 */
typedef enum {
    YAE_LOAD_FRAGMENT      = 0, // fill the current fragment from the input
    YAE_ADJUST_POSITION    = 1, // align current fragment to the previous one
    YAE_RELOAD_FRAGMENT    = 2, // re-read the fragment at its corrected position
    YAE_OUTPUT_OVERLAP_ADD = 3, // blend the overlap region into the output
    YAE_FLUSH_OUTPUT       = 4, // set by yae_flush() while draining
} FilterState;
83 | |
/**
 * Filter state machine.
 *
 * Holds the input ring buffer, the two alternating audio fragments,
 * the rDFT contexts used for alignment and the output buffer bookkeeping.
 */
typedef struct {
    const AVClass *class;

    // ring-buffer of input samples, necessary because sometimes
    // input fragment position may be adjusted backwards:
    uint8_t *buffer;

    // ring-buffer maximum capacity, expressed in sample rate time base
    // (yae_reset() sets it to three windows):
    int ring;

    // ring-buffer house keeping:
    int size;
    int head;
    int tail;

    // 0: input sample position corresponding to the ring buffer tail
    // 1: output sample position
    int64_t position[2];

    // sample format:
    enum AVSampleFormat format;

    // number of channels:
    int channels;

    // number of bytes to skip from one sample to next, across multiple channels;
    // stride = (number-of-channels * bits-per-sample-per-channel) / 8
    int stride;

    // fragment window size, power-of-two integer:
    int window;

    // Hann window coefficients, for feathering
    // (blending) the overlapping fragment region:
    float *hann;

    // tempo scaling factor:
    double tempo;

    // a snapshot of previous fragment input and output position values
    // captured when the tempo scale factor was set most recently:
    int64_t origin[2];

    // current/previous fragment ring-buffer
    // (the roles alternate with the parity of nfrag):
    AudioFragment frag[2];

    // current fragment index:
    uint64_t nfrag;

    // current state:
    FilterState state;

    // for fast correlation calculation in frequency domain:
    RDFTContext *real_to_complex;
    RDFTContext *complex_to_real;
    FFTSample   *correlation;

    // for managing AVFilterPad.request_frame and AVFilterPad.filter_frame
    AVFrame *dst_buffer;
    uint8_t *dst;
    uint8_t *dst_end;
    uint64_t nsamples_in;
    uint64_t nsamples_out;
} ATempoContext;
151 | |
// byte offset of a field within ATempoContext, used by the option table:
#define OFFSET(x) offsetof(ATempoContext, x)

// user-visible options: a single double-valued "tempo" option
// that defaults to 1.0 and is limited to the [0.5, 2.0] range:
static const AVOption atempo_options[] = {
    { "tempo", "set tempo scale factor",
      OFFSET(tempo), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 0.5, 2.0,
      AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM },
    { NULL }
};

// NOTE(review): presumably declares the AVClass for this filter from
// atempo_options; the macro is defined outside this file -- confirm.
AVFILTER_DEFINE_CLASS(atempo);
162 | |
163 | inline static AudioFragment *yae_curr_frag(ATempoContext *atempo) |
164 | { |
165 | return &atempo->frag[atempo->nfrag % 2]; |
166 | } |
167 | |
168 | inline static AudioFragment *yae_prev_frag(ATempoContext *atempo) |
169 | { |
170 | return &atempo->frag[(atempo->nfrag + 1) % 2]; |
171 | } |
172 | |
173 | /** |
174 | * Reset filter to initial state, do not deallocate existing local buffers. |
175 | */ |
176 | static void yae_clear(ATempoContext *atempo) |
177 | { |
178 | atempo->size = 0; |
179 | atempo->head = 0; |
180 | atempo->tail = 0; |
181 | |
182 | atempo->nfrag = 0; |
183 | atempo->state = YAE_LOAD_FRAGMENT; |
184 | |
185 | atempo->position[0] = 0; |
186 | atempo->position[1] = 0; |
187 | |
188 | atempo->origin[0] = 0; |
189 | atempo->origin[1] = 0; |
190 | |
191 | atempo->frag[0].position[0] = 0; |
192 | atempo->frag[0].position[1] = 0; |
193 | atempo->frag[0].nsamples = 0; |
194 | |
195 | atempo->frag[1].position[0] = 0; |
196 | atempo->frag[1].position[1] = 0; |
197 | atempo->frag[1].nsamples = 0; |
198 | |
199 | // shift left position of 1st fragment by half a window |
200 | // so that no re-normalization would be required for |
201 | // the left half of the 1st fragment: |
202 | atempo->frag[0].position[0] = -(int64_t)(atempo->window / 2); |
203 | atempo->frag[0].position[1] = -(int64_t)(atempo->window / 2); |
204 | |
205 | av_frame_free(&atempo->dst_buffer); |
206 | atempo->dst = NULL; |
207 | atempo->dst_end = NULL; |
208 | |
209 | atempo->nsamples_in = 0; |
210 | atempo->nsamples_out = 0; |
211 | } |
212 | |
213 | /** |
214 | * Reset filter to initial state and deallocate all buffers. |
215 | */ |
216 | static void yae_release_buffers(ATempoContext *atempo) |
217 | { |
218 | yae_clear(atempo); |
219 | |
220 | av_freep(&atempo->frag[0].data); |
221 | av_freep(&atempo->frag[1].data); |
222 | av_freep(&atempo->frag[0].xdat); |
223 | av_freep(&atempo->frag[1].xdat); |
224 | |
225 | av_freep(&atempo->buffer); |
226 | av_freep(&atempo->hann); |
227 | av_freep(&atempo->correlation); |
228 | |
229 | av_rdft_end(atempo->real_to_complex); |
230 | atempo->real_to_complex = NULL; |
231 | |
232 | av_rdft_end(atempo->complex_to_real); |
233 | atempo->complex_to_real = NULL; |
234 | } |
235 | |
/* av_realloc is not aligned enough; fortunately, the data does not need to
 * be preserved.
 *
 * Frees `field` and re-allocates it with av_malloc(field_size).
 * On allocation failure all filter buffers are released and the ENCLOSING
 * function returns AVERROR(ENOMEM). The macro therefore may only be used
 * inside a function returning int that has an `ATempoContext *atempo`
 * in scope. */
#define RE_MALLOC_OR_FAIL(field, field_size)                    \
    do {                                                        \
        av_freep(&field);                                       \
        field = av_malloc(field_size);                          \
        if (!field) {                                           \
            yae_release_buffers(atempo);                        \
            return AVERROR(ENOMEM);                             \
        }                                                       \
    } while (0)
247 | |
248 | /** |
249 | * Prepare filter for processing audio data of given format, |
250 | * sample rate and number of channels. |
251 | */ |
252 | static int yae_reset(ATempoContext *atempo, |
253 | enum AVSampleFormat format, |
254 | int sample_rate, |
255 | int channels) |
256 | { |
257 | const int sample_size = av_get_bytes_per_sample(format); |
258 | uint32_t nlevels = 0; |
259 | uint32_t pot; |
260 | int i; |
261 | |
262 | atempo->format = format; |
263 | atempo->channels = channels; |
264 | atempo->stride = sample_size * channels; |
265 | |
266 | // pick a segment window size: |
267 | atempo->window = sample_rate / 24; |
268 | |
269 | // adjust window size to be a power-of-two integer: |
270 | nlevels = av_log2(atempo->window); |
271 | pot = 1 << nlevels; |
272 | av_assert0(pot <= atempo->window); |
273 | |
274 | if (pot < atempo->window) { |
275 | atempo->window = pot * 2; |
276 | nlevels++; |
277 | } |
278 | |
279 | // initialize audio fragment buffers: |
280 | RE_MALLOC_OR_FAIL(atempo->frag[0].data, atempo->window * atempo->stride); |
281 | RE_MALLOC_OR_FAIL(atempo->frag[1].data, atempo->window * atempo->stride); |
282 | RE_MALLOC_OR_FAIL(atempo->frag[0].xdat, atempo->window * sizeof(FFTComplex)); |
283 | RE_MALLOC_OR_FAIL(atempo->frag[1].xdat, atempo->window * sizeof(FFTComplex)); |
284 | |
285 | // initialize rDFT contexts: |
286 | av_rdft_end(atempo->real_to_complex); |
287 | atempo->real_to_complex = NULL; |
288 | |
289 | av_rdft_end(atempo->complex_to_real); |
290 | atempo->complex_to_real = NULL; |
291 | |
292 | atempo->real_to_complex = av_rdft_init(nlevels + 1, DFT_R2C); |
293 | if (!atempo->real_to_complex) { |
294 | yae_release_buffers(atempo); |
295 | return AVERROR(ENOMEM); |
296 | } |
297 | |
298 | atempo->complex_to_real = av_rdft_init(nlevels + 1, IDFT_C2R); |
299 | if (!atempo->complex_to_real) { |
300 | yae_release_buffers(atempo); |
301 | return AVERROR(ENOMEM); |
302 | } |
303 | |
304 | RE_MALLOC_OR_FAIL(atempo->correlation, atempo->window * sizeof(FFTComplex)); |
305 | |
306 | atempo->ring = atempo->window * 3; |
307 | RE_MALLOC_OR_FAIL(atempo->buffer, atempo->ring * atempo->stride); |
308 | |
309 | // initialize the Hann window function: |
310 | RE_MALLOC_OR_FAIL(atempo->hann, atempo->window * sizeof(float)); |
311 | |
312 | for (i = 0; i < atempo->window; i++) { |
313 | double t = (double)i / (double)(atempo->window - 1); |
314 | double h = 0.5 * (1.0 - cos(2.0 * M_PI * t)); |
315 | atempo->hann[i] = (float)h; |
316 | } |
317 | |
318 | yae_clear(atempo); |
319 | return 0; |
320 | } |
321 | |
322 | static int yae_set_tempo(AVFilterContext *ctx, const char *arg_tempo) |
323 | { |
324 | const AudioFragment *prev; |
325 | ATempoContext *atempo = ctx->priv; |
326 | char *tail = NULL; |
327 | double tempo = av_strtod(arg_tempo, &tail); |
328 | |
329 | if (tail && *tail) { |
330 | av_log(ctx, AV_LOG_ERROR, "Invalid tempo value '%s'\n", arg_tempo); |
331 | return AVERROR(EINVAL); |
332 | } |
333 | |
334 | if (tempo < 0.5 || tempo > 2.0) { |
335 | av_log(ctx, AV_LOG_ERROR, "Tempo value %f exceeds [0.5, 2.0] range\n", |
336 | tempo); |
337 | return AVERROR(EINVAL); |
338 | } |
339 | |
340 | prev = yae_prev_frag(atempo); |
341 | atempo->origin[0] = prev->position[0] + atempo->window / 2; |
342 | atempo->origin[1] = prev->position[1] + atempo->window / 2; |
343 | atempo->tempo = tempo; |
344 | return 0; |
345 | } |
346 | |
/**
 * A helper macro for initializing complex data buffer with scalar data
 * of a given type.
 *
 * Expects `src` (const uint8_t *, advanced by the macro), `frag` and
 * `atempo` to be in scope. Reads frag->nsamples interleaved sample frames
 * from src and writes one FFTSample per frame into frag->xdat:
 * for mono input the sample itself; otherwise the value of the channel
 * with the largest magnitude, where magnitudes are clamped to scalar_max
 * for the comparison and the first channel wins ties.
 */
#define yae_init_xdat(scalar_type, scalar_max)                          \
    do {                                                                \
        const uint8_t *src_end = src +                                  \
            frag->nsamples * atempo->channels * sizeof(scalar_type);    \
                                                                        \
        FFTSample *xdat = frag->xdat;                                   \
        scalar_type tmp;                                                \
                                                                        \
        if (atempo->channels == 1) {                                    \
            for (; src < src_end; xdat++) {                             \
                tmp = *(const scalar_type *)src;                        \
                src += sizeof(scalar_type);                             \
                                                                        \
                *xdat = (FFTSample)tmp;                                 \
            }                                                           \
        } else {                                                        \
            FFTSample s, max, ti, si;                                   \
            int i;                                                      \
                                                                        \
            for (; src < src_end; xdat++) {                             \
                tmp = *(const scalar_type *)src;                        \
                src += sizeof(scalar_type);                             \
                                                                        \
                max = (FFTSample)tmp;                                   \
                s = FFMIN((FFTSample)scalar_max,                        \
                          (FFTSample)fabsf(max));                       \
                                                                        \
                for (i = 1; i < atempo->channels; i++) {                \
                    tmp = *(const scalar_type *)src;                    \
                    src += sizeof(scalar_type);                         \
                                                                        \
                    ti = (FFTSample)tmp;                                \
                    si = FFMIN((FFTSample)scalar_max,                   \
                               (FFTSample)fabsf(ti));                   \
                                                                        \
                    if (s < si) {                                       \
                        s   = si;                                       \
                        max = ti;                                       \
                    }                                                   \
                }                                                       \
                                                                        \
                *xdat = max;                                            \
            }                                                           \
        }                                                               \
    } while (0)
396 | |
397 | /** |
398 | * Initialize complex data buffer of a given audio fragment |
399 | * with down-mixed mono data of appropriate scalar type. |
400 | */ |
401 | static void yae_downmix(ATempoContext *atempo, AudioFragment *frag) |
402 | { |
403 | // shortcuts: |
404 | const uint8_t *src = frag->data; |
405 | |
406 | // init complex data buffer used for FFT and Correlation: |
407 | memset(frag->xdat, 0, sizeof(FFTComplex) * atempo->window); |
408 | |
409 | if (atempo->format == AV_SAMPLE_FMT_U8) { |
410 | yae_init_xdat(uint8_t, 127); |
411 | } else if (atempo->format == AV_SAMPLE_FMT_S16) { |
412 | yae_init_xdat(int16_t, 32767); |
413 | } else if (atempo->format == AV_SAMPLE_FMT_S32) { |
414 | yae_init_xdat(int, 2147483647); |
415 | } else if (atempo->format == AV_SAMPLE_FMT_FLT) { |
416 | yae_init_xdat(float, 1); |
417 | } else if (atempo->format == AV_SAMPLE_FMT_DBL) { |
418 | yae_init_xdat(double, 1); |
419 | } |
420 | } |
421 | |
422 | /** |
423 | * Populate the internal data buffer on as-needed basis. |
424 | * |
425 | * @return |
426 | * 0 if requested data was already available or was successfully loaded, |
427 | * AVERROR(EAGAIN) if more input data is required. |
428 | */ |
429 | static int yae_load_data(ATempoContext *atempo, |
430 | const uint8_t **src_ref, |
431 | const uint8_t *src_end, |
432 | int64_t stop_here) |
433 | { |
434 | // shortcut: |
435 | const uint8_t *src = *src_ref; |
436 | const int read_size = stop_here - atempo->position[0]; |
437 | |
438 | if (stop_here <= atempo->position[0]) { |
439 | return 0; |
440 | } |
441 | |
442 | // samples are not expected to be skipped: |
443 | av_assert0(read_size <= atempo->ring); |
444 | |
445 | while (atempo->position[0] < stop_here && src < src_end) { |
446 | int src_samples = (src_end - src) / atempo->stride; |
447 | |
448 | // load data piece-wise, in order to avoid complicating the logic: |
449 | int nsamples = FFMIN(read_size, src_samples); |
450 | int na; |
451 | int nb; |
452 | |
453 | nsamples = FFMIN(nsamples, atempo->ring); |
454 | na = FFMIN(nsamples, atempo->ring - atempo->tail); |
455 | nb = FFMIN(nsamples - na, atempo->ring); |
456 | |
457 | if (na) { |
458 | uint8_t *a = atempo->buffer + atempo->tail * atempo->stride; |
459 | memcpy(a, src, na * atempo->stride); |
460 | |
461 | src += na * atempo->stride; |
462 | atempo->position[0] += na; |
463 | |
464 | atempo->size = FFMIN(atempo->size + na, atempo->ring); |
465 | atempo->tail = (atempo->tail + na) % atempo->ring; |
466 | atempo->head = |
467 | atempo->size < atempo->ring ? |
468 | atempo->tail - atempo->size : |
469 | atempo->tail; |
470 | } |
471 | |
472 | if (nb) { |
473 | uint8_t *b = atempo->buffer; |
474 | memcpy(b, src, nb * atempo->stride); |
475 | |
476 | src += nb * atempo->stride; |
477 | atempo->position[0] += nb; |
478 | |
479 | atempo->size = FFMIN(atempo->size + nb, atempo->ring); |
480 | atempo->tail = (atempo->tail + nb) % atempo->ring; |
481 | atempo->head = |
482 | atempo->size < atempo->ring ? |
483 | atempo->tail - atempo->size : |
484 | atempo->tail; |
485 | } |
486 | } |
487 | |
488 | // pass back the updated source buffer pointer: |
489 | *src_ref = src; |
490 | |
491 | // sanity check: |
492 | av_assert0(atempo->position[0] <= stop_here); |
493 | |
494 | return atempo->position[0] == stop_here ? 0 : AVERROR(EAGAIN); |
495 | } |
496 | |
497 | /** |
498 | * Populate current audio fragment data buffer. |
499 | * |
500 | * @return |
501 | * 0 when the fragment is ready, |
502 | * AVERROR(EAGAIN) if more input data is required. |
503 | */ |
504 | static int yae_load_frag(ATempoContext *atempo, |
505 | const uint8_t **src_ref, |
506 | const uint8_t *src_end) |
507 | { |
508 | // shortcuts: |
509 | AudioFragment *frag = yae_curr_frag(atempo); |
510 | uint8_t *dst; |
511 | int64_t missing, start, zeros; |
512 | uint32_t nsamples; |
513 | const uint8_t *a, *b; |
514 | int i0, i1, n0, n1, na, nb; |
515 | |
516 | int64_t stop_here = frag->position[0] + atempo->window; |
517 | if (src_ref && yae_load_data(atempo, src_ref, src_end, stop_here) != 0) { |
518 | return AVERROR(EAGAIN); |
519 | } |
520 | |
521 | // calculate the number of samples we don't have: |
522 | missing = |
523 | stop_here > atempo->position[0] ? |
524 | stop_here - atempo->position[0] : 0; |
525 | |
526 | nsamples = |
527 | missing < (int64_t)atempo->window ? |
528 | (uint32_t)(atempo->window - missing) : 0; |
529 | |
530 | // setup the output buffer: |
531 | frag->nsamples = nsamples; |
532 | dst = frag->data; |
533 | |
534 | start = atempo->position[0] - atempo->size; |
535 | zeros = 0; |
536 | |
537 | if (frag->position[0] < start) { |
538 | // what we don't have we substitute with zeros: |
539 | zeros = FFMIN(start - frag->position[0], (int64_t)nsamples); |
540 | av_assert0(zeros != nsamples); |
541 | |
542 | memset(dst, 0, zeros * atempo->stride); |
543 | dst += zeros * atempo->stride; |
544 | } |
545 | |
546 | if (zeros == nsamples) { |
547 | return 0; |
548 | } |
549 | |
550 | // get the remaining data from the ring buffer: |
551 | na = (atempo->head < atempo->tail ? |
552 | atempo->tail - atempo->head : |
553 | atempo->ring - atempo->head); |
554 | |
555 | nb = atempo->head < atempo->tail ? 0 : atempo->tail; |
556 | |
557 | // sanity check: |
558 | av_assert0(nsamples <= zeros + na + nb); |
559 | |
560 | a = atempo->buffer + atempo->head * atempo->stride; |
561 | b = atempo->buffer; |
562 | |
563 | i0 = frag->position[0] + zeros - start; |
564 | i1 = i0 < na ? 0 : i0 - na; |
565 | |
566 | n0 = i0 < na ? FFMIN(na - i0, (int)(nsamples - zeros)) : 0; |
567 | n1 = nsamples - zeros - n0; |
568 | |
569 | if (n0) { |
570 | memcpy(dst, a + i0 * atempo->stride, n0 * atempo->stride); |
571 | dst += n0 * atempo->stride; |
572 | } |
573 | |
574 | if (n1) { |
575 | memcpy(dst, b + i1 * atempo->stride, n1 * atempo->stride); |
576 | } |
577 | |
578 | return 0; |
579 | } |
580 | |
581 | /** |
582 | * Prepare for loading next audio fragment. |
583 | */ |
584 | static void yae_advance_to_next_frag(ATempoContext *atempo) |
585 | { |
586 | const double fragment_step = atempo->tempo * (double)(atempo->window / 2); |
587 | |
588 | const AudioFragment *prev; |
589 | AudioFragment *frag; |
590 | |
591 | atempo->nfrag++; |
592 | prev = yae_prev_frag(atempo); |
593 | frag = yae_curr_frag(atempo); |
594 | |
595 | frag->position[0] = prev->position[0] + (int64_t)fragment_step; |
596 | frag->position[1] = prev->position[1] + atempo->window / 2; |
597 | frag->nsamples = 0; |
598 | } |
599 | |
600 | /** |
601 | * Calculate cross-correlation via rDFT. |
602 | * |
603 | * Multiply two vectors of complex numbers (result of real_to_complex rDFT) |
604 | * and transform back via complex_to_real rDFT. |
605 | */ |
606 | static void yae_xcorr_via_rdft(FFTSample *xcorr, |
607 | RDFTContext *complex_to_real, |
608 | const FFTComplex *xa, |
609 | const FFTComplex *xb, |
610 | const int window) |
611 | { |
612 | FFTComplex *xc = (FFTComplex *)xcorr; |
613 | int i; |
614 | |
615 | // NOTE: first element requires special care -- Given Y = rDFT(X), |
616 | // Im(Y[0]) and Im(Y[N/2]) are always zero, therefore av_rdft_calc |
617 | // stores Re(Y[N/2]) in place of Im(Y[0]). |
618 | |
619 | xc->re = xa->re * xb->re; |
620 | xc->im = xa->im * xb->im; |
621 | xa++; |
622 | xb++; |
623 | xc++; |
624 | |
625 | for (i = 1; i < window; i++, xa++, xb++, xc++) { |
626 | xc->re = (xa->re * xb->re + xa->im * xb->im); |
627 | xc->im = (xa->im * xb->re - xa->re * xb->im); |
628 | } |
629 | |
630 | // apply inverse rDFT: |
631 | av_rdft_calc(complex_to_real, xcorr); |
632 | } |
633 | |
634 | /** |
635 | * Calculate alignment offset for given fragment |
636 | * relative to the previous fragment. |
637 | * |
638 | * @return alignment offset of current fragment relative to previous. |
639 | */ |
640 | static int yae_align(AudioFragment *frag, |
641 | const AudioFragment *prev, |
642 | const int window, |
643 | const int delta_max, |
644 | const int drift, |
645 | FFTSample *correlation, |
646 | RDFTContext *complex_to_real) |
647 | { |
648 | int best_offset = -drift; |
649 | FFTSample best_metric = -FLT_MAX; |
650 | FFTSample *xcorr; |
651 | |
652 | int i0; |
653 | int i1; |
654 | int i; |
655 | |
656 | yae_xcorr_via_rdft(correlation, |
657 | complex_to_real, |
658 | (const FFTComplex *)prev->xdat, |
659 | (const FFTComplex *)frag->xdat, |
660 | window); |
661 | |
662 | // identify search window boundaries: |
663 | i0 = FFMAX(window / 2 - delta_max - drift, 0); |
664 | i0 = FFMIN(i0, window); |
665 | |
666 | i1 = FFMIN(window / 2 + delta_max - drift, window - window / 16); |
667 | i1 = FFMAX(i1, 0); |
668 | |
669 | // identify cross-correlation peaks within search window: |
670 | xcorr = correlation + i0; |
671 | |
672 | for (i = i0; i < i1; i++, xcorr++) { |
673 | FFTSample metric = *xcorr; |
674 | |
675 | // normalize: |
676 | FFTSample drifti = (FFTSample)(drift + i); |
677 | metric *= drifti * (FFTSample)(i - i0) * (FFTSample)(i1 - i); |
678 | |
679 | if (metric > best_metric) { |
680 | best_metric = metric; |
681 | best_offset = i - window / 2; |
682 | } |
683 | } |
684 | |
685 | return best_offset; |
686 | } |
687 | |
688 | /** |
689 | * Adjust current fragment position for better alignment |
690 | * with previous fragment. |
691 | * |
692 | * @return alignment correction. |
693 | */ |
694 | static int yae_adjust_position(ATempoContext *atempo) |
695 | { |
696 | const AudioFragment *prev = yae_prev_frag(atempo); |
697 | AudioFragment *frag = yae_curr_frag(atempo); |
698 | |
699 | const double prev_output_position = |
700 | (double)(prev->position[1] - atempo->origin[1] + atempo->window / 2) * |
701 | atempo->tempo; |
702 | |
703 | const double ideal_output_position = |
704 | (double)(prev->position[0] - atempo->origin[0] + atempo->window / 2); |
705 | |
706 | const int drift = (int)(prev_output_position - ideal_output_position); |
707 | |
708 | const int delta_max = atempo->window / 2; |
709 | const int correction = yae_align(frag, |
710 | prev, |
711 | atempo->window, |
712 | delta_max, |
713 | drift, |
714 | atempo->correlation, |
715 | atempo->complex_to_real); |
716 | |
717 | if (correction) { |
718 | // adjust fragment position: |
719 | frag->position[0] -= correction; |
720 | |
721 | // clear so that the fragment can be reloaded: |
722 | frag->nsamples = 0; |
723 | } |
724 | |
725 | return correction; |
726 | } |
727 | |
/**
 * A helper macro for blending the overlap region of previous
 * and current audio fragment.
 *
 * Expects `a`, `b` (source samples of the previous/current fragment),
 * `wa`, `wb` (matching window coefficients), `dst`, `dst_end`, `overlap`,
 * `frag` and `atempo` to be in scope. Emits up to `overlap` sample frames
 * while there is room before dst_end; every channel value is computed as
 * a * wa + b * wb in float, except that frames whose input position
 * (frag->position[0] + i) is negative copy the previous fragment's sample
 * unchanged. Advances atempo->position[1], wa and wb per frame, and leaves
 * `dst` pointing past the last written sample.
 */
#define yae_blend(scalar_type)                                          \
    do {                                                                \
        const scalar_type *aaa = (const scalar_type *)a;                \
        const scalar_type *bbb = (const scalar_type *)b;                \
                                                                        \
        scalar_type *out     = (scalar_type *)dst;                      \
        scalar_type *out_end = (scalar_type *)dst_end;                  \
        int64_t i;                                                      \
                                                                        \
        for (i = 0; i < overlap && out < out_end;                       \
             i++, atempo->position[1]++, wa++, wb++) {                  \
            float w0 = *wa;                                             \
            float w1 = *wb;                                             \
            int j;                                                      \
                                                                        \
            for (j = 0; j < atempo->channels;                           \
                 j++, aaa++, bbb++, out++) {                            \
                float t0 = (float)*aaa;                                 \
                float t1 = (float)*bbb;                                 \
                                                                        \
                *out =                                                  \
                    frag->position[0] + i < 0 ?                         \
                    *aaa :                                              \
                    (scalar_type)(t0 * w0 + t1 * w1);                   \
            }                                                           \
        }                                                               \
        dst = (uint8_t *)out;                                           \
    } while (0)
760 | |
761 | /** |
762 | * Blend the overlap region of previous and current audio fragment |
763 | * and output the results to the given destination buffer. |
764 | * |
765 | * @return |
766 | * 0 if the overlap region was completely stored in the dst buffer, |
767 | * AVERROR(EAGAIN) if more destination buffer space is required. |
768 | */ |
769 | static int yae_overlap_add(ATempoContext *atempo, |
770 | uint8_t **dst_ref, |
771 | uint8_t *dst_end) |
772 | { |
773 | // shortcuts: |
774 | const AudioFragment *prev = yae_prev_frag(atempo); |
775 | const AudioFragment *frag = yae_curr_frag(atempo); |
776 | |
777 | const int64_t start_here = FFMAX(atempo->position[1], |
778 | frag->position[1]); |
779 | |
780 | const int64_t stop_here = FFMIN(prev->position[1] + prev->nsamples, |
781 | frag->position[1] + frag->nsamples); |
782 | |
783 | const int64_t overlap = stop_here - start_here; |
784 | |
785 | const int64_t ia = start_here - prev->position[1]; |
786 | const int64_t ib = start_here - frag->position[1]; |
787 | |
788 | const float *wa = atempo->hann + ia; |
789 | const float *wb = atempo->hann + ib; |
790 | |
791 | const uint8_t *a = prev->data + ia * atempo->stride; |
792 | const uint8_t *b = frag->data + ib * atempo->stride; |
793 | |
794 | uint8_t *dst = *dst_ref; |
795 | |
796 | av_assert0(start_here <= stop_here && |
797 | frag->position[1] <= start_here && |
798 | overlap <= frag->nsamples); |
799 | |
800 | if (atempo->format == AV_SAMPLE_FMT_U8) { |
801 | yae_blend(uint8_t); |
802 | } else if (atempo->format == AV_SAMPLE_FMT_S16) { |
803 | yae_blend(int16_t); |
804 | } else if (atempo->format == AV_SAMPLE_FMT_S32) { |
805 | yae_blend(int); |
806 | } else if (atempo->format == AV_SAMPLE_FMT_FLT) { |
807 | yae_blend(float); |
808 | } else if (atempo->format == AV_SAMPLE_FMT_DBL) { |
809 | yae_blend(double); |
810 | } |
811 | |
812 | // pass-back the updated destination buffer pointer: |
813 | *dst_ref = dst; |
814 | |
815 | return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN); |
816 | } |
817 | |
818 | /** |
819 | * Feed as much data to the filter as it is able to consume |
820 | * and receive as much processed data in the destination buffer |
821 | * as it is able to produce or store. |
822 | */ |
823 | static void |
824 | yae_apply(ATempoContext *atempo, |
825 | const uint8_t **src_ref, |
826 | const uint8_t *src_end, |
827 | uint8_t **dst_ref, |
828 | uint8_t *dst_end) |
829 | { |
830 | while (1) { |
831 | if (atempo->state == YAE_LOAD_FRAGMENT) { |
832 | // load additional data for the current fragment: |
833 | if (yae_load_frag(atempo, src_ref, src_end) != 0) { |
834 | break; |
835 | } |
836 | |
837 | // down-mix to mono: |
838 | yae_downmix(atempo, yae_curr_frag(atempo)); |
839 | |
840 | // apply rDFT: |
841 | av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat); |
842 | |
843 | // must load the second fragment before alignment can start: |
844 | if (!atempo->nfrag) { |
845 | yae_advance_to_next_frag(atempo); |
846 | continue; |
847 | } |
848 | |
849 | atempo->state = YAE_ADJUST_POSITION; |
850 | } |
851 | |
852 | if (atempo->state == YAE_ADJUST_POSITION) { |
853 | // adjust position for better alignment: |
854 | if (yae_adjust_position(atempo)) { |
855 | // reload the fragment at the corrected position, so that the |
856 | // Hann window blending would not require normalization: |
857 | atempo->state = YAE_RELOAD_FRAGMENT; |
858 | } else { |
859 | atempo->state = YAE_OUTPUT_OVERLAP_ADD; |
860 | } |
861 | } |
862 | |
863 | if (atempo->state == YAE_RELOAD_FRAGMENT) { |
864 | // load additional data if necessary due to position adjustment: |
865 | if (yae_load_frag(atempo, src_ref, src_end) != 0) { |
866 | break; |
867 | } |
868 | |
869 | // down-mix to mono: |
870 | yae_downmix(atempo, yae_curr_frag(atempo)); |
871 | |
872 | // apply rDFT: |
873 | av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat); |
874 | |
875 | atempo->state = YAE_OUTPUT_OVERLAP_ADD; |
876 | } |
877 | |
878 | if (atempo->state == YAE_OUTPUT_OVERLAP_ADD) { |
879 | // overlap-add and output the result: |
880 | if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) { |
881 | break; |
882 | } |
883 | |
884 | // advance to the next fragment, repeat: |
885 | yae_advance_to_next_frag(atempo); |
886 | atempo->state = YAE_LOAD_FRAGMENT; |
887 | } |
888 | } |
889 | } |
890 | |
891 | /** |
892 | * Flush any buffered data from the filter. |
893 | * |
894 | * @return |
895 | * 0 if all data was completely stored in the dst buffer, |
896 | * AVERROR(EAGAIN) if more destination buffer space is required. |
897 | */ |
898 | static int yae_flush(ATempoContext *atempo, |
899 | uint8_t **dst_ref, |
900 | uint8_t *dst_end) |
901 | { |
902 | AudioFragment *frag = yae_curr_frag(atempo); |
903 | int64_t overlap_end; |
904 | int64_t start_here; |
905 | int64_t stop_here; |
906 | int64_t offset; |
907 | |
908 | const uint8_t *src; |
909 | uint8_t *dst; |
910 | |
911 | int src_size; |
912 | int dst_size; |
913 | int nbytes; |
914 | |
915 | atempo->state = YAE_FLUSH_OUTPUT; |
916 | |
917 | if (atempo->position[0] >= frag->position[0] + frag->nsamples && |
918 | atempo->position[1] >= frag->position[1] + frag->nsamples) { |
919 | // the current fragment is already flushed: |
920 | return 0; |
921 | } |
922 | |
923 | if (frag->position[0] + frag->nsamples < atempo->position[0]) { |
924 | // finish loading the current (possibly partial) fragment: |
925 | yae_load_frag(atempo, NULL, NULL); |
926 | |
927 | if (atempo->nfrag) { |
928 | // down-mix to mono: |
929 | yae_downmix(atempo, frag); |
930 | |
931 | // apply rDFT: |
932 | av_rdft_calc(atempo->real_to_complex, frag->xdat); |
933 | |
934 | // align current fragment to previous fragment: |
935 | if (yae_adjust_position(atempo)) { |
936 | // reload the current fragment due to adjusted position: |
937 | yae_load_frag(atempo, NULL, NULL); |
938 | } |
939 | } |
940 | } |
941 | |
942 | // flush the overlap region: |
943 | overlap_end = frag->position[1] + FFMIN(atempo->window / 2, |
944 | frag->nsamples); |
945 | |
946 | while (atempo->position[1] < overlap_end) { |
947 | if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) { |
948 | return AVERROR(EAGAIN); |
949 | } |
950 | } |
951 | |
952 | // check whether all of the input samples have been consumed: |
953 | if (frag->position[0] + frag->nsamples < atempo->position[0]) { |
954 | yae_advance_to_next_frag(atempo); |
955 | return AVERROR(EAGAIN); |
956 | } |
957 | |
958 | // flush the remainder of the current fragment: |
959 | start_here = FFMAX(atempo->position[1], overlap_end); |
960 | stop_here = frag->position[1] + frag->nsamples; |
961 | offset = start_here - frag->position[1]; |
962 | av_assert0(start_here <= stop_here && frag->position[1] <= start_here); |
963 | |
964 | src = frag->data + offset * atempo->stride; |
965 | dst = (uint8_t *)*dst_ref; |
966 | |
967 | src_size = (int)(stop_here - start_here) * atempo->stride; |
968 | dst_size = dst_end - dst; |
969 | nbytes = FFMIN(src_size, dst_size); |
970 | |
971 | memcpy(dst, src, nbytes); |
972 | dst += nbytes; |
973 | |
974 | atempo->position[1] += (nbytes / atempo->stride); |
975 | |
976 | // pass-back the updated destination buffer pointer: |
977 | *dst_ref = (uint8_t *)dst; |
978 | |
979 | return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN); |
980 | } |
981 | |
982 | static av_cold int init(AVFilterContext *ctx) |
983 | { |
984 | ATempoContext *atempo = ctx->priv; |
985 | atempo->format = AV_SAMPLE_FMT_NONE; |
986 | atempo->state = YAE_LOAD_FRAGMENT; |
987 | return 0; |
988 | } |
989 | |
990 | static av_cold void uninit(AVFilterContext *ctx) |
991 | { |
992 | ATempoContext *atempo = ctx->priv; |
993 | yae_release_buffers(atempo); |
994 | } |
995 | |
996 | static int query_formats(AVFilterContext *ctx) |
997 | { |
998 | AVFilterChannelLayouts *layouts = NULL; |
999 | AVFilterFormats *formats = NULL; |
1000 | |
1001 | // WSOLA necessitates an internal sliding window ring buffer |
1002 | // for incoming audio stream. |
1003 | // |
1004 | // Planar sample formats are too cumbersome to store in a ring buffer, |
1005 | // therefore planar sample formats are not supported. |
1006 | // |
1007 | static const enum AVSampleFormat sample_fmts[] = { |
1008 | AV_SAMPLE_FMT_U8, |
1009 | AV_SAMPLE_FMT_S16, |
1010 | AV_SAMPLE_FMT_S32, |
1011 | AV_SAMPLE_FMT_FLT, |
1012 | AV_SAMPLE_FMT_DBL, |
1013 | AV_SAMPLE_FMT_NONE |
1014 | }; |
1015 | int ret; |
1016 | |
1017 | layouts = ff_all_channel_counts(); |
1018 | if (!layouts) { |
1019 | return AVERROR(ENOMEM); |
1020 | } |
1021 | ret = ff_set_common_channel_layouts(ctx, layouts); |
1022 | if (ret < 0) |
1023 | return ret; |
1024 | |
1025 | formats = ff_make_format_list(sample_fmts); |
1026 | if (!formats) { |
1027 | return AVERROR(ENOMEM); |
1028 | } |
1029 | ret = ff_set_common_formats(ctx, formats); |
1030 | if (ret < 0) |
1031 | return ret; |
1032 | |
1033 | formats = ff_all_samplerates(); |
1034 | if (!formats) { |
1035 | return AVERROR(ENOMEM); |
1036 | } |
1037 | return ff_set_common_samplerates(ctx, formats); |
1038 | } |
1039 | |
1040 | static int config_props(AVFilterLink *inlink) |
1041 | { |
1042 | AVFilterContext *ctx = inlink->dst; |
1043 | ATempoContext *atempo = ctx->priv; |
1044 | |
1045 | enum AVSampleFormat format = inlink->format; |
1046 | int sample_rate = (int)inlink->sample_rate; |
1047 | |
1048 | return yae_reset(atempo, format, sample_rate, inlink->channels); |
1049 | } |
1050 | |
1051 | static int push_samples(ATempoContext *atempo, |
1052 | AVFilterLink *outlink, |
1053 | int n_out) |
1054 | { |
1055 | int ret; |
1056 | |
1057 | atempo->dst_buffer->sample_rate = outlink->sample_rate; |
1058 | atempo->dst_buffer->nb_samples = n_out; |
1059 | |
1060 | // adjust the PTS: |
1061 | atempo->dst_buffer->pts = |
1062 | av_rescale_q(atempo->nsamples_out, |
1063 | (AVRational){ 1, outlink->sample_rate }, |
1064 | outlink->time_base); |
1065 | |
1066 | ret = ff_filter_frame(outlink, atempo->dst_buffer); |
1067 | atempo->dst_buffer = NULL; |
1068 | atempo->dst = NULL; |
1069 | atempo->dst_end = NULL; |
1070 | if (ret < 0) |
1071 | return ret; |
1072 | |
1073 | atempo->nsamples_out += n_out; |
1074 | return 0; |
1075 | } |
1076 | |
1077 | static int filter_frame(AVFilterLink *inlink, AVFrame *src_buffer) |
1078 | { |
1079 | AVFilterContext *ctx = inlink->dst; |
1080 | ATempoContext *atempo = ctx->priv; |
1081 | AVFilterLink *outlink = ctx->outputs[0]; |
1082 | |
1083 | int ret = 0; |
1084 | int n_in = src_buffer->nb_samples; |
1085 | int n_out = (int)(0.5 + ((double)n_in) / atempo->tempo); |
1086 | |
1087 | const uint8_t *src = src_buffer->data[0]; |
1088 | const uint8_t *src_end = src + n_in * atempo->stride; |
1089 | |
1090 | while (src < src_end) { |
1091 | if (!atempo->dst_buffer) { |
1092 | atempo->dst_buffer = ff_get_audio_buffer(outlink, n_out); |
1093 | if (!atempo->dst_buffer) |
1094 | return AVERROR(ENOMEM); |
1095 | av_frame_copy_props(atempo->dst_buffer, src_buffer); |
1096 | |
1097 | atempo->dst = atempo->dst_buffer->data[0]; |
1098 | atempo->dst_end = atempo->dst + n_out * atempo->stride; |
1099 | } |
1100 | |
1101 | yae_apply(atempo, &src, src_end, &atempo->dst, atempo->dst_end); |
1102 | |
1103 | if (atempo->dst == atempo->dst_end) { |
1104 | int n_samples = ((atempo->dst - atempo->dst_buffer->data[0]) / |
1105 | atempo->stride); |
1106 | ret = push_samples(atempo, outlink, n_samples); |
1107 | if (ret < 0) |
1108 | goto end; |
1109 | } |
1110 | } |
1111 | |
1112 | atempo->nsamples_in += n_in; |
1113 | end: |
1114 | av_frame_free(&src_buffer); |
1115 | return ret; |
1116 | } |
1117 | |
1118 | static int request_frame(AVFilterLink *outlink) |
1119 | { |
1120 | AVFilterContext *ctx = outlink->src; |
1121 | ATempoContext *atempo = ctx->priv; |
1122 | int ret; |
1123 | |
1124 | ret = ff_request_frame(ctx->inputs[0]); |
1125 | |
1126 | if (ret == AVERROR_EOF) { |
1127 | // flush the filter: |
1128 | int n_max = atempo->ring; |
1129 | int n_out; |
1130 | int err = AVERROR(EAGAIN); |
1131 | |
1132 | while (err == AVERROR(EAGAIN)) { |
1133 | if (!atempo->dst_buffer) { |
1134 | atempo->dst_buffer = ff_get_audio_buffer(outlink, n_max); |
1135 | if (!atempo->dst_buffer) |
1136 | return AVERROR(ENOMEM); |
1137 | |
1138 | atempo->dst = atempo->dst_buffer->data[0]; |
1139 | atempo->dst_end = atempo->dst + n_max * atempo->stride; |
1140 | } |
1141 | |
1142 | err = yae_flush(atempo, &atempo->dst, atempo->dst_end); |
1143 | |
1144 | n_out = ((atempo->dst - atempo->dst_buffer->data[0]) / |
1145 | atempo->stride); |
1146 | |
1147 | if (n_out) { |
1148 | ret = push_samples(atempo, outlink, n_out); |
1149 | } |
1150 | } |
1151 | |
1152 | av_frame_free(&atempo->dst_buffer); |
1153 | atempo->dst = NULL; |
1154 | atempo->dst_end = NULL; |
1155 | |
1156 | return AVERROR_EOF; |
1157 | } |
1158 | |
1159 | return ret; |
1160 | } |
1161 | |
1162 | static int process_command(AVFilterContext *ctx, |
1163 | const char *cmd, |
1164 | const char *arg, |
1165 | char *res, |
1166 | int res_len, |
1167 | int flags) |
1168 | { |
1169 | return !strcmp(cmd, "tempo") ? yae_set_tempo(ctx, arg) : AVERROR(ENOSYS); |
1170 | } |
1171 | |
1172 | static const AVFilterPad atempo_inputs[] = { |
1173 | { |
1174 | .name = "default", |
1175 | .type = AVMEDIA_TYPE_AUDIO, |
1176 | .filter_frame = filter_frame, |
1177 | .config_props = config_props, |
1178 | }, |
1179 | { NULL } |
1180 | }; |
1181 | |
1182 | static const AVFilterPad atempo_outputs[] = { |
1183 | { |
1184 | .name = "default", |
1185 | .request_frame = request_frame, |
1186 | .type = AVMEDIA_TYPE_AUDIO, |
1187 | }, |
1188 | { NULL } |
1189 | }; |
1190 | |
1191 | AVFilter ff_af_atempo = { |
1192 | .name = "atempo", |
1193 | .description = NULL_IF_CONFIG_SMALL("Adjust audio tempo."), |
1194 | .init = init, |
1195 | .uninit = uninit, |
1196 | .query_formats = query_formats, |
1197 | .process_command = process_command, |
1198 | .priv_size = sizeof(ATempoContext), |
1199 | .priv_class = &atempo_class, |
1200 | .inputs = atempo_inputs, |
1201 | .outputs = atempo_outputs, |
1202 | }; |
1203 |