blob: 9e8e790c84e80a1607b90f1a19e7cde9f8a86b4f
1 | /* ***** BEGIN LICENSE BLOCK ***** |
2 | * Source last modified: $Id: sbrqmf.c,v 1.2 2005/05/19 20:45:20 jrecker Exp $ |
3 | * |
4 | * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved. |
5 | * |
6 | * The contents of this file, and the files included with this file, |
7 | * are subject to the current version of the RealNetworks Public |
8 | * Source License (the "RPSL") available at |
9 | * http://www.helixcommunity.org/content/rpsl unless you have licensed |
10 | * the file under the current version of the RealNetworks Community |
11 | * Source License (the "RCSL") available at |
12 | * http://www.helixcommunity.org/content/rcsl, in which case the RCSL |
13 | * will apply. You may also obtain the license terms directly from |
14 | * RealNetworks. You may not use this file except in compliance with |
15 | * the RPSL or, if you have a valid RCSL with RealNetworks applicable |
16 | * to this file, the RCSL. Please see the applicable RPSL or RCSL for |
17 | * the rights, obligations and limitations governing use of the |
18 | * contents of the file. |
19 | * |
20 | * This file is part of the Helix DNA Technology. RealNetworks is the |
21 | * developer of the Original Code and owns the copyrights in the |
22 | * portions it created. |
23 | * |
24 | * This file, and the files included with this file, is distributed |
25 | * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY |
26 | * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS |
27 | * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES |
28 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET |
29 | * ENJOYMENT OR NON-INFRINGEMENT. |
30 | * |
31 | * Technology Compatibility Kit Test Suite(s) Location: |
32 | * http://www.helixcommunity.org/content/tck |
33 | * |
34 | * Contributor(s): |
35 | * |
36 | * ***** END LICENSE BLOCK ***** */ |
37 | |
38 | /************************************************************************************** |
39 | * Fixed-point HE-AAC decoder |
40 | * Jon Recker (jrecker@real.com) |
41 | * February 2005 |
42 | * |
43 | * sbrqmf.c - analysis and synthesis QMF filters for SBR |
44 | **************************************************************************************/ |
45 | #include "sbr.h" |
46 | #include "assembly.h" |
47 | #include <stdio.h> |
48 | |
49 | |
/* PreMultiply64() twiddle table
 *   format = Q30
 *   reordered for sequential access: PreMultiply64() reads four consecutive
 *   entries per loop iteration, in this order
 *     (cos + sin), sin   applied to the pair whose real part is at the front of the buffer
 *     (cos + sin), sin   applied to the pair whose real part is at the back of the buffer
 *   16 iterations * 4 entries = 64 entries
 *
 * generated as follows (nmdct = 64, each assignment to x emits one table entry):
 *
 *   for (i = 0; i < 64/4; i++) {
 *     angle = (i + 0.25) * M_PI / nmdct;
 *     x = (cos(angle) + sin(angle));
 *     x =  sin(angle);
 *
 *     angle = (nmdct/2 - 1 - i + 0.25) * M_PI / nmdct;
 *     x = (cos(angle) + sin(angle));
 *     x =  sin(angle);
 *   }
 */
static const int cos4sin4tab64[64] = {
    0x40c7d2bd, 0x00c90e90, 0x424ff28f, 0x3ff4e5e0, 0x43cdd89a, 0x03ecadcf, 0x454149fc, 0x3fc395f9,
    0x46aa0d6d, 0x070de172, 0x4807eb4b, 0x3f6af2e3, 0x495aada2, 0x0a2abb59, 0x4aa22036, 0x3eeb3347,
    0x4bde1089, 0x0d415013, 0x4d0e4de2, 0x3e44a5ef, 0x4e32a956, 0x104fb80e, 0x4f4af5d1, 0x3d77b192,
    0x50570819, 0x135410c3, 0x5156b6d9, 0x3c84d496, 0x5249daa2, 0x164c7ddd, 0x53304df6, 0x3b6ca4c4,
    0x5409ed4b, 0x19372a64, 0x54d69714, 0x3a2fcee8, 0x55962bc0, 0x1c1249d8, 0x56488dc5, 0x38cf1669,
    0x56eda1a0, 0x1edc1953, 0x57854ddd, 0x374b54ce, 0x580f7b19, 0x2192e09b, 0x588c1404, 0x35a5793c,
    0x58fb0568, 0x2434f332, 0x595c3e2a, 0x33de87de, 0x59afaf4c, 0x26c0b162, 0x59f54bee, 0x31f79948,
    0x5a2d0957, 0x29348937, 0x5a56deec, 0x2ff1d9c7, 0x5a72c63b, 0x2b8ef77d, 0x5a80baf6, 0x2dce88aa,
};
74 | |
/* PostMultiply64() twiddle table
 *   format = Q30
 *   reordered for sequential access: stored as consecutive (cos + sin), sin pairs,
 *   17 pairs = 34 entries
 *   PostMultiply64() loads one pair before its loop and one more pair per loop
 *   iteration, so the table covers at most 16 iterations (64 output samples)
 *
 * generated as follows (each assignment to x emits one table entry):
 *
 *   for (i = 0; i <= (32/2); i++) {
 *     angle = i * M_PI / 64;
 *     x = (cos(angle) + sin(angle));
 *     x =  sin(angle);
 *   }
 */
static const int cos1sin1tab64[34] = {
    0x40000000, 0x00000000, 0x43103085, 0x0323ecbe, 0x45f704f7, 0x0645e9af, 0x48b2b335, 0x09640837,
    0x4b418bbe, 0x0c7c5c1e, 0x4da1fab5, 0x0f8cfcbe, 0x4fd288dc, 0x1294062f, 0x51d1dc80, 0x158f9a76,
    0x539eba45, 0x187de2a7, 0x553805f2, 0x1b5d100a, 0x569cc31b, 0x1e2b5d38, 0x57cc15bc, 0x20e70f32,
    0x58c542c5, 0x238e7673, 0x5987b08a, 0x261feffa, 0x5a12e720, 0x2899e64a, 0x5a6690ae, 0x2afad269,
    0x5a82799a, 0x2d413ccd,
};
92 | |
93 | /************************************************************************************** |
94 | * Function: PreMultiply64 |
95 | * |
96 | * Description: pre-twiddle stage of 64-point DCT-IV |
97 | * |
98 | * Inputs: buffer of 64 samples |
99 | * |
100 | * Outputs: processed samples in same buffer |
101 | * |
102 | * Return: none |
103 | * |
104 | * Notes: minimum 1 GB in, 2 GB out, gains 2 int bits |
105 | * gbOut = gbIn + 1 |
106 | * output is limited to sqrt(2)/2 plus GB in full GB |
107 | * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add |
108 | **************************************************************************************/ |
109 | static void PreMultiply64(int *zbuf1) |
110 | { |
111 | int i, ar1, ai1, ar2, ai2, z1, z2; |
112 | int t, cms2, cps2a, sin2a, cps2b, sin2b; |
113 | int *zbuf2; |
114 | const int *csptr; |
115 | |
116 | zbuf2 = zbuf1 + 64 - 1; |
117 | csptr = cos4sin4tab64; |
118 | |
119 | /* whole thing should fit in registers - verify that compiler does this */ |
120 | for (i = 64 >> 2; i != 0; i--) { |
121 | /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */ |
122 | cps2a = *csptr++; |
123 | sin2a = *csptr++; |
124 | cps2b = *csptr++; |
125 | sin2b = *csptr++; |
126 | |
127 | ar1 = *(zbuf1 + 0); |
128 | ai2 = *(zbuf1 + 1); |
129 | ai1 = *(zbuf2 + 0); |
130 | ar2 = *(zbuf2 - 1); |
131 | |
132 | /* gain 2 ints bit from MULSHIFT32 by Q30 |
133 | * max per-sample gain (ignoring implicit scaling) = MAX(sin(angle)+cos(angle)) = 1.414 |
134 | * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from |
135 | * extra sign bits, and eat one in adding |
136 | */ |
137 | t = MULSHIFT32(sin2a, ar1 + ai1); |
138 | z2 = MULSHIFT32(cps2a, ai1) - t; |
139 | cms2 = cps2a - 2 * sin2a; |
140 | z1 = MULSHIFT32(cms2, ar1) + t; |
141 | *zbuf1++ = z1; /* cos*ar1 + sin*ai1 */ |
142 | *zbuf1++ = z2; /* cos*ai1 - sin*ar1 */ |
143 | |
144 | t = MULSHIFT32(sin2b, ar2 + ai2); |
145 | z2 = MULSHIFT32(cps2b, ai2) - t; |
146 | cms2 = cps2b - 2 * sin2b; |
147 | z1 = MULSHIFT32(cms2, ar2) + t; |
148 | *zbuf2-- = z2; /* cos*ai2 - sin*ar2 */ |
149 | *zbuf2-- = z1; /* cos*ar2 + sin*ai2 */ |
150 | } |
151 | } |
152 | |
153 | /************************************************************************************** |
154 | * Function: PostMultiply64 |
155 | * |
156 | * Description: post-twiddle stage of 64-point type-IV DCT |
157 | * |
158 | * Inputs: buffer of 64 samples |
159 | * number of output samples to calculate |
160 | * |
161 | * Outputs: processed samples in same buffer |
162 | * |
163 | * Return: none |
164 | * |
165 | * Notes: minimum 1 GB in, 2 GB out, gains 2 int bits |
166 | * gbOut = gbIn + 1 |
167 | * output is limited to sqrt(2)/2 plus GB in full GB |
168 | * nSampsOut is rounded up to next multiple of 4, since we calculate |
169 | * 4 samples per loop |
170 | **************************************************************************************/ |
171 | static void PostMultiply64(int *fft1, int nSampsOut) |
172 | { |
173 | int i, ar1, ai1, ar2, ai2; |
174 | int t, cms2, cps2, sin2; |
175 | int *fft2; |
176 | const int *csptr; |
177 | |
178 | csptr = cos1sin1tab64; |
179 | fft2 = fft1 + 64 - 1; |
180 | |
181 | /* load coeffs for first pass |
182 | * cps2 = (cos+sin)/2, sin2 = sin/2, cms2 = (cos-sin)/2 |
183 | */ |
184 | cps2 = *csptr++; |
185 | sin2 = *csptr++; |
186 | cms2 = cps2 - 2 * sin2; |
187 | |
188 | for (i = (nSampsOut + 3) >> 2; i != 0; i--) { |
189 | ar1 = *(fft1 + 0); |
190 | ai1 = *(fft1 + 1); |
191 | ar2 = *(fft2 - 1); |
192 | ai2 = *(fft2 + 0); |
193 | |
194 | /* gain 2 int bits (multiplying by Q30), max gain = sqrt(2) */ |
195 | t = MULSHIFT32(sin2, ar1 + ai1); |
196 | *fft2-- = t - MULSHIFT32(cps2, ai1); |
197 | *fft1++ = t + MULSHIFT32(cms2, ar1); |
198 | |
199 | cps2 = *csptr++; |
200 | sin2 = *csptr++; |
201 | |
202 | ai2 = -ai2; |
203 | t = MULSHIFT32(sin2, ar2 + ai2); |
204 | *fft2-- = t - MULSHIFT32(cps2, ai2); |
205 | cms2 = cps2 - 2 * sin2; |
206 | *fft1++ = t + MULSHIFT32(cms2, ar2); |
207 | } |
208 | } |
209 | |
210 | /************************************************************************************** |
211 | * Function: QMFAnalysisConv |
212 | * |
213 | * Description: convolution kernel for analysis QMF |
214 | * |
215 | * Inputs: pointer to coefficient table, reordered for sequential access |
216 | * delay buffer of size 32*10 = 320 real-valued PCM samples |
217 | * index for delay ring buffer (range = [0, 9]) |
218 | * |
219 | * Outputs: 64 consecutive 32-bit samples |
220 | * |
221 | * Return: none |
222 | * |
223 | * Notes: this is carefully written to be efficient on ARM |
224 | * use the assembly code version in sbrqmfak.s when building for ARM! |
225 | **************************************************************************************/ |
226 | |
227 | #ifdef __cplusplus |
228 | extern "C" |
229 | #endif |
230 | void QMFAnalysisConv(int *cTab, int *delay, int dIdx, int *uBuf) |
231 | { |
232 | int k, dOff; |
233 | int *cPtr0, *cPtr1; |
234 | U64 u64lo, u64hi; |
235 | |
236 | dOff = dIdx * 32 + 31; |
237 | cPtr0 = cTab; |
238 | cPtr1 = cTab + 33 * 5 - 1; |
239 | |
240 | /* special first pass since we need to flip sign to create cTab[384], cTab[512] */ |
241 | u64lo.w64 = 0; |
242 | u64hi.w64 = 0; |
243 | u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); |
244 | dOff -= 32; |
245 | if (dOff < 0) { |
246 | dOff += 320; |
247 | } |
248 | u64hi.w64 = MADD64(u64hi.w64, *cPtr0++, delay[dOff]); |
249 | dOff -= 32; |
250 | if (dOff < 0) { |
251 | dOff += 320; |
252 | } |
253 | u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); |
254 | dOff -= 32; |
255 | if (dOff < 0) { |
256 | dOff += 320; |
257 | } |
258 | u64hi.w64 = MADD64(u64hi.w64, *cPtr0++, delay[dOff]); |
259 | dOff -= 32; |
260 | if (dOff < 0) { |
261 | dOff += 320; |
262 | } |
263 | u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); |
264 | dOff -= 32; |
265 | if (dOff < 0) { |
266 | dOff += 320; |
267 | } |
268 | u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); |
269 | dOff -= 32; |
270 | if (dOff < 0) { |
271 | dOff += 320; |
272 | } |
273 | u64lo.w64 = MADD64(u64lo.w64, -(*cPtr1--), delay[dOff]); |
274 | dOff -= 32; |
275 | if (dOff < 0) { |
276 | dOff += 320; |
277 | } |
278 | u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); |
279 | dOff -= 32; |
280 | if (dOff < 0) { |
281 | dOff += 320; |
282 | } |
283 | u64lo.w64 = MADD64(u64lo.w64, -(*cPtr1--), delay[dOff]); |
284 | dOff -= 32; |
285 | if (dOff < 0) { |
286 | dOff += 320; |
287 | } |
288 | u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); |
289 | dOff -= 32; |
290 | if (dOff < 0) { |
291 | dOff += 320; |
292 | } |
293 | |
294 | uBuf[0] = u64lo.r.hi32; |
295 | uBuf[32] = u64hi.r.hi32; |
296 | uBuf++; |
297 | dOff--; |
298 | |
299 | /* max gain for any sample in uBuf, after scaling by cTab, ~= 0.99 |
300 | * so we can just sum the uBuf values with no overflow problems |
301 | */ |
302 | for (k = 1; k <= 31; k++) { |
303 | u64lo.w64 = 0; |
304 | u64hi.w64 = 0; |
305 | u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); |
306 | dOff -= 32; |
307 | if (dOff < 0) { |
308 | dOff += 320; |
309 | } |
310 | u64hi.w64 = MADD64(u64hi.w64, *cPtr0++, delay[dOff]); |
311 | dOff -= 32; |
312 | if (dOff < 0) { |
313 | dOff += 320; |
314 | } |
315 | u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); |
316 | dOff -= 32; |
317 | if (dOff < 0) { |
318 | dOff += 320; |
319 | } |
320 | u64hi.w64 = MADD64(u64hi.w64, *cPtr0++, delay[dOff]); |
321 | dOff -= 32; |
322 | if (dOff < 0) { |
323 | dOff += 320; |
324 | } |
325 | u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); |
326 | dOff -= 32; |
327 | if (dOff < 0) { |
328 | dOff += 320; |
329 | } |
330 | u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); |
331 | dOff -= 32; |
332 | if (dOff < 0) { |
333 | dOff += 320; |
334 | } |
335 | u64lo.w64 = MADD64(u64lo.w64, *cPtr1--, delay[dOff]); |
336 | dOff -= 32; |
337 | if (dOff < 0) { |
338 | dOff += 320; |
339 | } |
340 | u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); |
341 | dOff -= 32; |
342 | if (dOff < 0) { |
343 | dOff += 320; |
344 | } |
345 | u64lo.w64 = MADD64(u64lo.w64, *cPtr1--, delay[dOff]); |
346 | dOff -= 32; |
347 | if (dOff < 0) { |
348 | dOff += 320; |
349 | } |
350 | u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); |
351 | dOff -= 32; |
352 | if (dOff < 0) { |
353 | dOff += 320; |
354 | } |
355 | |
356 | uBuf[0] = u64lo.r.hi32; |
357 | uBuf[32] = u64hi.r.hi32; |
358 | uBuf++; |
359 | dOff--; |
360 | } |
361 | } |
362 | |
363 | /************************************************************************************** |
364 | * Function: QMFAnalysis |
365 | * |
366 | * Description: 32-subband analysis QMF (4.6.18.4.1) |
367 | * |
368 | * Inputs: 32 consecutive samples of decoded 32-bit PCM, format = Q(fBitsIn) |
369 | * delay buffer of size 32*10 = 320 PCM samples |
370 | * number of fraction bits in input PCM |
371 | * index for delay ring buffer (range = [0, 9]) |
372 | * number of subbands to calculate (range = [0, 32]) |
373 | * |
374 | * Outputs: qmfaBands complex subband samples, format = Q(FBITS_OUT_QMFA) |
375 | * updated delay buffer |
376 | * updated delay index |
377 | * |
378 | * Return: guard bit mask |
379 | * |
380 | * Notes: output stored as RE{X0}, IM{X0}, RE{X1}, IM{X1}, ... RE{X31}, IM{X31} |
381 | * output stored in int buffer of size 64*2 = 128 |
382 | * (zero-filled from XBuf[2*qmfaBands] to XBuf[127]) |
383 | **************************************************************************************/ |
384 | int QMFAnalysis(int *inbuf, int *delay, int *XBuf, int fBitsIn, int *delayIdx, int qmfaBands) |
385 | { |
386 | int n, y, shift, gbMask; |
387 | int *delayPtr, *uBuf, *tBuf; |
388 | |
389 | /* use XBuf[128] as temp buffer for reordering */ |
390 | uBuf = XBuf; /* first 64 samples */ |
391 | tBuf = XBuf + 64; /* second 64 samples */ |
392 | |
393 | /* overwrite oldest PCM with new PCM |
394 | * delay[n] has 1 GB after shifting (either << or >>) |
395 | */ |
396 | delayPtr = delay + (*delayIdx * 32); |
397 | if (fBitsIn > FBITS_IN_QMFA) { |
398 | shift = MIN(fBitsIn - FBITS_IN_QMFA, 31); |
399 | for (n = 32; n != 0; n--) { |
400 | y = (*inbuf) >> shift; |
401 | inbuf++; |
402 | *delayPtr++ = y; |
403 | } |
404 | } else { |
405 | shift = MIN(FBITS_IN_QMFA - fBitsIn, 30); |
406 | for (n = 32; n != 0; n--) { |
407 | y = *inbuf++; |
408 | CLIP_2N_SHIFT30(y, shift); |
409 | *delayPtr++ = y; |
410 | } |
411 | } |
412 | |
413 | QMFAnalysisConv((int *)cTabA, delay, *delayIdx, uBuf); |
414 | |
415 | /* uBuf has at least 2 GB right now (1 from clipping to Q(FBITS_IN_QMFA), one from |
416 | * the scaling by cTab (MULSHIFT32(*delayPtr--, *cPtr++), with net gain of < 1.0) |
417 | * TODO - fuse with QMFAnalysisConv to avoid separate reordering |
418 | */ |
419 | tBuf[2 * 0 + 0] = uBuf[0]; |
420 | tBuf[2 * 0 + 1] = uBuf[1]; |
421 | for (n = 1; n < 31; n++) { |
422 | tBuf[2 * n + 0] = -uBuf[64 - n]; |
423 | tBuf[2 * n + 1] = uBuf[n + 1]; |
424 | } |
425 | tBuf[2 * 31 + 1] = uBuf[32]; |
426 | tBuf[2 * 31 + 0] = -uBuf[33]; |
427 | |
428 | /* fast in-place DCT-IV - only need 2*qmfaBands output samples */ |
429 | PreMultiply64(tBuf); /* 2 GB in, 3 GB out */ |
430 | FFT32C(tBuf); /* 3 GB in, 1 GB out */ |
431 | PostMultiply64(tBuf, qmfaBands * 2); /* 1 GB in, 2 GB out */ |
432 | |
433 | /* TODO - roll into PostMultiply (if enough registers) */ |
434 | gbMask = 0; |
435 | for (n = 0; n < qmfaBands; n++) { |
436 | XBuf[2 * n + 0] = tBuf[ n + 0]; /* implicit scaling of 2 in our output Q format */ |
437 | gbMask |= FASTABS(XBuf[2 * n + 0]); |
438 | XBuf[2 * n + 1] = -tBuf[63 - n]; |
439 | gbMask |= FASTABS(XBuf[2 * n + 1]); |
440 | } |
441 | |
442 | /* fill top section with zeros for HF generation */ |
443 | for (; n < 64; n++) { |
444 | XBuf[2 * n + 0] = 0; |
445 | XBuf[2 * n + 1] = 0; |
446 | } |
447 | |
448 | *delayIdx = (*delayIdx == NUM_QMF_DELAY_BUFS - 1 ? 0 : *delayIdx + 1); |
449 | |
450 | /* minimum of 2 GB in output */ |
451 | return gbMask; |
452 | } |
453 | |
/* lose FBITS_LOST_DCT4_64 in DCT4, gain 6 for implicit scaling by 1/64, lose 1 for cTab multiply (Q31) */
#define FBITS_OUT_QMFS (FBITS_IN_QMFS - FBITS_LOST_DCT4_64 + 6 - 1)
/* half of one output LSB: QMFSynthesisConv() adds this before shifting right by
 * FBITS_OUT_QMFS, so the conversion to 16-bit PCM rounds instead of truncating
 */
#define RND_VAL (1 << (FBITS_OUT_QMFS-1))
457 | |
458 | /************************************************************************************** |
459 | * Function: QMFSynthesisConv |
460 | * |
461 | * Description: final convolution kernel for synthesis QMF |
462 | * |
463 | * Inputs: pointer to coefficient table, reordered for sequential access |
464 | * delay buffer of size 64*10 = 640 complex samples (1280 ints) |
465 | * index for delay ring buffer (range = [0, 9]) |
466 | * number of QMF subbands to process (range = [0, 64]) |
467 | * number of channels |
468 | * |
469 | * Outputs: 64 consecutive 16-bit PCM samples, interleaved by factor of nChans |
470 | * |
471 | * Return: none |
472 | * |
473 | * Notes: this is carefully written to be efficient on ARM |
474 | * use the assembly code version in sbrqmfsk.s when building for ARM! |
475 | **************************************************************************************/ |
476 | |
477 | #ifdef __cplusplus |
478 | extern "C" |
479 | #endif |
480 | void QMFSynthesisConv(int *cPtr, int *delay, int dIdx, short *outbuf, int nChans) |
481 | { |
482 | int k, dOff0, dOff1; |
483 | U64 sum64; |
484 | |
485 | dOff0 = (dIdx) * 128; |
486 | dOff1 = dOff0 - 1; |
487 | if (dOff1 < 0) { |
488 | dOff1 += 1280; |
489 | } |
490 | |
491 | /* scaling note: total gain of coefs (cPtr[0]-cPtr[9] for any k) is < 2.0, so 1 GB in delay values is adequate */ |
492 | for (k = 0; k <= 63; k++) { |
493 | sum64.w64 = 0; |
494 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); |
495 | dOff0 -= 256; |
496 | if (dOff0 < 0) { |
497 | dOff0 += 1280; |
498 | } |
499 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); |
500 | dOff1 -= 256; |
501 | if (dOff1 < 0) { |
502 | dOff1 += 1280; |
503 | } |
504 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); |
505 | dOff0 -= 256; |
506 | if (dOff0 < 0) { |
507 | dOff0 += 1280; |
508 | } |
509 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); |
510 | dOff1 -= 256; |
511 | if (dOff1 < 0) { |
512 | dOff1 += 1280; |
513 | } |
514 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); |
515 | dOff0 -= 256; |
516 | if (dOff0 < 0) { |
517 | dOff0 += 1280; |
518 | } |
519 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); |
520 | dOff1 -= 256; |
521 | if (dOff1 < 0) { |
522 | dOff1 += 1280; |
523 | } |
524 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); |
525 | dOff0 -= 256; |
526 | if (dOff0 < 0) { |
527 | dOff0 += 1280; |
528 | } |
529 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); |
530 | dOff1 -= 256; |
531 | if (dOff1 < 0) { |
532 | dOff1 += 1280; |
533 | } |
534 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); |
535 | dOff0 -= 256; |
536 | if (dOff0 < 0) { |
537 | dOff0 += 1280; |
538 | } |
539 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); |
540 | dOff1 -= 256; |
541 | if (dOff1 < 0) { |
542 | dOff1 += 1280; |
543 | } |
544 | |
545 | dOff0++; |
546 | dOff1--; |
547 | *outbuf = CLIPTOSHORT((sum64.r.hi32 + RND_VAL) >> FBITS_OUT_QMFS); |
548 | outbuf += nChans; |
549 | } |
550 | } |
551 | |
552 | /************************************************************************************** |
553 | * Function: QMFSynthesis |
554 | * |
555 | * Description: 64-subband synthesis QMF (4.6.18.4.2) |
556 | * |
557 | * Inputs: 64 consecutive complex subband QMF samples, format = Q(FBITS_IN_QMFS) |
558 | * delay buffer of size 64*10 = 640 complex samples (1280 ints) |
559 | * index for delay ring buffer (range = [0, 9]) |
560 | * number of QMF subbands to process (range = [0, 64]) |
561 | * number of channels |
562 | * |
563 | * Outputs: 64 consecutive 16-bit PCM samples, interleaved by factor of nChans |
564 | * updated delay buffer |
565 | * updated delay index |
566 | * |
567 | * Return: none |
568 | * |
569 | * Notes: assumes MIN_GBITS_IN_QMFS guard bits in input, either from |
570 | * QMFAnalysis (if upsampling only) or from MapHF (if SBR on) |
571 | **************************************************************************************/ |
572 | void QMFSynthesis(int *inbuf, int *delay, int *delayIdx, int qmfsBands, short *outbuf, int nChans) |
573 | { |
574 | int n, a0, a1, b0, b1, dOff0, dOff1, dIdx; |
575 | int *tBufLo, *tBufHi; |
576 | |
577 | dIdx = *delayIdx; |
578 | tBufLo = delay + dIdx * 128 + 0; |
579 | tBufHi = delay + dIdx * 128 + 127; |
580 | |
581 | /* reorder inputs to DCT-IV, only use first qmfsBands (complex) samples |
582 | * TODO - fuse with PreMultiply64 to avoid separate reordering steps |
583 | */ |
584 | for (n = 0; n < qmfsBands >> 1; n++) { |
585 | a0 = *inbuf++; |
586 | b0 = *inbuf++; |
587 | a1 = *inbuf++; |
588 | b1 = *inbuf++; |
589 | *tBufLo++ = a0; |
590 | *tBufLo++ = a1; |
591 | *tBufHi-- = b0; |
592 | *tBufHi-- = b1; |
593 | } |
594 | if (qmfsBands & 0x01) { |
595 | a0 = *inbuf++; |
596 | b0 = *inbuf++; |
597 | *tBufLo++ = a0; |
598 | *tBufHi-- = b0; |
599 | *tBufLo++ = 0; |
600 | *tBufHi-- = 0; |
601 | n++; |
602 | } |
603 | for (; n < 32; n++) { |
604 | *tBufLo++ = 0; |
605 | *tBufHi-- = 0; |
606 | *tBufLo++ = 0; |
607 | *tBufHi-- = 0; |
608 | } |
609 | |
610 | tBufLo = delay + dIdx * 128 + 0; |
611 | tBufHi = delay + dIdx * 128 + 64; |
612 | |
613 | /* 2 GB in, 3 GB out */ |
614 | PreMultiply64(tBufLo); |
615 | PreMultiply64(tBufHi); |
616 | |
617 | /* 3 GB in, 1 GB out */ |
618 | FFT32C(tBufLo); |
619 | FFT32C(tBufHi); |
620 | |
621 | /* 1 GB in, 2 GB out */ |
622 | PostMultiply64(tBufLo, 64); |
623 | PostMultiply64(tBufHi, 64); |
624 | |
625 | /* could fuse with PostMultiply64 to avoid separate pass */ |
626 | dOff0 = dIdx * 128; |
627 | dOff1 = dIdx * 128 + 64; |
628 | for (n = 32; n != 0; n--) { |
629 | a0 = (*tBufLo++); |
630 | a1 = (*tBufLo++); |
631 | b0 = (*tBufHi++); |
632 | b1 = -(*tBufHi++); |
633 | |
634 | delay[dOff0++] = (b0 - a0); |
635 | delay[dOff0++] = (b1 - a1); |
636 | delay[dOff1++] = (b0 + a0); |
637 | delay[dOff1++] = (b1 + a1); |
638 | } |
639 | |
640 | QMFSynthesisConv((int *)cTabS, delay, dIdx, outbuf, nChans); |
641 | |
642 | *delayIdx = (*delayIdx == NUM_QMF_DELAY_BUFS - 1 ? 0 : *delayIdx + 1); |
643 | } |
644 |