blob: 6a9464a418c9e250849f168dcebe036d38e32d38
1 | /* ***** BEGIN LICENSE BLOCK ***** |
2 | * Source last modified: $Id: sbrqmf.c,v 1.1.2.2 2005/05/19 21:00:01 jrecker Exp $ |
3 | * |
4 | * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved. |
5 | * |
6 | * The contents of this file, and the files included with this file, |
7 | * are subject to the current version of the RealNetworks Public |
8 | * Source License (the "RPSL") available at |
9 | * http://www.helixcommunity.org/content/rpsl unless you have licensed |
10 | * the file under the current version of the RealNetworks Community |
11 | * Source License (the "RCSL") available at |
12 | * http://www.helixcommunity.org/content/rcsl, in which case the RCSL |
13 | * will apply. You may also obtain the license terms directly from |
14 | * RealNetworks. You may not use this file except in compliance with |
15 | * the RPSL or, if you have a valid RCSL with RealNetworks applicable |
16 | * to this file, the RCSL. Please see the applicable RPSL or RCSL for |
17 | * the rights, obligations and limitations governing use of the |
18 | * contents of the file. |
19 | * |
20 | * This file is part of the Helix DNA Technology. RealNetworks is the |
21 | * developer of the Original Code and owns the copyrights in the |
22 | * portions it created. |
23 | * |
24 | * This file, and the files included with this file, is distributed |
25 | * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY |
26 | * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS |
27 | * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES |
28 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET |
29 | * ENJOYMENT OR NON-INFRINGEMENT. |
30 | * |
31 | * Technology Compatibility Kit Test Suite(s) Location: |
32 | * http://www.helixcommunity.org/content/tck |
33 | * |
34 | * Contributor(s): |
35 | * |
36 | * ***** END LICENSE BLOCK ***** */ |
37 | |
38 | /************************************************************************************** |
39 | * Fixed-point HE-AAC decoder |
40 | * Jon Recker (jrecker@real.com) |
41 | * February 2005 |
42 | * |
43 | * sbrqmf.c - analysis and synthesis QMF filters for SBR |
44 | **************************************************************************************/ |
45 | |
46 | #include "sbr.h" |
47 | #include "assembly.h" |
48 | |
/* Twiddle factors consumed by PreMultiply64()
 *   format = Q30 (0x40000000 == 1.0)
 *   stored in the exact order the loop reads them: four words per pass
 *
 * Generator, with N = 64 (the DCT-IV length):
 *
 *   for (i = 0; i < N/4; i++) {
 *     angle = (i + 0.25) * M_PI / N;
 *     emit(cos(angle) + sin(angle));
 *     emit(sin(angle));
 *
 *     angle = (N/2 - 1 - i + 0.25) * M_PI / N;
 *     emit(cos(angle) + sin(angle));
 *     emit(sin(angle));
 *   }
 *
 * Each row below is one value of i:
 *   {cos+sin, sin} for the low angle, then {cos+sin, sin} for the mirrored high angle
 */
static const int cos4sin4tab64[64] = {
    0x40c7d2bd, 0x00c90e90, 0x424ff28f, 0x3ff4e5e0,
    0x43cdd89a, 0x03ecadcf, 0x454149fc, 0x3fc395f9,
    0x46aa0d6d, 0x070de172, 0x4807eb4b, 0x3f6af2e3,
    0x495aada2, 0x0a2abb59, 0x4aa22036, 0x3eeb3347,
    0x4bde1089, 0x0d415013, 0x4d0e4de2, 0x3e44a5ef,
    0x4e32a956, 0x104fb80e, 0x4f4af5d1, 0x3d77b192,
    0x50570819, 0x135410c3, 0x5156b6d9, 0x3c84d496,
    0x5249daa2, 0x164c7ddd, 0x53304df6, 0x3b6ca4c4,
    0x5409ed4b, 0x19372a64, 0x54d69714, 0x3a2fcee8,
    0x55962bc0, 0x1c1249d8, 0x56488dc5, 0x38cf1669,
    0x56eda1a0, 0x1edc1953, 0x57854ddd, 0x374b54ce,
    0x580f7b19, 0x2192e09b, 0x588c1404, 0x35a5793c,
    0x58fb0568, 0x2434f332, 0x595c3e2a, 0x33de87de,
    0x59afaf4c, 0x26c0b162, 0x59f54bee, 0x31f79948,
    0x5a2d0957, 0x29348937, 0x5a56deec, 0x2ff1d9c7,
    0x5a72c63b, 0x2b8ef77d, 0x5a80baf6, 0x2dce88aa,
};
73 | |
/* Twiddle factors consumed by PostMultiply64()
 *   format = Q30 (0x40000000 == 1.0)
 *   stored in the exact order the loop reads them
 *
 * Generator:
 *
 *   for (i = 0; i <= (32/2); i++) {
 *     angle = i * M_PI / 64;
 *     emit(cos(angle) + sin(angle));
 *     emit(sin(angle));
 *   }
 *
 * One {cos+sin, sin} pair per row, 17 pairs in total: PostMultiply64() loads one
 *   pair before its loop and one more on every pass (at most 16 passes)
 */
static const int cos1sin1tab64[34] = {
    0x40000000, 0x00000000,
    0x43103085, 0x0323ecbe,
    0x45f704f7, 0x0645e9af,
    0x48b2b335, 0x09640837,
    0x4b418bbe, 0x0c7c5c1e,
    0x4da1fab5, 0x0f8cfcbe,
    0x4fd288dc, 0x1294062f,
    0x51d1dc80, 0x158f9a76,
    0x539eba45, 0x187de2a7,
    0x553805f2, 0x1b5d100a,
    0x569cc31b, 0x1e2b5d38,
    0x57cc15bc, 0x20e70f32,
    0x58c542c5, 0x238e7673,
    0x5987b08a, 0x261feffa,
    0x5a12e720, 0x2899e64a,
    0x5a6690ae, 0x2afad269,
    0x5a82799a, 0x2d413ccd,
};
91 | |
92 | /************************************************************************************** |
93 | * Function: PreMultiply64 |
94 | * |
95 | * Description: pre-twiddle stage of 64-point DCT-IV |
96 | * |
97 | * Inputs: buffer of 64 samples |
98 | * |
99 | * Outputs: processed samples in same buffer |
100 | * |
101 | * Return: none |
102 | * |
103 | * Notes: minimum 1 GB in, 2 GB out, gains 2 int bits |
104 | * gbOut = gbIn + 1 |
105 | * output is limited to sqrt(2)/2 plus GB in full GB |
106 | * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add |
107 | **************************************************************************************/ |
108 | static void PreMultiply64(int *zbuf1) |
109 | { |
110 | int i, ar1, ai1, ar2, ai2, z1, z2; |
111 | int t, cms2, cps2a, sin2a, cps2b, sin2b; |
112 | int *zbuf2; |
113 | const int *csptr; |
114 | |
115 | zbuf2 = zbuf1 + 64 - 1; |
116 | csptr = cos4sin4tab64; |
117 | |
118 | /* whole thing should fit in registers - verify that compiler does this */ |
119 | for (i = 64 >> 2; i != 0; i--) { |
120 | /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */ |
121 | cps2a = *csptr++; |
122 | sin2a = *csptr++; |
123 | cps2b = *csptr++; |
124 | sin2b = *csptr++; |
125 | |
126 | ar1 = *(zbuf1 + 0); |
127 | ai2 = *(zbuf1 + 1); |
128 | ai1 = *(zbuf2 + 0); |
129 | ar2 = *(zbuf2 - 1); |
130 | |
131 | /* gain 2 ints bit from MULSHIFT32 by Q30 |
132 | * max per-sample gain (ignoring implicit scaling) = MAX(sin(angle)+cos(angle)) = 1.414 |
133 | * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from |
134 | * extra sign bits, and eat one in adding |
135 | */ |
136 | t = MULSHIFT32(sin2a, ar1 + ai1); |
137 | z2 = MULSHIFT32(cps2a, ai1) - t; |
138 | cms2 = cps2a - 2 * sin2a; |
139 | z1 = MULSHIFT32(cms2, ar1) + t; |
140 | *zbuf1++ = z1; /* cos*ar1 + sin*ai1 */ |
141 | *zbuf1++ = z2; /* cos*ai1 - sin*ar1 */ |
142 | |
143 | t = MULSHIFT32(sin2b, ar2 + ai2); |
144 | z2 = MULSHIFT32(cps2b, ai2) - t; |
145 | cms2 = cps2b - 2 * sin2b; |
146 | z1 = MULSHIFT32(cms2, ar2) + t; |
147 | *zbuf2-- = z2; /* cos*ai2 - sin*ar2 */ |
148 | *zbuf2-- = z1; /* cos*ar2 + sin*ai2 */ |
149 | } |
150 | } |
151 | |
152 | /************************************************************************************** |
153 | * Function: PostMultiply64 |
154 | * |
155 | * Description: post-twiddle stage of 64-point type-IV DCT |
156 | * |
157 | * Inputs: buffer of 64 samples |
158 | * number of output samples to calculate |
159 | * |
160 | * Outputs: processed samples in same buffer |
161 | * |
162 | * Return: none |
163 | * |
164 | * Notes: minimum 1 GB in, 2 GB out, gains 2 int bits |
165 | * gbOut = gbIn + 1 |
166 | * output is limited to sqrt(2)/2 plus GB in full GB |
167 | * nSampsOut is rounded up to next multiple of 4, since we calculate |
168 | * 4 samples per loop |
169 | **************************************************************************************/ |
170 | static void PostMultiply64(int *fft1, int nSampsOut) |
171 | { |
172 | int i, ar1, ai1, ar2, ai2; |
173 | int t, cms2, cps2, sin2; |
174 | int *fft2; |
175 | const int *csptr; |
176 | |
177 | csptr = cos1sin1tab64; |
178 | fft2 = fft1 + 64 - 1; |
179 | |
180 | /* load coeffs for first pass |
181 | * cps2 = (cos+sin)/2, sin2 = sin/2, cms2 = (cos-sin)/2 |
182 | */ |
183 | cps2 = *csptr++; |
184 | sin2 = *csptr++; |
185 | cms2 = cps2 - 2 * sin2; |
186 | |
187 | for (i = (nSampsOut + 3) >> 2; i != 0; i--) { |
188 | ar1 = *(fft1 + 0); |
189 | ai1 = *(fft1 + 1); |
190 | ar2 = *(fft2 - 1); |
191 | ai2 = *(fft2 + 0); |
192 | |
193 | /* gain 2 int bits (multiplying by Q30), max gain = sqrt(2) */ |
194 | t = MULSHIFT32(sin2, ar1 + ai1); |
195 | *fft2-- = t - MULSHIFT32(cps2, ai1); |
196 | *fft1++ = t + MULSHIFT32(cms2, ar1); |
197 | |
198 | cps2 = *csptr++; |
199 | sin2 = *csptr++; |
200 | |
201 | ai2 = -ai2; |
202 | t = MULSHIFT32(sin2, ar2 + ai2); |
203 | *fft2-- = t - MULSHIFT32(cps2, ai2); |
204 | cms2 = cps2 - 2 * sin2; |
205 | *fft1++ = t + MULSHIFT32(cms2, ar2); |
206 | } |
207 | } |
208 | |
209 | /************************************************************************************** |
210 | * Function: QMFAnalysisConv |
211 | * |
212 | * Description: convolution kernel for analysis QMF |
213 | * |
214 | * Inputs: pointer to coefficient table, reordered for sequential access |
215 | * delay buffer of size 32*10 = 320 real-valued PCM samples |
216 | * index for delay ring buffer (range = [0, 9]) |
217 | * |
218 | * Outputs: 64 consecutive 32-bit samples |
219 | * |
220 | * Return: none |
221 | * |
222 | * Notes: this is carefully written to be efficient on ARM |
223 | * use the assembly code version in sbrqmfak.s when building for ARM! |
224 | **************************************************************************************/ |
225 | #if 0// (defined (__arm) && defined (__ARMCC_VERSION)) || (defined (_WIN32) && defined (_WIN32_WCE) && defined (ARM)) || (defined(__GNUC__) && defined(__arm__)) |
226 | #ifdef __cplusplus |
227 | extern "C" |
228 | #endif |
229 | void QMFAnalysisConv(int *cTab, int *delay, int dIdx, int *uBuf); |
230 | #else |
231 | void QMFAnalysisConv(int *cTab, int *delay, int dIdx, int *uBuf) |
232 | { |
233 | int k, dOff; |
234 | int *cPtr0, *cPtr1; |
235 | U64 u64lo, u64hi; |
236 | |
237 | dOff = dIdx * 32 + 31; |
238 | cPtr0 = cTab; |
239 | cPtr1 = cTab + 33 * 5 - 1; |
240 | |
241 | /* special first pass since we need to flip sign to create cTab[384], cTab[512] */ |
242 | u64lo.w64 = 0; |
243 | u64hi.w64 = 0; |
244 | u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); |
245 | dOff -= 32; |
246 | if (dOff < 0) { |
247 | dOff += 320; |
248 | } |
249 | u64hi.w64 = MADD64(u64hi.w64, *cPtr0++, delay[dOff]); |
250 | dOff -= 32; |
251 | if (dOff < 0) { |
252 | dOff += 320; |
253 | } |
254 | u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); |
255 | dOff -= 32; |
256 | if (dOff < 0) { |
257 | dOff += 320; |
258 | } |
259 | u64hi.w64 = MADD64(u64hi.w64, *cPtr0++, delay[dOff]); |
260 | dOff -= 32; |
261 | if (dOff < 0) { |
262 | dOff += 320; |
263 | } |
264 | u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); |
265 | dOff -= 32; |
266 | if (dOff < 0) { |
267 | dOff += 320; |
268 | } |
269 | u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); |
270 | dOff -= 32; |
271 | if (dOff < 0) { |
272 | dOff += 320; |
273 | } |
274 | u64lo.w64 = MADD64(u64lo.w64, -(*cPtr1--), delay[dOff]); |
275 | dOff -= 32; |
276 | if (dOff < 0) { |
277 | dOff += 320; |
278 | } |
279 | u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); |
280 | dOff -= 32; |
281 | if (dOff < 0) { |
282 | dOff += 320; |
283 | } |
284 | u64lo.w64 = MADD64(u64lo.w64, -(*cPtr1--), delay[dOff]); |
285 | dOff -= 32; |
286 | if (dOff < 0) { |
287 | dOff += 320; |
288 | } |
289 | u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); |
290 | dOff -= 32; |
291 | if (dOff < 0) { |
292 | dOff += 320; |
293 | } |
294 | |
295 | uBuf[0] = u64lo.r.hi32; |
296 | uBuf[32] = u64hi.r.hi32; |
297 | uBuf++; |
298 | dOff--; |
299 | |
300 | /* max gain for any sample in uBuf, after scaling by cTab, ~= 0.99 |
301 | * so we can just sum the uBuf values with no overflow problems |
302 | */ |
303 | for (k = 1; k <= 31; k++) { |
304 | u64lo.w64 = 0; |
305 | u64hi.w64 = 0; |
306 | u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); |
307 | dOff -= 32; |
308 | if (dOff < 0) { |
309 | dOff += 320; |
310 | } |
311 | u64hi.w64 = MADD64(u64hi.w64, *cPtr0++, delay[dOff]); |
312 | dOff -= 32; |
313 | if (dOff < 0) { |
314 | dOff += 320; |
315 | } |
316 | u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); |
317 | dOff -= 32; |
318 | if (dOff < 0) { |
319 | dOff += 320; |
320 | } |
321 | u64hi.w64 = MADD64(u64hi.w64, *cPtr0++, delay[dOff]); |
322 | dOff -= 32; |
323 | if (dOff < 0) { |
324 | dOff += 320; |
325 | } |
326 | u64lo.w64 = MADD64(u64lo.w64, *cPtr0++, delay[dOff]); |
327 | dOff -= 32; |
328 | if (dOff < 0) { |
329 | dOff += 320; |
330 | } |
331 | u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); |
332 | dOff -= 32; |
333 | if (dOff < 0) { |
334 | dOff += 320; |
335 | } |
336 | u64lo.w64 = MADD64(u64lo.w64, *cPtr1--, delay[dOff]); |
337 | dOff -= 32; |
338 | if (dOff < 0) { |
339 | dOff += 320; |
340 | } |
341 | u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); |
342 | dOff -= 32; |
343 | if (dOff < 0) { |
344 | dOff += 320; |
345 | } |
346 | u64lo.w64 = MADD64(u64lo.w64, *cPtr1--, delay[dOff]); |
347 | dOff -= 32; |
348 | if (dOff < 0) { |
349 | dOff += 320; |
350 | } |
351 | u64hi.w64 = MADD64(u64hi.w64, *cPtr1--, delay[dOff]); |
352 | dOff -= 32; |
353 | if (dOff < 0) { |
354 | dOff += 320; |
355 | } |
356 | |
357 | uBuf[0] = u64lo.r.hi32; |
358 | uBuf[32] = u64hi.r.hi32; |
359 | uBuf++; |
360 | dOff--; |
361 | } |
362 | } |
363 | #endif |
364 | |
365 | /************************************************************************************** |
366 | * Function: QMFAnalysis |
367 | * |
368 | * Description: 32-subband analysis QMF (4.6.18.4.1) |
369 | * |
370 | * Inputs: 32 consecutive samples of decoded 32-bit PCM, format = Q(fBitsIn) |
371 | * delay buffer of size 32*10 = 320 PCM samples |
372 | * number of fraction bits in input PCM |
373 | * index for delay ring buffer (range = [0, 9]) |
374 | * number of subbands to calculate (range = [0, 32]) |
375 | * |
376 | * Outputs: qmfaBands complex subband samples, format = Q(FBITS_OUT_QMFA) |
377 | * updated delay buffer |
378 | * updated delay index |
379 | * |
380 | * Return: guard bit mask |
381 | * |
382 | * Notes: output stored as RE{X0}, IM{X0}, RE{X1}, IM{X1}, ... RE{X31}, IM{X31} |
383 | * output stored in int buffer of size 64*2 = 128 |
384 | * (zero-filled from XBuf[2*qmfaBands] to XBuf[127]) |
385 | **************************************************************************************/ |
386 | int QMFAnalysis(int *inbuf, int *delay, int *XBuf, int fBitsIn, int *delayIdx, int qmfaBands) |
387 | { |
388 | int n, y, shift, gbMask; |
389 | int *delayPtr, *uBuf, *tBuf; |
390 | |
391 | /* use XBuf[128] as temp buffer for reordering */ |
392 | uBuf = XBuf; /* first 64 samples */ |
393 | tBuf = XBuf + 64; /* second 64 samples */ |
394 | |
395 | /* overwrite oldest PCM with new PCM |
396 | * delay[n] has 1 GB after shifting (either << or >>) |
397 | */ |
398 | delayPtr = delay + (*delayIdx * 32); |
399 | if (fBitsIn > FBITS_IN_QMFA) { |
400 | shift = MIN(fBitsIn - FBITS_IN_QMFA, 31); |
401 | for (n = 32; n != 0; n--) { |
402 | y = (*inbuf) >> shift; |
403 | inbuf++; |
404 | *delayPtr++ = y; |
405 | } |
406 | } else { |
407 | shift = MIN(FBITS_IN_QMFA - fBitsIn, 30); |
408 | for (n = 32; n != 0; n--) { |
409 | y = *inbuf++; |
410 | CLIP_2N_SHIFT30(y, shift); |
411 | *delayPtr++ = y; |
412 | } |
413 | } |
414 | |
415 | QMFAnalysisConv((int *)cTabA, delay, *delayIdx, uBuf); |
416 | |
417 | /* uBuf has at least 2 GB right now (1 from clipping to Q(FBITS_IN_QMFA), one from |
418 | * the scaling by cTab (MULSHIFT32(*delayPtr--, *cPtr++), with net gain of < 1.0) |
419 | * TODO - fuse with QMFAnalysisConv to avoid separate reordering |
420 | */ |
421 | tBuf[2 * 0 + 0] = uBuf[0]; |
422 | tBuf[2 * 0 + 1] = uBuf[1]; |
423 | for (n = 1; n < 31; n++) { |
424 | tBuf[2 * n + 0] = -uBuf[64 - n]; |
425 | tBuf[2 * n + 1] = uBuf[n + 1]; |
426 | } |
427 | tBuf[2 * 31 + 1] = uBuf[32]; |
428 | tBuf[2 * 31 + 0] = -uBuf[33]; |
429 | |
430 | /* fast in-place DCT-IV - only need 2*qmfaBands output samples */ |
431 | PreMultiply64(tBuf); /* 2 GB in, 3 GB out */ |
432 | FFT32C(tBuf); /* 3 GB in, 1 GB out */ |
433 | PostMultiply64(tBuf, qmfaBands * 2); /* 1 GB in, 2 GB out */ |
434 | |
435 | /* TODO - roll into PostMultiply (if enough registers) */ |
436 | gbMask = 0; |
437 | for (n = 0; n < qmfaBands; n++) { |
438 | XBuf[2 * n + 0] = tBuf[ n + 0]; /* implicit scaling of 2 in our output Q format */ |
439 | gbMask |= FASTABS(XBuf[2 * n + 0]); |
440 | XBuf[2 * n + 1] = -tBuf[63 - n]; |
441 | gbMask |= FASTABS(XBuf[2 * n + 1]); |
442 | } |
443 | |
444 | /* fill top section with zeros for HF generation */ |
445 | for (; n < 64; n++) { |
446 | XBuf[2 * n + 0] = 0; |
447 | XBuf[2 * n + 1] = 0; |
448 | } |
449 | |
450 | *delayIdx = (*delayIdx == NUM_QMF_DELAY_BUFS - 1 ? 0 : *delayIdx + 1); |
451 | |
452 | /* minimum of 2 GB in output */ |
453 | return gbMask; |
454 | } |
455 | |
/* FBITS_OUT_QMFS: shift applied to the high word of the synthesis accumulator in
 *   QMFSynthesisConv before clipping to 16-bit PCM
 * derivation: start from FBITS_IN_QMFS, lose FBITS_LOST_DCT4_64 in DCT4,
 *   gain 6 for implicit scaling by 1/64, lose 1 for cTab multiply (Q31)
 */
#define FBITS_OUT_QMFS (FBITS_IN_QMFS - FBITS_LOST_DCT4_64 + 6 - 1)
/* half of one output step; added before the right shift by FBITS_OUT_QMFS so the result rounds */
#define RND_VAL (1 << (FBITS_OUT_QMFS-1))
459 | |
460 | /************************************************************************************** |
461 | * Function: QMFSynthesisConv |
462 | * |
463 | * Description: final convolution kernel for synthesis QMF |
464 | * |
465 | * Inputs: pointer to coefficient table, reordered for sequential access |
466 | * delay buffer of size 64*10 = 640 complex samples (1280 ints) |
467 | * index for delay ring buffer (range = [0, 9]) |
468 | * number of QMF subbands to process (range = [0, 64]) |
469 | * number of channels |
470 | * |
471 | * Outputs: 64 consecutive 16-bit PCM samples, interleaved by factor of nChans |
472 | * |
473 | * Return: none |
474 | * |
475 | * Notes: this is carefully written to be efficient on ARM |
476 | * use the assembly code version in sbrqmfsk.s when building for ARM! |
477 | **************************************************************************************/ |
478 | #if 0// (defined (__arm) && defined (__ARMCC_VERSION)) || (defined (_WIN32) && defined (_WIN32_WCE) && defined (ARM)) || (defined(__GNUC__) && defined(__arm__)) |
479 | #ifdef __cplusplus |
480 | extern "C" |
481 | #endif |
482 | void QMFSynthesisConv(int *cPtr, int *delay, int dIdx, short *outbuf, int nChans); |
483 | #else |
484 | void QMFSynthesisConv(int *cPtr, int *delay, int dIdx, short *outbuf, int nChans) |
485 | { |
486 | int k, dOff0, dOff1; |
487 | U64 sum64; |
488 | |
489 | dOff0 = (dIdx) * 128; |
490 | dOff1 = dOff0 - 1; |
491 | if (dOff1 < 0) { |
492 | dOff1 += 1280; |
493 | } |
494 | |
495 | /* scaling note: total gain of coefs (cPtr[0]-cPtr[9] for any k) is < 2.0, so 1 GB in delay values is adequate */ |
496 | for (k = 0; k <= 63; k++) { |
497 | sum64.w64 = 0; |
498 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); |
499 | dOff0 -= 256; |
500 | if (dOff0 < 0) { |
501 | dOff0 += 1280; |
502 | } |
503 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); |
504 | dOff1 -= 256; |
505 | if (dOff1 < 0) { |
506 | dOff1 += 1280; |
507 | } |
508 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); |
509 | dOff0 -= 256; |
510 | if (dOff0 < 0) { |
511 | dOff0 += 1280; |
512 | } |
513 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); |
514 | dOff1 -= 256; |
515 | if (dOff1 < 0) { |
516 | dOff1 += 1280; |
517 | } |
518 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); |
519 | dOff0 -= 256; |
520 | if (dOff0 < 0) { |
521 | dOff0 += 1280; |
522 | } |
523 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); |
524 | dOff1 -= 256; |
525 | if (dOff1 < 0) { |
526 | dOff1 += 1280; |
527 | } |
528 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); |
529 | dOff0 -= 256; |
530 | if (dOff0 < 0) { |
531 | dOff0 += 1280; |
532 | } |
533 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); |
534 | dOff1 -= 256; |
535 | if (dOff1 < 0) { |
536 | dOff1 += 1280; |
537 | } |
538 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff0]); |
539 | dOff0 -= 256; |
540 | if (dOff0 < 0) { |
541 | dOff0 += 1280; |
542 | } |
543 | sum64.w64 = MADD64(sum64.w64, *cPtr++, delay[dOff1]); |
544 | dOff1 -= 256; |
545 | if (dOff1 < 0) { |
546 | dOff1 += 1280; |
547 | } |
548 | |
549 | dOff0++; |
550 | dOff1--; |
551 | *outbuf = CLIPTOSHORT((sum64.r.hi32 + RND_VAL) >> FBITS_OUT_QMFS); |
552 | outbuf += nChans; |
553 | } |
554 | } |
555 | #endif |
556 | |
557 | /************************************************************************************** |
558 | * Function: QMFSynthesis |
559 | * |
560 | * Description: 64-subband synthesis QMF (4.6.18.4.2) |
561 | * |
562 | * Inputs: 64 consecutive complex subband QMF samples, format = Q(FBITS_IN_QMFS) |
563 | * delay buffer of size 64*10 = 640 complex samples (1280 ints) |
564 | * index for delay ring buffer (range = [0, 9]) |
565 | * number of QMF subbands to process (range = [0, 64]) |
566 | * number of channels |
567 | * |
568 | * Outputs: 64 consecutive 16-bit PCM samples, interleaved by factor of nChans |
569 | * updated delay buffer |
570 | * updated delay index |
571 | * |
572 | * Return: none |
573 | * |
574 | * Notes: assumes MIN_GBITS_IN_QMFS guard bits in input, either from |
575 | * QMFAnalysis (if upsampling only) or from MapHF (if SBR on) |
576 | **************************************************************************************/ |
577 | void QMFSynthesis(int *inbuf, int *delay, int *delayIdx, int qmfsBands, short *outbuf, int nChans) |
578 | { |
579 | int n, a0, a1, b0, b1, dOff0, dOff1, dIdx; |
580 | int *tBufLo, *tBufHi; |
581 | |
582 | dIdx = *delayIdx; |
583 | tBufLo = delay + dIdx * 128 + 0; |
584 | tBufHi = delay + dIdx * 128 + 127; |
585 | |
586 | /* reorder inputs to DCT-IV, only use first qmfsBands (complex) samples |
587 | * TODO - fuse with PreMultiply64 to avoid separate reordering steps |
588 | */ |
589 | for (n = 0; n < qmfsBands >> 1; n++) { |
590 | a0 = *inbuf++; |
591 | b0 = *inbuf++; |
592 | a1 = *inbuf++; |
593 | b1 = *inbuf++; |
594 | *tBufLo++ = a0; |
595 | *tBufLo++ = a1; |
596 | *tBufHi-- = b0; |
597 | *tBufHi-- = b1; |
598 | } |
599 | if (qmfsBands & 0x01) { |
600 | a0 = *inbuf++; |
601 | b0 = *inbuf++; |
602 | *tBufLo++ = a0; |
603 | *tBufHi-- = b0; |
604 | *tBufLo++ = 0; |
605 | *tBufHi-- = 0; |
606 | n++; |
607 | } |
608 | for (; n < 32; n++) { |
609 | *tBufLo++ = 0; |
610 | *tBufHi-- = 0; |
611 | *tBufLo++ = 0; |
612 | *tBufHi-- = 0; |
613 | } |
614 | |
615 | tBufLo = delay + dIdx * 128 + 0; |
616 | tBufHi = delay + dIdx * 128 + 64; |
617 | |
618 | /* 2 GB in, 3 GB out */ |
619 | PreMultiply64(tBufLo); |
620 | PreMultiply64(tBufHi); |
621 | |
622 | /* 3 GB in, 1 GB out */ |
623 | FFT32C(tBufLo); |
624 | FFT32C(tBufHi); |
625 | |
626 | /* 1 GB in, 2 GB out */ |
627 | PostMultiply64(tBufLo, 64); |
628 | PostMultiply64(tBufHi, 64); |
629 | |
630 | /* could fuse with PostMultiply64 to avoid separate pass */ |
631 | dOff0 = dIdx * 128; |
632 | dOff1 = dIdx * 128 + 64; |
633 | for (n = 32; n != 0; n--) { |
634 | a0 = (*tBufLo++); |
635 | a1 = (*tBufLo++); |
636 | b0 = (*tBufHi++); |
637 | b1 = -(*tBufHi++); |
638 | |
639 | delay[dOff0++] = (b0 - a0); |
640 | delay[dOff0++] = (b1 - a1); |
641 | delay[dOff1++] = (b0 + a0); |
642 | delay[dOff1++] = (b1 + a1); |
643 | } |
644 | |
645 | QMFSynthesisConv((int *)cTabS, delay, dIdx, outbuf, nChans); |
646 | |
647 | *delayIdx = (*delayIdx == NUM_QMF_DELAY_BUFS - 1 ? 0 : *delayIdx + 1); |
648 | } |
649 |