blob: 2f74fc5c35b75c00c8ec6bf5b299c4012324c1cf
1 | /* ***** BEGIN LICENSE BLOCK ***** |
2 | * Source last modified: $Id: sbrfft.c,v 1.1 2005/02/26 01:47:35 jrecker Exp $ |
3 | * |
4 | * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved. |
5 | * |
6 | * The contents of this file, and the files included with this file, |
7 | * are subject to the current version of the RealNetworks Public |
8 | * Source License (the "RPSL") available at |
9 | * http://www.helixcommunity.org/content/rpsl unless you have licensed |
10 | * the file under the current version of the RealNetworks Community |
11 | * Source License (the "RCSL") available at |
12 | * http://www.helixcommunity.org/content/rcsl, in which case the RCSL |
13 | * will apply. You may also obtain the license terms directly from |
14 | * RealNetworks. You may not use this file except in compliance with |
15 | * the RPSL or, if you have a valid RCSL with RealNetworks applicable |
16 | * to this file, the RCSL. Please see the applicable RPSL or RCSL for |
17 | * the rights, obligations and limitations governing use of the |
18 | * contents of the file. |
19 | * |
20 | * This file is part of the Helix DNA Technology. RealNetworks is the |
21 | * developer of the Original Code and owns the copyrights in the |
22 | * portions it created. |
23 | * |
24 | * This file, and the files included with this file, is distributed |
25 | * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY |
26 | * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS |
27 | * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES |
28 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET |
29 | * ENJOYMENT OR NON-INFRINGEMENT. |
30 | * |
31 | * Technology Compatibility Kit Test Suite(s) Location: |
32 | * http://www.helixcommunity.org/content/tck |
33 | * |
34 | * Contributor(s): |
35 | * |
36 | * ***** END LICENSE BLOCK ***** */ |
37 | |
38 | /************************************************************************************** |
39 | * Fixed-point HE-AAC decoder |
40 | * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com) |
41 | * February 2005 |
42 | * |
43 | * sbrfft.c - optimized FFT for SBR QMF filters |
44 | **************************************************************************************/ |
45 | |
46 | #include "sbr.h" |
47 | #include "assembly.h" |
48 | |
49 | #define SQRT1_2 0x5a82799a |
50 | |
51 | /* swap RE{p0} with RE{p1} and IM{P0} with IM{P1} */ |
52 | #define swapcplx(p0,p1) \ |
53 | t = p0; t1 = *(&(p0)+1); p0 = p1; *(&(p0)+1) = *(&(p1)+1); p1 = t; *(&(p1)+1) = t1 |
54 | |
55 | /* nfft = 32, hard coded since small, fixed size FFT |
56 | static const unsigned char bitrevtab32[9] = { |
57 | 0x01, 0x04, 0x03, 0x06, 0x00, 0x02, 0x05, 0x07, 0x00, |
58 | }; |
59 | */ |
60 | |
61 | /* twiddle table for radix 4 pass, format = Q31 */ |
62 | static const int twidTabOdd32[8 * 6] = { |
63 | 0x40000000, 0x00000000, 0x40000000, 0x00000000, 0x40000000, 0x00000000, 0x539eba45, 0xe7821d59, |
64 | 0x4b418bbe, 0xf383a3e2, 0x58c542c5, 0xdc71898d, 0x5a82799a, 0xd2bec333, 0x539eba45, 0xe7821d59, |
65 | 0x539eba45, 0xc4df2862, 0x539eba45, 0xc4df2862, 0x58c542c5, 0xdc71898d, 0x3248d382, 0xc13ad060, |
66 | 0x40000000, 0xc0000000, 0x5a82799a, 0xd2bec333, 0x00000000, 0xd2bec333, 0x22a2f4f8, 0xc4df2862, |
67 | 0x58c542c5, 0xcac933ae, 0xcdb72c7e, 0xf383a3e2, 0x00000000, 0xd2bec333, 0x539eba45, 0xc4df2862, |
68 | 0xac6145bb, 0x187de2a7, 0xdd5d0b08, 0xe7821d59, 0x4b418bbe, 0xc13ad060, 0xa73abd3b, 0x3536cc52, |
69 | }; |
70 | |
71 | /************************************************************************************** |
72 | * Function: BitReverse32 |
73 | * |
74 | * Description: Ken's fast in-place bit reverse |
75 | * |
76 | * Inputs: buffer of 32 complex samples |
77 | * |
78 | * Outputs: bit-reversed samples in same buffer |
79 | * |
80 | * Return: none |
81 | **************************************************************************************/ |
82 | static void BitReverse32(int *inout) |
83 | { |
84 | int t, t1; |
85 | |
86 | swapcplx(inout[2], inout[32]); |
87 | swapcplx(inout[4], inout[16]); |
88 | swapcplx(inout[6], inout[48]); |
89 | swapcplx(inout[10], inout[40]); |
90 | swapcplx(inout[12], inout[24]); |
91 | swapcplx(inout[14], inout[56]); |
92 | swapcplx(inout[18], inout[36]); |
93 | swapcplx(inout[22], inout[52]); |
94 | swapcplx(inout[26], inout[44]); |
95 | swapcplx(inout[30], inout[60]); |
96 | swapcplx(inout[38], inout[50]); |
97 | swapcplx(inout[46], inout[58]); |
98 | } |
99 | |
100 | /************************************************************************************** |
101 | * Function: R8FirstPass32 |
102 | * |
103 | * Description: radix-8 trivial pass for decimation-in-time FFT (log2(N) = 5) |
104 | * |
105 | * Inputs: buffer of (bit-reversed) samples |
106 | * |
107 | * Outputs: processed samples in same buffer |
108 | * |
109 | * Return: none |
110 | * |
111 | * Notes: assumes 3 guard bits, gains 1 integer bit |
112 | * guard bits out = guard bits in - 3 (if inputs are full scale) |
113 | * or guard bits in - 2 (if inputs bounded to +/- sqrt(2)/2) |
114 | * see scaling comments in fft.c for base AAC |
115 | * should compile with no stack spills on ARM (verify compiled output) |
116 | * current instruction count (per pass): 16 LDR, 16 STR, 4 SMULL, 61 ALU |
117 | **************************************************************************************/ |
118 | static void R8FirstPass32(int *r0) |
119 | { |
120 | int r1, r2, r3, r4, r5, r6, r7; |
121 | int r8, r9, r10, r11, r12, r14; |
122 | |
123 | /* number of passes = fft size / 8 = 32 / 8 = 4 */ |
124 | r1 = (32 >> 3); |
125 | do { |
126 | |
127 | r2 = r0[8]; |
128 | r3 = r0[9]; |
129 | r4 = r0[10]; |
130 | r5 = r0[11]; |
131 | r6 = r0[12]; |
132 | r7 = r0[13]; |
133 | r8 = r0[14]; |
134 | r9 = r0[15]; |
135 | |
136 | r10 = r2 + r4; |
137 | r11 = r3 + r5; |
138 | r12 = r6 + r8; |
139 | r14 = r7 + r9; |
140 | |
141 | r2 -= r4; |
142 | r3 -= r5; |
143 | r6 -= r8; |
144 | r7 -= r9; |
145 | |
146 | r4 = r2 - r7; |
147 | r5 = r2 + r7; |
148 | r8 = r3 - r6; |
149 | r9 = r3 + r6; |
150 | |
151 | r2 = r4 - r9; |
152 | r3 = r4 + r9; |
153 | r6 = r5 - r8; |
154 | r7 = r5 + r8; |
155 | |
156 | r2 = MULSHIFT32(SQRT1_2, r2); /* can use r4, r5, r8, or r9 for constant and lo32 scratch reg */ |
157 | r3 = MULSHIFT32(SQRT1_2, r3); |
158 | r6 = MULSHIFT32(SQRT1_2, r6); |
159 | r7 = MULSHIFT32(SQRT1_2, r7); |
160 | |
161 | r4 = r10 + r12; |
162 | r5 = r10 - r12; |
163 | r8 = r11 + r14; |
164 | r9 = r11 - r14; |
165 | |
166 | r10 = r0[0]; |
167 | r11 = r0[2]; |
168 | r12 = r0[4]; |
169 | r14 = r0[6]; |
170 | |
171 | r10 += r11; |
172 | r12 += r14; |
173 | |
174 | r4 >>= 1; |
175 | r10 += r12; |
176 | r4 += (r10 >> 1); |
177 | r0[ 0] = r4; |
178 | r4 -= (r10 >> 1); |
179 | r4 = (r10 >> 1) - r4; |
180 | r0[ 8] = r4; |
181 | |
182 | r9 >>= 1; |
183 | r10 -= 2 * r12; |
184 | r4 = (r10 >> 1) + r9; |
185 | r0[ 4] = r4; |
186 | r4 = (r10 >> 1) - r9; |
187 | r0[12] = r4; |
188 | r10 += r12; |
189 | |
190 | r10 -= 2 * r11; |
191 | r12 -= 2 * r14; |
192 | |
193 | r4 = r0[1]; |
194 | r9 = r0[3]; |
195 | r11 = r0[5]; |
196 | r14 = r0[7]; |
197 | |
198 | r4 += r9; |
199 | r11 += r14; |
200 | |
201 | r8 >>= 1; |
202 | r4 += r11; |
203 | r8 += (r4 >> 1); |
204 | r0[ 1] = r8; |
205 | r8 -= (r4 >> 1); |
206 | r8 = (r4 >> 1) - r8; |
207 | r0[ 9] = r8; |
208 | |
209 | r5 >>= 1; |
210 | r4 -= 2 * r11; |
211 | r8 = (r4 >> 1) - r5; |
212 | r0[ 5] = r8; |
213 | r8 = (r4 >> 1) + r5; |
214 | r0[13] = r8; |
215 | r4 += r11; |
216 | |
217 | r4 -= 2 * r9; |
218 | r11 -= 2 * r14; |
219 | |
220 | r9 = r10 - r11; |
221 | r10 += r11; |
222 | r14 = r4 + r12; |
223 | r4 -= r12; |
224 | |
225 | r5 = (r10 >> 1) + r7; |
226 | r8 = (r4 >> 1) - r6; |
227 | r0[ 2] = r5; |
228 | r0[ 3] = r8; |
229 | |
230 | r5 = (r9 >> 1) - r2; |
231 | r8 = (r14 >> 1) - r3; |
232 | r0[ 6] = r5; |
233 | r0[ 7] = r8; |
234 | |
235 | r5 = (r10 >> 1) - r7; |
236 | r8 = (r4 >> 1) + r6; |
237 | r0[10] = r5; |
238 | r0[11] = r8; |
239 | |
240 | r5 = (r9 >> 1) + r2; |
241 | r8 = (r14 >> 1) + r3; |
242 | r0[14] = r5; |
243 | r0[15] = r8; |
244 | |
245 | r0 += 16; |
246 | r1--; |
247 | } while (r1 != 0); |
248 | } |
249 | |
250 | /************************************************************************************** |
251 | * Function: R4Core32 |
252 | * |
253 | * Description: radix-4 pass for 32-point decimation-in-time FFT |
254 | * |
255 | * Inputs: buffer of samples |
256 | * |
257 | * Outputs: processed samples in same buffer |
258 | * |
259 | * Return: none |
260 | * |
261 | * Notes: gain 2 integer bits |
262 | * guard bits out = guard bits in - 1 (if inputs are full scale) |
263 | * see scaling comments in fft.c for base AAC |
264 | * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add |
265 | * should compile with no stack spills on ARM (verify compiled output) |
266 | * current instruction count (per pass): 16 LDR, 16 STR, 4 SMULL, 61 ALU |
267 | **************************************************************************************/ |
268 | static void R4Core32(int *r0) |
269 | { |
270 | int r2, r3, r4, r5, r6, r7; |
271 | int r8, r9, r10, r12, r14; |
272 | int *r1; |
273 | |
274 | r1 = (int *)twidTabOdd32; |
275 | r10 = 8; |
276 | do { |
277 | /* can use r14 for lo32 scratch register in all MULSHIFT32 */ |
278 | r2 = r1[0]; |
279 | r3 = r1[1]; |
280 | r4 = r0[16]; |
281 | r5 = r0[17]; |
282 | r12 = r4 + r5; |
283 | r12 = MULSHIFT32(r3, r12); |
284 | r5 = MULSHIFT32(r2, r5) + r12; |
285 | r2 += 2 * r3; |
286 | r4 = MULSHIFT32(r2, r4) - r12; |
287 | |
288 | r2 = r1[2]; |
289 | r3 = r1[3]; |
290 | r6 = r0[32]; |
291 | r7 = r0[33]; |
292 | r12 = r6 + r7; |
293 | r12 = MULSHIFT32(r3, r12); |
294 | r7 = MULSHIFT32(r2, r7) + r12; |
295 | r2 += 2 * r3; |
296 | r6 = MULSHIFT32(r2, r6) - r12; |
297 | |
298 | r2 = r1[4]; |
299 | r3 = r1[5]; |
300 | r8 = r0[48]; |
301 | r9 = r0[49]; |
302 | r12 = r8 + r9; |
303 | r12 = MULSHIFT32(r3, r12); |
304 | r9 = MULSHIFT32(r2, r9) + r12; |
305 | r2 += 2 * r3; |
306 | r8 = MULSHIFT32(r2, r8) - r12; |
307 | |
308 | r2 = r0[0]; |
309 | r3 = r0[1]; |
310 | |
311 | r12 = r6 + r8; |
312 | r8 = r6 - r8; |
313 | r14 = r9 - r7; |
314 | r9 = r9 + r7; |
315 | |
316 | r6 = (r2 >> 2) - r4; |
317 | r7 = (r3 >> 2) - r5; |
318 | r4 += (r2 >> 2); |
319 | r5 += (r3 >> 2); |
320 | |
321 | r2 = r4 + r12; |
322 | r3 = r5 + r9; |
323 | r0[0] = r2; |
324 | r0[1] = r3; |
325 | r2 = r6 - r14; |
326 | r3 = r7 - r8; |
327 | r0[16] = r2; |
328 | r0[17] = r3; |
329 | r2 = r4 - r12; |
330 | r3 = r5 - r9; |
331 | r0[32] = r2; |
332 | r0[33] = r3; |
333 | r2 = r6 + r14; |
334 | r3 = r7 + r8; |
335 | r0[48] = r2; |
336 | r0[49] = r3; |
337 | |
338 | r0 += 2; |
339 | r1 += 6; |
340 | r10--; |
341 | } while (r10 != 0); |
342 | } |
343 | |
344 | /************************************************************************************** |
345 | * Function: FFT32C |
346 | * |
347 | * Description: Ken's very fast in-place radix-4 decimation-in-time FFT |
348 | * |
349 | * Inputs: buffer of 32 complex samples (before bit-reversal) |
350 | * |
351 | * Outputs: processed samples in same buffer |
352 | * |
353 | * Return: none |
354 | * |
355 | * Notes: assumes 3 guard bits in, gains 3 integer bits |
356 | * guard bits out = guard bits in - 2 |
357 | * (guard bit analysis includes assumptions about steps immediately |
358 | * before and after, i.e. PreMul and PostMul for DCT) |
359 | **************************************************************************************/ |
360 | void FFT32C(int *x) |
361 | { |
362 | /* decimation in time */ |
363 | BitReverse32(x); |
364 | |
365 | /* 32-point complex FFT */ |
366 | R8FirstPass32(x); /* gain 1 int bit, lose 2 GB (making assumptions about input) */ |
367 | R4Core32(x); /* gain 2 int bits, lose 0 GB (making assumptions about input) */ |
368 | } |
369 |