summaryrefslogtreecommitdiff
path: root/audio_codec/libraac/sbrfft.c (plain)
blob: 2f74fc5c35b75c00c8ec6bf5b299c4012324c1cf
1/* ***** BEGIN LICENSE BLOCK *****
2 * Source last modified: $Id: sbrfft.c,v 1.1 2005/02/26 01:47:35 jrecker Exp $
3 *
4 * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
5 *
6 * The contents of this file, and the files included with this file,
7 * are subject to the current version of the RealNetworks Public
8 * Source License (the "RPSL") available at
9 * http://www.helixcommunity.org/content/rpsl unless you have licensed
10 * the file under the current version of the RealNetworks Community
11 * Source License (the "RCSL") available at
12 * http://www.helixcommunity.org/content/rcsl, in which case the RCSL
13 * will apply. You may also obtain the license terms directly from
14 * RealNetworks. You may not use this file except in compliance with
15 * the RPSL or, if you have a valid RCSL with RealNetworks applicable
16 * to this file, the RCSL. Please see the applicable RPSL or RCSL for
17 * the rights, obligations and limitations governing use of the
18 * contents of the file.
19 *
20 * This file is part of the Helix DNA Technology. RealNetworks is the
21 * developer of the Original Code and owns the copyrights in the
22 * portions it created.
23 *
24 * This file, and the files included with this file, is distributed
25 * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
26 * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
27 * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
28 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
29 * ENJOYMENT OR NON-INFRINGEMENT.
30 *
31 * Technology Compatibility Kit Test Suite(s) Location:
32 * http://www.helixcommunity.org/content/tck
33 *
34 * Contributor(s):
35 *
36 * ***** END LICENSE BLOCK ***** */
37
38/**************************************************************************************
39 * Fixed-point HE-AAC decoder
40 * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
41 * February 2005
42 *
43 * sbrfft.c - optimized FFT for SBR QMF filters
44 **************************************************************************************/
45
46#include "sbr.h"
47#include "assembly.h"
48
49#define SQRT1_2 0x5a82799a
50
51/* swap RE{p0} with RE{p1} and IM{P0} with IM{P1} */
52#define swapcplx(p0,p1) \
53 t = p0; t1 = *(&(p0)+1); p0 = p1; *(&(p0)+1) = *(&(p1)+1); p1 = t; *(&(p1)+1) = t1
54
55/* nfft = 32, hard coded since small, fixed size FFT
56static const unsigned char bitrevtab32[9] = {
57 0x01, 0x04, 0x03, 0x06, 0x00, 0x02, 0x05, 0x07, 0x00,
58};
59*/
60
61/* twiddle table for radix 4 pass, format = Q31 */
62static const int twidTabOdd32[8 * 6] = {
63 0x40000000, 0x00000000, 0x40000000, 0x00000000, 0x40000000, 0x00000000, 0x539eba45, 0xe7821d59,
64 0x4b418bbe, 0xf383a3e2, 0x58c542c5, 0xdc71898d, 0x5a82799a, 0xd2bec333, 0x539eba45, 0xe7821d59,
65 0x539eba45, 0xc4df2862, 0x539eba45, 0xc4df2862, 0x58c542c5, 0xdc71898d, 0x3248d382, 0xc13ad060,
66 0x40000000, 0xc0000000, 0x5a82799a, 0xd2bec333, 0x00000000, 0xd2bec333, 0x22a2f4f8, 0xc4df2862,
67 0x58c542c5, 0xcac933ae, 0xcdb72c7e, 0xf383a3e2, 0x00000000, 0xd2bec333, 0x539eba45, 0xc4df2862,
68 0xac6145bb, 0x187de2a7, 0xdd5d0b08, 0xe7821d59, 0x4b418bbe, 0xc13ad060, 0xa73abd3b, 0x3536cc52,
69};
70
71/**************************************************************************************
72 * Function: BitReverse32
73 *
74 * Description: Ken's fast in-place bit reverse
75 *
76 * Inputs: buffer of 32 complex samples
77 *
78 * Outputs: bit-reversed samples in same buffer
79 *
80 * Return: none
81**************************************************************************************/
82static void BitReverse32(int *inout)
83{
84 int t, t1;
85
86 swapcplx(inout[2], inout[32]);
87 swapcplx(inout[4], inout[16]);
88 swapcplx(inout[6], inout[48]);
89 swapcplx(inout[10], inout[40]);
90 swapcplx(inout[12], inout[24]);
91 swapcplx(inout[14], inout[56]);
92 swapcplx(inout[18], inout[36]);
93 swapcplx(inout[22], inout[52]);
94 swapcplx(inout[26], inout[44]);
95 swapcplx(inout[30], inout[60]);
96 swapcplx(inout[38], inout[50]);
97 swapcplx(inout[46], inout[58]);
98}
99
100/**************************************************************************************
101 * Function: R8FirstPass32
102 *
103 * Description: radix-8 trivial pass for decimation-in-time FFT (log2(N) = 5)
104 *
105 * Inputs: buffer of (bit-reversed) samples
106 *
107 * Outputs: processed samples in same buffer
108 *
109 * Return: none
110 *
111 * Notes: assumes 3 guard bits, gains 1 integer bit
112 * guard bits out = guard bits in - 3 (if inputs are full scale)
113 * or guard bits in - 2 (if inputs bounded to +/- sqrt(2)/2)
114 * see scaling comments in fft.c for base AAC
115 * should compile with no stack spills on ARM (verify compiled output)
116 * current instruction count (per pass): 16 LDR, 16 STR, 4 SMULL, 61 ALU
117 **************************************************************************************/
118static void R8FirstPass32(int *r0)
119{
120 int r1, r2, r3, r4, r5, r6, r7;
121 int r8, r9, r10, r11, r12, r14;
122
123 /* number of passes = fft size / 8 = 32 / 8 = 4 */
124 r1 = (32 >> 3);
125 do {
126
127 r2 = r0[8];
128 r3 = r0[9];
129 r4 = r0[10];
130 r5 = r0[11];
131 r6 = r0[12];
132 r7 = r0[13];
133 r8 = r0[14];
134 r9 = r0[15];
135
136 r10 = r2 + r4;
137 r11 = r3 + r5;
138 r12 = r6 + r8;
139 r14 = r7 + r9;
140
141 r2 -= r4;
142 r3 -= r5;
143 r6 -= r8;
144 r7 -= r9;
145
146 r4 = r2 - r7;
147 r5 = r2 + r7;
148 r8 = r3 - r6;
149 r9 = r3 + r6;
150
151 r2 = r4 - r9;
152 r3 = r4 + r9;
153 r6 = r5 - r8;
154 r7 = r5 + r8;
155
156 r2 = MULSHIFT32(SQRT1_2, r2); /* can use r4, r5, r8, or r9 for constant and lo32 scratch reg */
157 r3 = MULSHIFT32(SQRT1_2, r3);
158 r6 = MULSHIFT32(SQRT1_2, r6);
159 r7 = MULSHIFT32(SQRT1_2, r7);
160
161 r4 = r10 + r12;
162 r5 = r10 - r12;
163 r8 = r11 + r14;
164 r9 = r11 - r14;
165
166 r10 = r0[0];
167 r11 = r0[2];
168 r12 = r0[4];
169 r14 = r0[6];
170
171 r10 += r11;
172 r12 += r14;
173
174 r4 >>= 1;
175 r10 += r12;
176 r4 += (r10 >> 1);
177 r0[ 0] = r4;
178 r4 -= (r10 >> 1);
179 r4 = (r10 >> 1) - r4;
180 r0[ 8] = r4;
181
182 r9 >>= 1;
183 r10 -= 2 * r12;
184 r4 = (r10 >> 1) + r9;
185 r0[ 4] = r4;
186 r4 = (r10 >> 1) - r9;
187 r0[12] = r4;
188 r10 += r12;
189
190 r10 -= 2 * r11;
191 r12 -= 2 * r14;
192
193 r4 = r0[1];
194 r9 = r0[3];
195 r11 = r0[5];
196 r14 = r0[7];
197
198 r4 += r9;
199 r11 += r14;
200
201 r8 >>= 1;
202 r4 += r11;
203 r8 += (r4 >> 1);
204 r0[ 1] = r8;
205 r8 -= (r4 >> 1);
206 r8 = (r4 >> 1) - r8;
207 r0[ 9] = r8;
208
209 r5 >>= 1;
210 r4 -= 2 * r11;
211 r8 = (r4 >> 1) - r5;
212 r0[ 5] = r8;
213 r8 = (r4 >> 1) + r5;
214 r0[13] = r8;
215 r4 += r11;
216
217 r4 -= 2 * r9;
218 r11 -= 2 * r14;
219
220 r9 = r10 - r11;
221 r10 += r11;
222 r14 = r4 + r12;
223 r4 -= r12;
224
225 r5 = (r10 >> 1) + r7;
226 r8 = (r4 >> 1) - r6;
227 r0[ 2] = r5;
228 r0[ 3] = r8;
229
230 r5 = (r9 >> 1) - r2;
231 r8 = (r14 >> 1) - r3;
232 r0[ 6] = r5;
233 r0[ 7] = r8;
234
235 r5 = (r10 >> 1) - r7;
236 r8 = (r4 >> 1) + r6;
237 r0[10] = r5;
238 r0[11] = r8;
239
240 r5 = (r9 >> 1) + r2;
241 r8 = (r14 >> 1) + r3;
242 r0[14] = r5;
243 r0[15] = r8;
244
245 r0 += 16;
246 r1--;
247 } while (r1 != 0);
248}
249
250/**************************************************************************************
251 * Function: R4Core32
252 *
253 * Description: radix-4 pass for 32-point decimation-in-time FFT
254 *
255 * Inputs: buffer of samples
256 *
257 * Outputs: processed samples in same buffer
258 *
259 * Return: none
260 *
261 * Notes: gain 2 integer bits
262 * guard bits out = guard bits in - 1 (if inputs are full scale)
263 * see scaling comments in fft.c for base AAC
264 * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
265 * should compile with no stack spills on ARM (verify compiled output)
266 * current instruction count (per pass): 16 LDR, 16 STR, 4 SMULL, 61 ALU
267 **************************************************************************************/
268static void R4Core32(int *r0)
269{
270 int r2, r3, r4, r5, r6, r7;
271 int r8, r9, r10, r12, r14;
272 int *r1;
273
274 r1 = (int *)twidTabOdd32;
275 r10 = 8;
276 do {
277 /* can use r14 for lo32 scratch register in all MULSHIFT32 */
278 r2 = r1[0];
279 r3 = r1[1];
280 r4 = r0[16];
281 r5 = r0[17];
282 r12 = r4 + r5;
283 r12 = MULSHIFT32(r3, r12);
284 r5 = MULSHIFT32(r2, r5) + r12;
285 r2 += 2 * r3;
286 r4 = MULSHIFT32(r2, r4) - r12;
287
288 r2 = r1[2];
289 r3 = r1[3];
290 r6 = r0[32];
291 r7 = r0[33];
292 r12 = r6 + r7;
293 r12 = MULSHIFT32(r3, r12);
294 r7 = MULSHIFT32(r2, r7) + r12;
295 r2 += 2 * r3;
296 r6 = MULSHIFT32(r2, r6) - r12;
297
298 r2 = r1[4];
299 r3 = r1[5];
300 r8 = r0[48];
301 r9 = r0[49];
302 r12 = r8 + r9;
303 r12 = MULSHIFT32(r3, r12);
304 r9 = MULSHIFT32(r2, r9) + r12;
305 r2 += 2 * r3;
306 r8 = MULSHIFT32(r2, r8) - r12;
307
308 r2 = r0[0];
309 r3 = r0[1];
310
311 r12 = r6 + r8;
312 r8 = r6 - r8;
313 r14 = r9 - r7;
314 r9 = r9 + r7;
315
316 r6 = (r2 >> 2) - r4;
317 r7 = (r3 >> 2) - r5;
318 r4 += (r2 >> 2);
319 r5 += (r3 >> 2);
320
321 r2 = r4 + r12;
322 r3 = r5 + r9;
323 r0[0] = r2;
324 r0[1] = r3;
325 r2 = r6 - r14;
326 r3 = r7 - r8;
327 r0[16] = r2;
328 r0[17] = r3;
329 r2 = r4 - r12;
330 r3 = r5 - r9;
331 r0[32] = r2;
332 r0[33] = r3;
333 r2 = r6 + r14;
334 r3 = r7 + r8;
335 r0[48] = r2;
336 r0[49] = r3;
337
338 r0 += 2;
339 r1 += 6;
340 r10--;
341 } while (r10 != 0);
342}
343
344/**************************************************************************************
345 * Function: FFT32C
346 *
347 * Description: Ken's very fast in-place radix-4 decimation-in-time FFT
348 *
349 * Inputs: buffer of 32 complex samples (before bit-reversal)
350 *
351 * Outputs: processed samples in same buffer
352 *
353 * Return: none
354 *
355 * Notes: assumes 3 guard bits in, gains 3 integer bits
356 * guard bits out = guard bits in - 2
357 * (guard bit analysis includes assumptions about steps immediately
358 * before and after, i.e. PreMul and PostMul for DCT)
359 **************************************************************************************/
360void FFT32C(int *x)
361{
362 /* decimation in time */
363 BitReverse32(x);
364
365 /* 32-point complex FFT */
366 R8FirstPass32(x); /* gain 1 int bit, lose 2 GB (making assumptions about input) */
367 R4Core32(x); /* gain 2 int bits, lose 0 GB (making assumptions about input) */
368}
369