blob: e17a2bd929cef02bede49fa1d613a995cf95467d
1 | /* ***** BEGIN LICENSE BLOCK ***** |
2 | * Source last modified: $Id: fft.c,v 1.1 2005/02/26 01:47:34 jrecker Exp $ |
3 | * |
4 | * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved. |
5 | * |
6 | * The contents of this file, and the files included with this file, |
7 | * are subject to the current version of the RealNetworks Public |
8 | * Source License (the "RPSL") available at |
9 | * http://www.helixcommunity.org/content/rpsl unless you have licensed |
10 | * the file under the current version of the RealNetworks Community |
11 | * Source License (the "RCSL") available at |
12 | * http://www.helixcommunity.org/content/rcsl, in which case the RCSL |
13 | * will apply. You may also obtain the license terms directly from |
14 | * RealNetworks. You may not use this file except in compliance with |
15 | * the RPSL or, if you have a valid RCSL with RealNetworks applicable |
16 | * to this file, the RCSL. Please see the applicable RPSL or RCSL for |
17 | * the rights, obligations and limitations governing use of the |
18 | * contents of the file. |
19 | * |
20 | * This file is part of the Helix DNA Technology. RealNetworks is the |
21 | * developer of the Original Code and owns the copyrights in the |
22 | * portions it created. |
23 | * |
24 | * This file, and the files included with this file, is distributed |
25 | * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY |
26 | * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS |
27 | * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES |
28 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET |
29 | * ENJOYMENT OR NON-INFRINGEMENT. |
30 | * |
31 | * Technology Compatibility Kit Test Suite(s) Location: |
32 | * http://www.helixcommunity.org/content/tck |
33 | * |
34 | * Contributor(s): |
35 | * |
36 | * ***** END LICENSE BLOCK ***** */ |
37 | |
38 | /************************************************************************************** |
39 | * Fixed-point HE-AAC decoder |
40 | * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com) |
41 | * February 2005 |
42 | * |
43 | * fft.c - Ken's optimized radix-4 DIT FFT, optional radix-8 first pass for odd log2(N) |
44 | **************************************************************************************/ |
45 | |
46 | #include "coder.h" |
47 | |
48 | #include "assembly.h" |
49 | |
50 | |
51 | #define NUM_FFT_SIZES 2 |
52 | static const int nfftTab[NUM_FFT_SIZES] = {64, 512}; |
53 | static const int nfftlog2Tab[NUM_FFT_SIZES] = {6, 9}; |
54 | |
55 | #define SQRT1_2 0x5a82799a /* sqrt(1/2) in Q31 */ |
56 | |
57 | #define swapcplx(p0,p1) \ |
58 | t = p0; t1 = *(&(p0)+1); p0 = p1; *(&(p0)+1) = *(&(p1)+1); p1 = t; *(&(p1)+1) = t1 |
59 | |
60 | /************************************************************************************** |
61 | * Function: BitReverse |
62 | * |
63 | * Description: Ken's fast in-place bit reverse, using super-small table |
64 | * |
65 | * Inputs: buffer of samples |
66 | * table index (for transform size) |
67 | * |
68 | * Outputs: bit-reversed samples in same buffer |
69 | * |
70 | * Return: none |
71 | **************************************************************************************/ |
72 | static void BitReverse(int *inout, int tabidx) |
73 | { |
74 | int *part0, *part1; |
75 | int a, b, t, t1; |
76 | const unsigned char* tab = bitrevtab + bitrevtabOffset[tabidx]; |
77 | int nbits = nfftlog2Tab[tabidx]; |
78 | |
79 | part0 = inout; |
80 | part1 = inout + (1 << nbits); |
81 | |
82 | while ((a = *tab++) != 0) { |
83 | b = *tab++; |
84 | |
85 | swapcplx(part0[4 * a + 0], part0[4 * b + 0]); /* 0xxx0 <-> 0yyy0 */ |
86 | swapcplx(part0[4 * a + 2], part1[4 * b + 0]); /* 0xxx1 <-> 1yyy0 */ |
87 | swapcplx(part1[4 * a + 0], part0[4 * b + 2]); /* 1xxx0 <-> 0yyy1 */ |
88 | swapcplx(part1[4 * a + 2], part1[4 * b + 2]); /* 1xxx1 <-> 1yyy1 */ |
89 | } |
90 | |
91 | do { |
92 | swapcplx(part0[4 * a + 2], part1[4 * a + 0]); /* 0xxx1 <-> 1xxx0 */ |
93 | } while ((a = *tab++) != 0); |
94 | } |
95 | |
96 | /************************************************************************************** |
97 | * Function: R4FirstPass |
98 | * |
99 | * Description: radix-4 trivial pass for decimation-in-time FFT |
100 | * |
101 | * Inputs: buffer of (bit-reversed) samples |
102 | * number of R4 butterflies per group (i.e. nfft / 4) |
103 | * |
104 | * Outputs: processed samples in same buffer |
105 | * |
106 | * Return: none |
107 | * |
108 | * Notes: assumes 2 guard bits, gains no integer bits, |
109 | * guard bits out = guard bits in - 2 |
110 | **************************************************************************************/ |
111 | static void R4FirstPass(int *x, int bg) |
112 | { |
113 | int ar, ai, br, bi, cr, ci, dr, di; |
114 | |
115 | for (; bg != 0; bg--) { |
116 | |
117 | ar = x[0] + x[2]; |
118 | br = x[0] - x[2]; |
119 | ai = x[1] + x[3]; |
120 | bi = x[1] - x[3]; |
121 | cr = x[4] + x[6]; |
122 | dr = x[4] - x[6]; |
123 | ci = x[5] + x[7]; |
124 | di = x[5] - x[7]; |
125 | |
126 | /* max per-sample gain = 4.0 (adding 4 inputs together) */ |
127 | x[0] = ar + cr; |
128 | x[4] = ar - cr; |
129 | x[1] = ai + ci; |
130 | x[5] = ai - ci; |
131 | x[2] = br + di; |
132 | x[6] = br - di; |
133 | x[3] = bi - dr; |
134 | x[7] = bi + dr; |
135 | |
136 | x += 8; |
137 | } |
138 | } |
139 | |
140 | /************************************************************************************** |
141 | * Function: R8FirstPass |
142 | * |
143 | * Description: radix-8 trivial pass for decimation-in-time FFT |
144 | * |
145 | * Inputs: buffer of (bit-reversed) samples |
146 | * number of R8 butterflies per group (i.e. nfft / 8) |
147 | * |
148 | * Outputs: processed samples in same buffer |
149 | * |
150 | * Return: none |
151 | * |
152 | * Notes: assumes 3 guard bits, gains 1 integer bit |
153 | * guard bits out = guard bits in - 3 (if inputs are full scale) |
154 | * or guard bits in - 2 (if inputs bounded to +/- sqrt(2)/2) |
155 | * see scaling comments in code |
156 | **************************************************************************************/ |
157 | static void R8FirstPass(int *x, int bg) |
158 | { |
159 | int ar, ai, br, bi, cr, ci, dr, di; |
160 | int sr, si, tr, ti, ur, ui, vr, vi; |
161 | int wr, wi, xr, xi, yr, yi, zr, zi; |
162 | |
163 | for (; bg != 0; bg--) { |
164 | |
165 | ar = x[0] + x[2]; |
166 | br = x[0] - x[2]; |
167 | ai = x[1] + x[3]; |
168 | bi = x[1] - x[3]; |
169 | cr = x[4] + x[6]; |
170 | dr = x[4] - x[6]; |
171 | ci = x[5] + x[7]; |
172 | di = x[5] - x[7]; |
173 | |
174 | sr = ar + cr; |
175 | ur = ar - cr; |
176 | si = ai + ci; |
177 | ui = ai - ci; |
178 | tr = br - di; |
179 | vr = br + di; |
180 | ti = bi + dr; |
181 | vi = bi - dr; |
182 | |
183 | ar = x[ 8] + x[10]; |
184 | br = x[ 8] - x[10]; |
185 | ai = x[ 9] + x[11]; |
186 | bi = x[ 9] - x[11]; |
187 | cr = x[12] + x[14]; |
188 | dr = x[12] - x[14]; |
189 | ci = x[13] + x[15]; |
190 | di = x[13] - x[15]; |
191 | |
192 | /* max gain of wr/wi/yr/yi vs input = 2 |
193 | * (sum of 4 samples >> 1) |
194 | */ |
195 | wr = (ar + cr) >> 1; |
196 | yr = (ar - cr) >> 1; |
197 | wi = (ai + ci) >> 1; |
198 | yi = (ai - ci) >> 1; |
199 | |
200 | /* max gain of output vs input = 4 |
201 | * (sum of 4 samples >> 1 + sum of 4 samples >> 1) |
202 | */ |
203 | x[ 0] = (sr >> 1) + wr; |
204 | x[ 8] = (sr >> 1) - wr; |
205 | x[ 1] = (si >> 1) + wi; |
206 | x[ 9] = (si >> 1) - wi; |
207 | x[ 4] = (ur >> 1) + yi; |
208 | x[12] = (ur >> 1) - yi; |
209 | x[ 5] = (ui >> 1) - yr; |
210 | x[13] = (ui >> 1) + yr; |
211 | |
212 | ar = br - di; |
213 | cr = br + di; |
214 | ai = bi + dr; |
215 | ci = bi - dr; |
216 | |
217 | /* max gain of xr/xi/zr/zi vs input = 4*sqrt(2)/2 = 2*sqrt(2) |
218 | * (sum of 8 samples, multiply by sqrt(2)/2, implicit >> 1 from Q31) |
219 | */ |
220 | xr = MULSHIFT32(SQRT1_2, ar - ai); |
221 | xi = MULSHIFT32(SQRT1_2, ar + ai); |
222 | zr = MULSHIFT32(SQRT1_2, cr - ci); |
223 | zi = MULSHIFT32(SQRT1_2, cr + ci); |
224 | |
225 | /* max gain of output vs input = (2 + 2*sqrt(2) ~= 4.83) |
226 | * (sum of 4 samples >> 1, plus xr/xi/zr/zi with gain of 2*sqrt(2)) |
227 | * in absolute terms, we have max gain of appx 9.656 (4 + 0.707*8) |
228 | * but we also gain 1 int bit (from MULSHIFT32 or from explicit >> 1) |
229 | */ |
230 | x[ 6] = (tr >> 1) - xr; |
231 | x[14] = (tr >> 1) + xr; |
232 | x[ 7] = (ti >> 1) - xi; |
233 | x[15] = (ti >> 1) + xi; |
234 | x[ 2] = (vr >> 1) + zi; |
235 | x[10] = (vr >> 1) - zi; |
236 | x[ 3] = (vi >> 1) - zr; |
237 | x[11] = (vi >> 1) + zr; |
238 | |
239 | x += 16; |
240 | } |
241 | } |
242 | |
243 | /************************************************************************************** |
244 | * Function: R4Core |
245 | * |
246 | * Description: radix-4 pass for decimation-in-time FFT |
247 | * |
248 | * Inputs: buffer of samples |
249 | * number of R4 butterflies per group |
250 | * number of R4 groups per pass |
251 | * pointer to twiddle factors tables |
252 | * |
253 | * Outputs: processed samples in same buffer |
254 | * |
255 | * Return: none |
256 | * |
257 | * Notes: gain 2 integer bits per pass (see scaling comments in code) |
258 | * min 1 GB in |
259 | * gbOut = gbIn - 1 (short block) or gbIn - 2 (long block) |
260 | * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add |
261 | **************************************************************************************/ |
262 | static void R4Core(int *x, int bg, int gp, int *wtab) |
263 | { |
264 | int ar, ai, br, bi, cr, ci, dr, di, tr, ti; |
265 | int wd, ws, wi; |
266 | int i, j, step; |
267 | int *xptr, *wptr; |
268 | |
269 | for (; bg != 0; gp <<= 2, bg >>= 2) { |
270 | |
271 | step = 2 * gp; |
272 | xptr = x; |
273 | |
274 | /* max per-sample gain, per group < 1 + 3*sqrt(2) ~= 5.25 if inputs x are full-scale |
275 | * do 3 groups for long block, 2 groups for short block (gain 2 int bits per group) |
276 | * |
277 | * very conservative scaling: |
278 | * group 1: max gain = 5.25, int bits gained = 2, gb used = 1 (2^3 = 8) |
279 | * group 2: max gain = 5.25^2 = 27.6, int bits gained = 4, gb used = 1 (2^5 = 32) |
280 | * group 3: max gain = 5.25^3 = 144.7, int bits gained = 6, gb used = 2 (2^8 = 256) |
281 | */ |
282 | for (i = bg; i != 0; i--) { |
283 | |
284 | wptr = wtab; |
285 | |
286 | for (j = gp; j != 0; j--) { |
287 | |
288 | ar = xptr[0]; |
289 | ai = xptr[1]; |
290 | xptr += step; |
291 | |
292 | /* gain 2 int bits for br/bi, cr/ci, dr/di (MULSHIFT32 by Q30) |
293 | * gain 1 net GB |
294 | */ |
295 | ws = wptr[0]; |
296 | wi = wptr[1]; |
297 | br = xptr[0]; |
298 | bi = xptr[1]; |
299 | wd = ws + 2 * wi; |
300 | tr = MULSHIFT32(wi, br + bi); |
301 | br = MULSHIFT32(wd, br) - tr; /* cos*br + sin*bi */ |
302 | bi = MULSHIFT32(ws, bi) + tr; /* cos*bi - sin*br */ |
303 | xptr += step; |
304 | |
305 | ws = wptr[2]; |
306 | wi = wptr[3]; |
307 | cr = xptr[0]; |
308 | ci = xptr[1]; |
309 | wd = ws + 2 * wi; |
310 | tr = MULSHIFT32(wi, cr + ci); |
311 | cr = MULSHIFT32(wd, cr) - tr; |
312 | ci = MULSHIFT32(ws, ci) + tr; |
313 | xptr += step; |
314 | |
315 | ws = wptr[4]; |
316 | wi = wptr[5]; |
317 | dr = xptr[0]; |
318 | di = xptr[1]; |
319 | wd = ws + 2 * wi; |
320 | tr = MULSHIFT32(wi, dr + di); |
321 | dr = MULSHIFT32(wd, dr) - tr; |
322 | di = MULSHIFT32(ws, di) + tr; |
323 | wptr += 6; |
324 | |
325 | tr = ar; |
326 | ti = ai; |
327 | ar = (tr >> 2) - br; |
328 | ai = (ti >> 2) - bi; |
329 | br = (tr >> 2) + br; |
330 | bi = (ti >> 2) + bi; |
331 | |
332 | tr = cr; |
333 | ti = ci; |
334 | cr = tr + dr; |
335 | ci = di - ti; |
336 | dr = tr - dr; |
337 | di = di + ti; |
338 | |
339 | xptr[0] = ar + ci; |
340 | xptr[1] = ai + dr; |
341 | xptr -= step; |
342 | xptr[0] = br - cr; |
343 | xptr[1] = bi - di; |
344 | xptr -= step; |
345 | xptr[0] = ar - ci; |
346 | xptr[1] = ai - dr; |
347 | xptr -= step; |
348 | xptr[0] = br + cr; |
349 | xptr[1] = bi + di; |
350 | xptr += 2; |
351 | } |
352 | xptr += 3 * step; |
353 | } |
354 | wtab += 3 * step; |
355 | } |
356 | } |
357 | |
358 | |
359 | /************************************************************************************** |
360 | * Function: R4FFT |
361 | * |
362 | * Description: Ken's very fast in-place radix-4 decimation-in-time FFT |
363 | * |
364 | * Inputs: table index (for transform size) |
365 | * buffer of samples (non bit-reversed) |
366 | * |
367 | * Outputs: processed samples in same buffer |
368 | * |
369 | * Return: none |
370 | * |
371 | * Notes: assumes 5 guard bits in for nfft <= 512 |
372 | * gbOut = gbIn - 4 (assuming input is from PreMultiply) |
373 | * gains log2(nfft) - 2 int bits total |
374 | * so gain 7 int bits (LONG), 4 int bits (SHORT) |
375 | **************************************************************************************/ |
376 | void R4FFT(int tabidx, int *x) |
377 | { |
378 | int order = nfftlog2Tab[tabidx]; |
379 | int nfft = nfftTab[tabidx]; |
380 | |
381 | /* decimation in time */ |
382 | BitReverse(x, tabidx); |
383 | |
384 | if (order & 0x1) { |
385 | /* long block: order = 9, nfft = 512 */ |
386 | R8FirstPass(x, nfft >> 3); /* gain 1 int bit, lose 2 GB */ |
387 | R4Core(x, nfft >> 5, 8, (int *)twidTabOdd); /* gain 6 int bits, lose 2 GB */ |
388 | } else { |
389 | /* short block: order = 6, nfft = 64 */ |
390 | R4FirstPass(x, nfft >> 2); /* gain 0 int bits, lose 2 GB */ |
391 | R4Core(x, nfft >> 4, 4, (int *)twidTabEven); /* gain 4 int bits, lose 1 GB */ |
392 | } |
393 | } |
394 |