blob: db17d27e4aa1b52924194753dcdc8ce8d05b3c89
1 | /* ***** BEGIN LICENSE BLOCK ***** |
2 | * Source last modified: $Id: fft.c,v 1.1.2.1 2005/02/26 02:05:12 jrecker Exp $ |
3 | * |
4 | * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved. |
5 | * |
6 | * The contents of this file, and the files included with this file, |
7 | * are subject to the current version of the RealNetworks Public |
8 | * Source License (the "RPSL") available at |
9 | * http://www.helixcommunity.org/content/rpsl unless you have licensed |
10 | * the file under the current version of the RealNetworks Community |
11 | * Source License (the "RCSL") available at |
12 | * http://www.helixcommunity.org/content/rcsl, in which case the RCSL |
13 | * will apply. You may also obtain the license terms directly from |
14 | * RealNetworks. You may not use this file except in compliance with |
15 | * the RPSL or, if you have a valid RCSL with RealNetworks applicable |
16 | * to this file, the RCSL. Please see the applicable RPSL or RCSL for |
17 | * the rights, obligations and limitations governing use of the |
18 | * contents of the file. |
19 | * |
20 | * This file is part of the Helix DNA Technology. RealNetworks is the |
21 | * developer of the Original Code and owns the copyrights in the |
22 | * portions it created. |
23 | * |
24 | * This file, and the files included with this file, is distributed |
25 | * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY |
26 | * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS |
27 | * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES |
28 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET |
29 | * ENJOYMENT OR NON-INFRINGEMENT. |
30 | * |
31 | * Technology Compatibility Kit Test Suite(s) Location: |
32 | * http://www.helixcommunity.org/content/tck |
33 | * |
34 | * Contributor(s): |
35 | * |
36 | * ***** END LICENSE BLOCK ***** */ |
37 | |
38 | /************************************************************************************** |
39 | * Fixed-point HE-AAC decoder |
40 | * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com) |
41 | * February 2005 |
42 | * |
43 | * fft.c - Ken's optimized radix-4 DIT FFT, optional radix-8 first pass for odd log2(N) |
44 | **************************************************************************************/ |
45 | |
46 | #include "coder.h" |
47 | #include "assembly.h" |
48 | |
49 | #define NUM_FFT_SIZES 2 |
50 | static const int nfftTab[NUM_FFT_SIZES] = {64, 512}; |
51 | static const int nfftlog2Tab[NUM_FFT_SIZES] = {6, 9}; |
52 | |
53 | #define SQRT1_2 0x5a82799a /* sqrt(1/2) in Q31 */ |
54 | |
55 | #define swapcplx(p0,p1) \ |
56 | t = p0; t1 = *(&(p0)+1); p0 = p1; *(&(p0)+1) = *(&(p1)+1); p1 = t; *(&(p1)+1) = t1 |
57 | |
58 | /************************************************************************************** |
59 | * Function: BitReverse |
60 | * |
61 | * Description: Ken's fast in-place bit reverse, using super-small table |
62 | * |
63 | * Inputs: buffer of samples |
64 | * table index (for transform size) |
65 | * |
66 | * Outputs: bit-reversed samples in same buffer |
67 | * |
68 | * Return: none |
69 | **************************************************************************************/ |
70 | static void BitReverse(int *inout, int tabidx) |
71 | { |
72 | int *part0, *part1; |
73 | int a, b, t, t1; |
74 | const unsigned char* tab = bitrevtab + bitrevtabOffset[tabidx]; |
75 | int nbits = nfftlog2Tab[tabidx]; |
76 | |
77 | part0 = inout; |
78 | part1 = inout + (1 << nbits); |
79 | |
80 | while ((a = *tab++) != 0) { |
81 | b = *tab++; |
82 | |
83 | swapcplx(part0[4 * a + 0], part0[4 * b + 0]); /* 0xxx0 <-> 0yyy0 */ |
84 | swapcplx(part0[4 * a + 2], part1[4 * b + 0]); /* 0xxx1 <-> 1yyy0 */ |
85 | swapcplx(part1[4 * a + 0], part0[4 * b + 2]); /* 1xxx0 <-> 0yyy1 */ |
86 | swapcplx(part1[4 * a + 2], part1[4 * b + 2]); /* 1xxx1 <-> 1yyy1 */ |
87 | } |
88 | |
89 | do { |
90 | swapcplx(part0[4 * a + 2], part1[4 * a + 0]); /* 0xxx1 <-> 1xxx0 */ |
91 | } while ((a = *tab++) != 0); |
92 | } |
93 | |
94 | /************************************************************************************** |
95 | * Function: R4FirstPass |
96 | * |
97 | * Description: radix-4 trivial pass for decimation-in-time FFT |
98 | * |
99 | * Inputs: buffer of (bit-reversed) samples |
100 | * number of R4 butterflies per group (i.e. nfft / 4) |
101 | * |
102 | * Outputs: processed samples in same buffer |
103 | * |
104 | * Return: none |
105 | * |
106 | * Notes: assumes 2 guard bits, gains no integer bits, |
107 | * guard bits out = guard bits in - 2 |
108 | **************************************************************************************/ |
109 | static void R4FirstPass(int *x, int bg) |
110 | { |
111 | int ar, ai, br, bi, cr, ci, dr, di; |
112 | |
113 | for (; bg != 0; bg--) { |
114 | |
115 | ar = x[0] + x[2]; |
116 | br = x[0] - x[2]; |
117 | ai = x[1] + x[3]; |
118 | bi = x[1] - x[3]; |
119 | cr = x[4] + x[6]; |
120 | dr = x[4] - x[6]; |
121 | ci = x[5] + x[7]; |
122 | di = x[5] - x[7]; |
123 | |
124 | /* max per-sample gain = 4.0 (adding 4 inputs together) */ |
125 | x[0] = ar + cr; |
126 | x[4] = ar - cr; |
127 | x[1] = ai + ci; |
128 | x[5] = ai - ci; |
129 | x[2] = br + di; |
130 | x[6] = br - di; |
131 | x[3] = bi - dr; |
132 | x[7] = bi + dr; |
133 | |
134 | x += 8; |
135 | } |
136 | } |
137 | |
138 | /************************************************************************************** |
139 | * Function: R8FirstPass |
140 | * |
141 | * Description: radix-8 trivial pass for decimation-in-time FFT |
142 | * |
143 | * Inputs: buffer of (bit-reversed) samples |
144 | * number of R8 butterflies per group (i.e. nfft / 8) |
145 | * |
146 | * Outputs: processed samples in same buffer |
147 | * |
148 | * Return: none |
149 | * |
150 | * Notes: assumes 3 guard bits, gains 1 integer bit |
151 | * guard bits out = guard bits in - 3 (if inputs are full scale) |
152 | * or guard bits in - 2 (if inputs bounded to +/- sqrt(2)/2) |
153 | * see scaling comments in code |
154 | **************************************************************************************/ |
155 | static void R8FirstPass(int *x, int bg) |
156 | { |
157 | int ar, ai, br, bi, cr, ci, dr, di; |
158 | int sr, si, tr, ti, ur, ui, vr, vi; |
159 | int wr, wi, xr, xi, yr, yi, zr, zi; |
160 | |
161 | for (; bg != 0; bg--) { |
162 | |
163 | ar = x[0] + x[2]; |
164 | br = x[0] - x[2]; |
165 | ai = x[1] + x[3]; |
166 | bi = x[1] - x[3]; |
167 | cr = x[4] + x[6]; |
168 | dr = x[4] - x[6]; |
169 | ci = x[5] + x[7]; |
170 | di = x[5] - x[7]; |
171 | |
172 | sr = ar + cr; |
173 | ur = ar - cr; |
174 | si = ai + ci; |
175 | ui = ai - ci; |
176 | tr = br - di; |
177 | vr = br + di; |
178 | ti = bi + dr; |
179 | vi = bi - dr; |
180 | |
181 | ar = x[ 8] + x[10]; |
182 | br = x[ 8] - x[10]; |
183 | ai = x[ 9] + x[11]; |
184 | bi = x[ 9] - x[11]; |
185 | cr = x[12] + x[14]; |
186 | dr = x[12] - x[14]; |
187 | ci = x[13] + x[15]; |
188 | di = x[13] - x[15]; |
189 | |
190 | /* max gain of wr/wi/yr/yi vs input = 2 |
191 | * (sum of 4 samples >> 1) |
192 | */ |
193 | wr = (ar + cr) >> 1; |
194 | yr = (ar - cr) >> 1; |
195 | wi = (ai + ci) >> 1; |
196 | yi = (ai - ci) >> 1; |
197 | |
198 | /* max gain of output vs input = 4 |
199 | * (sum of 4 samples >> 1 + sum of 4 samples >> 1) |
200 | */ |
201 | x[ 0] = (sr >> 1) + wr; |
202 | x[ 8] = (sr >> 1) - wr; |
203 | x[ 1] = (si >> 1) + wi; |
204 | x[ 9] = (si >> 1) - wi; |
205 | x[ 4] = (ur >> 1) + yi; |
206 | x[12] = (ur >> 1) - yi; |
207 | x[ 5] = (ui >> 1) - yr; |
208 | x[13] = (ui >> 1) + yr; |
209 | |
210 | ar = br - di; |
211 | cr = br + di; |
212 | ai = bi + dr; |
213 | ci = bi - dr; |
214 | |
215 | /* max gain of xr/xi/zr/zi vs input = 4*sqrt(2)/2 = 2*sqrt(2) |
216 | * (sum of 8 samples, multiply by sqrt(2)/2, implicit >> 1 from Q31) |
217 | */ |
218 | xr = MULSHIFT32(SQRT1_2, ar - ai); |
219 | xi = MULSHIFT32(SQRT1_2, ar + ai); |
220 | zr = MULSHIFT32(SQRT1_2, cr - ci); |
221 | zi = MULSHIFT32(SQRT1_2, cr + ci); |
222 | |
223 | /* max gain of output vs input = (2 + 2*sqrt(2) ~= 4.83) |
224 | * (sum of 4 samples >> 1, plus xr/xi/zr/zi with gain of 2*sqrt(2)) |
225 | * in absolute terms, we have max gain of appx 9.656 (4 + 0.707*8) |
226 | * but we also gain 1 int bit (from MULSHIFT32 or from explicit >> 1) |
227 | */ |
228 | x[ 6] = (tr >> 1) - xr; |
229 | x[14] = (tr >> 1) + xr; |
230 | x[ 7] = (ti >> 1) - xi; |
231 | x[15] = (ti >> 1) + xi; |
232 | x[ 2] = (vr >> 1) + zi; |
233 | x[10] = (vr >> 1) - zi; |
234 | x[ 3] = (vi >> 1) - zr; |
235 | x[11] = (vi >> 1) + zr; |
236 | |
237 | x += 16; |
238 | } |
239 | } |
240 | |
241 | /************************************************************************************** |
242 | * Function: R4Core |
243 | * |
244 | * Description: radix-4 pass for decimation-in-time FFT |
245 | * |
246 | * Inputs: buffer of samples |
247 | * number of R4 butterflies per group |
248 | * number of R4 groups per pass |
249 | * pointer to twiddle factors tables |
250 | * |
251 | * Outputs: processed samples in same buffer |
252 | * |
253 | * Return: none |
254 | * |
255 | * Notes: gain 2 integer bits per pass (see scaling comments in code) |
256 | * min 1 GB in |
257 | * gbOut = gbIn - 1 (short block) or gbIn - 2 (long block) |
258 | * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add |
259 | **************************************************************************************/ |
260 | static void R4Core(int *x, int bg, int gp, int *wtab) |
261 | { |
262 | int ar, ai, br, bi, cr, ci, dr, di, tr, ti; |
263 | int wd, ws, wi; |
264 | int i, j, step; |
265 | int *xptr, *wptr; |
266 | |
267 | for (; bg != 0; gp <<= 2, bg >>= 2) { |
268 | |
269 | step = 2 * gp; |
270 | xptr = x; |
271 | |
272 | /* max per-sample gain, per group < 1 + 3*sqrt(2) ~= 5.25 if inputs x are full-scale |
273 | * do 3 groups for long block, 2 groups for short block (gain 2 int bits per group) |
274 | * |
275 | * very conservative scaling: |
276 | * group 1: max gain = 5.25, int bits gained = 2, gb used = 1 (2^3 = 8) |
277 | * group 2: max gain = 5.25^2 = 27.6, int bits gained = 4, gb used = 1 (2^5 = 32) |
278 | * group 3: max gain = 5.25^3 = 144.7, int bits gained = 6, gb used = 2 (2^8 = 256) |
279 | */ |
280 | for (i = bg; i != 0; i--) { |
281 | |
282 | wptr = wtab; |
283 | |
284 | for (j = gp; j != 0; j--) { |
285 | |
286 | ar = xptr[0]; |
287 | ai = xptr[1]; |
288 | xptr += step; |
289 | |
290 | /* gain 2 int bits for br/bi, cr/ci, dr/di (MULSHIFT32 by Q30) |
291 | * gain 1 net GB |
292 | */ |
293 | ws = wptr[0]; |
294 | wi = wptr[1]; |
295 | br = xptr[0]; |
296 | bi = xptr[1]; |
297 | wd = ws + 2 * wi; |
298 | tr = MULSHIFT32(wi, br + bi); |
299 | br = MULSHIFT32(wd, br) - tr; /* cos*br + sin*bi */ |
300 | bi = MULSHIFT32(ws, bi) + tr; /* cos*bi - sin*br */ |
301 | xptr += step; |
302 | |
303 | ws = wptr[2]; |
304 | wi = wptr[3]; |
305 | cr = xptr[0]; |
306 | ci = xptr[1]; |
307 | wd = ws + 2 * wi; |
308 | tr = MULSHIFT32(wi, cr + ci); |
309 | cr = MULSHIFT32(wd, cr) - tr; |
310 | ci = MULSHIFT32(ws, ci) + tr; |
311 | xptr += step; |
312 | |
313 | ws = wptr[4]; |
314 | wi = wptr[5]; |
315 | dr = xptr[0]; |
316 | di = xptr[1]; |
317 | wd = ws + 2 * wi; |
318 | tr = MULSHIFT32(wi, dr + di); |
319 | dr = MULSHIFT32(wd, dr) - tr; |
320 | di = MULSHIFT32(ws, di) + tr; |
321 | wptr += 6; |
322 | |
323 | tr = ar; |
324 | ti = ai; |
325 | ar = (tr >> 2) - br; |
326 | ai = (ti >> 2) - bi; |
327 | br = (tr >> 2) + br; |
328 | bi = (ti >> 2) + bi; |
329 | |
330 | tr = cr; |
331 | ti = ci; |
332 | cr = tr + dr; |
333 | ci = di - ti; |
334 | dr = tr - dr; |
335 | di = di + ti; |
336 | |
337 | xptr[0] = ar + ci; |
338 | xptr[1] = ai + dr; |
339 | xptr -= step; |
340 | xptr[0] = br - cr; |
341 | xptr[1] = bi - di; |
342 | xptr -= step; |
343 | xptr[0] = ar - ci; |
344 | xptr[1] = ai - dr; |
345 | xptr -= step; |
346 | xptr[0] = br + cr; |
347 | xptr[1] = bi + di; |
348 | xptr += 2; |
349 | } |
350 | xptr += 3 * step; |
351 | } |
352 | wtab += 3 * step; |
353 | } |
354 | } |
355 | |
356 | |
357 | /************************************************************************************** |
358 | * Function: R4FFT |
359 | * |
360 | * Description: Ken's very fast in-place radix-4 decimation-in-time FFT |
361 | * |
362 | * Inputs: table index (for transform size) |
363 | * buffer of samples (non bit-reversed) |
364 | * |
365 | * Outputs: processed samples in same buffer |
366 | * |
367 | * Return: none |
368 | * |
369 | * Notes: assumes 5 guard bits in for nfft <= 512 |
370 | * gbOut = gbIn - 4 (assuming input is from PreMultiply) |
371 | * gains log2(nfft) - 2 int bits total |
372 | * so gain 7 int bits (LONG), 4 int bits (SHORT) |
373 | **************************************************************************************/ |
374 | void R4FFT(int tabidx, int *x) |
375 | { |
376 | int order = nfftlog2Tab[tabidx]; |
377 | int nfft = nfftTab[tabidx]; |
378 | |
379 | /* decimation in time */ |
380 | BitReverse(x, tabidx); |
381 | |
382 | if (order & 0x1) { |
383 | /* long block: order = 9, nfft = 512 */ |
384 | R8FirstPass(x, nfft >> 3); /* gain 1 int bit, lose 2 GB */ |
385 | R4Core(x, nfft >> 5, 8, (int *)twidTabOdd); /* gain 6 int bits, lose 2 GB */ |
386 | } else { |
387 | /* short block: order = 6, nfft = 64 */ |
388 | R4FirstPass(x, nfft >> 2); /* gain 0 int bits, lose 2 GB */ |
389 | R4Core(x, nfft >> 4, 4, (int *)twidTabEven); /* gain 4 int bits, lose 1 GB */ |
390 | } |
391 | } |
392 |