summaryrefslogtreecommitdiff
path: root/audio_codec/wfd_aac_decoder/fft.c (plain)
blob: db17d27e4aa1b52924194753dcdc8ce8d05b3c89
1/* ***** BEGIN LICENSE BLOCK *****
2 * Source last modified: $Id: fft.c,v 1.1.2.1 2005/02/26 02:05:12 jrecker Exp $
3 *
4 * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
5 *
6 * The contents of this file, and the files included with this file,
7 * are subject to the current version of the RealNetworks Public
8 * Source License (the "RPSL") available at
9 * http://www.helixcommunity.org/content/rpsl unless you have licensed
10 * the file under the current version of the RealNetworks Community
11 * Source License (the "RCSL") available at
12 * http://www.helixcommunity.org/content/rcsl, in which case the RCSL
13 * will apply. You may also obtain the license terms directly from
14 * RealNetworks. You may not use this file except in compliance with
15 * the RPSL or, if you have a valid RCSL with RealNetworks applicable
16 * to this file, the RCSL. Please see the applicable RPSL or RCSL for
17 * the rights, obligations and limitations governing use of the
18 * contents of the file.
19 *
20 * This file is part of the Helix DNA Technology. RealNetworks is the
21 * developer of the Original Code and owns the copyrights in the
22 * portions it created.
23 *
24 * This file, and the files included with this file, is distributed
25 * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
26 * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
27 * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
28 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
29 * ENJOYMENT OR NON-INFRINGEMENT.
30 *
31 * Technology Compatibility Kit Test Suite(s) Location:
32 * http://www.helixcommunity.org/content/tck
33 *
34 * Contributor(s):
35 *
36 * ***** END LICENSE BLOCK ***** */
37
38/**************************************************************************************
39 * Fixed-point HE-AAC decoder
40 * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
41 * February 2005
42 *
43 * fft.c - Ken's optimized radix-4 DIT FFT, optional radix-8 first pass for odd log2(N)
44 **************************************************************************************/
45
46#include "coder.h"
47#include "assembly.h"
48
49#define NUM_FFT_SIZES 2
50static const int nfftTab[NUM_FFT_SIZES] = {64, 512};
51static const int nfftlog2Tab[NUM_FFT_SIZES] = {6, 9};
52
53#define SQRT1_2 0x5a82799a /* sqrt(1/2) in Q31 */
54
55#define swapcplx(p0,p1) \
56 t = p0; t1 = *(&(p0)+1); p0 = p1; *(&(p0)+1) = *(&(p1)+1); p1 = t; *(&(p1)+1) = t1
57
58/**************************************************************************************
59 * Function: BitReverse
60 *
61 * Description: Ken's fast in-place bit reverse, using super-small table
62 *
63 * Inputs: buffer of samples
64 * table index (for transform size)
65 *
66 * Outputs: bit-reversed samples in same buffer
67 *
68 * Return: none
69 **************************************************************************************/
70static void BitReverse(int *inout, int tabidx)
71{
72 int *part0, *part1;
73 int a, b, t, t1;
74 const unsigned char* tab = bitrevtab + bitrevtabOffset[tabidx];
75 int nbits = nfftlog2Tab[tabidx];
76
77 part0 = inout;
78 part1 = inout + (1 << nbits);
79
80 while ((a = *tab++) != 0) {
81 b = *tab++;
82
83 swapcplx(part0[4 * a + 0], part0[4 * b + 0]); /* 0xxx0 <-> 0yyy0 */
84 swapcplx(part0[4 * a + 2], part1[4 * b + 0]); /* 0xxx1 <-> 1yyy0 */
85 swapcplx(part1[4 * a + 0], part0[4 * b + 2]); /* 1xxx0 <-> 0yyy1 */
86 swapcplx(part1[4 * a + 2], part1[4 * b + 2]); /* 1xxx1 <-> 1yyy1 */
87 }
88
89 do {
90 swapcplx(part0[4 * a + 2], part1[4 * a + 0]); /* 0xxx1 <-> 1xxx0 */
91 } while ((a = *tab++) != 0);
92}
93
94/**************************************************************************************
95 * Function: R4FirstPass
96 *
97 * Description: radix-4 trivial pass for decimation-in-time FFT
98 *
99 * Inputs: buffer of (bit-reversed) samples
100 * number of R4 butterflies per group (i.e. nfft / 4)
101 *
102 * Outputs: processed samples in same buffer
103 *
104 * Return: none
105 *
106 * Notes: assumes 2 guard bits, gains no integer bits,
107 * guard bits out = guard bits in - 2
108 **************************************************************************************/
109static void R4FirstPass(int *x, int bg)
110{
111 int ar, ai, br, bi, cr, ci, dr, di;
112
113 for (; bg != 0; bg--) {
114
115 ar = x[0] + x[2];
116 br = x[0] - x[2];
117 ai = x[1] + x[3];
118 bi = x[1] - x[3];
119 cr = x[4] + x[6];
120 dr = x[4] - x[6];
121 ci = x[5] + x[7];
122 di = x[5] - x[7];
123
124 /* max per-sample gain = 4.0 (adding 4 inputs together) */
125 x[0] = ar + cr;
126 x[4] = ar - cr;
127 x[1] = ai + ci;
128 x[5] = ai - ci;
129 x[2] = br + di;
130 x[6] = br - di;
131 x[3] = bi - dr;
132 x[7] = bi + dr;
133
134 x += 8;
135 }
136}
137
138/**************************************************************************************
139 * Function: R8FirstPass
140 *
141 * Description: radix-8 trivial pass for decimation-in-time FFT
142 *
143 * Inputs: buffer of (bit-reversed) samples
144 * number of R8 butterflies per group (i.e. nfft / 8)
145 *
146 * Outputs: processed samples in same buffer
147 *
148 * Return: none
149 *
150 * Notes: assumes 3 guard bits, gains 1 integer bit
151 * guard bits out = guard bits in - 3 (if inputs are full scale)
152 * or guard bits in - 2 (if inputs bounded to +/- sqrt(2)/2)
153 * see scaling comments in code
154 **************************************************************************************/
155static void R8FirstPass(int *x, int bg)
156{
157 int ar, ai, br, bi, cr, ci, dr, di;
158 int sr, si, tr, ti, ur, ui, vr, vi;
159 int wr, wi, xr, xi, yr, yi, zr, zi;
160
161 for (; bg != 0; bg--) {
162
163 ar = x[0] + x[2];
164 br = x[0] - x[2];
165 ai = x[1] + x[3];
166 bi = x[1] - x[3];
167 cr = x[4] + x[6];
168 dr = x[4] - x[6];
169 ci = x[5] + x[7];
170 di = x[5] - x[7];
171
172 sr = ar + cr;
173 ur = ar - cr;
174 si = ai + ci;
175 ui = ai - ci;
176 tr = br - di;
177 vr = br + di;
178 ti = bi + dr;
179 vi = bi - dr;
180
181 ar = x[ 8] + x[10];
182 br = x[ 8] - x[10];
183 ai = x[ 9] + x[11];
184 bi = x[ 9] - x[11];
185 cr = x[12] + x[14];
186 dr = x[12] - x[14];
187 ci = x[13] + x[15];
188 di = x[13] - x[15];
189
190 /* max gain of wr/wi/yr/yi vs input = 2
191 * (sum of 4 samples >> 1)
192 */
193 wr = (ar + cr) >> 1;
194 yr = (ar - cr) >> 1;
195 wi = (ai + ci) >> 1;
196 yi = (ai - ci) >> 1;
197
198 /* max gain of output vs input = 4
199 * (sum of 4 samples >> 1 + sum of 4 samples >> 1)
200 */
201 x[ 0] = (sr >> 1) + wr;
202 x[ 8] = (sr >> 1) - wr;
203 x[ 1] = (si >> 1) + wi;
204 x[ 9] = (si >> 1) - wi;
205 x[ 4] = (ur >> 1) + yi;
206 x[12] = (ur >> 1) - yi;
207 x[ 5] = (ui >> 1) - yr;
208 x[13] = (ui >> 1) + yr;
209
210 ar = br - di;
211 cr = br + di;
212 ai = bi + dr;
213 ci = bi - dr;
214
215 /* max gain of xr/xi/zr/zi vs input = 4*sqrt(2)/2 = 2*sqrt(2)
216 * (sum of 8 samples, multiply by sqrt(2)/2, implicit >> 1 from Q31)
217 */
218 xr = MULSHIFT32(SQRT1_2, ar - ai);
219 xi = MULSHIFT32(SQRT1_2, ar + ai);
220 zr = MULSHIFT32(SQRT1_2, cr - ci);
221 zi = MULSHIFT32(SQRT1_2, cr + ci);
222
223 /* max gain of output vs input = (2 + 2*sqrt(2) ~= 4.83)
224 * (sum of 4 samples >> 1, plus xr/xi/zr/zi with gain of 2*sqrt(2))
225 * in absolute terms, we have max gain of appx 9.656 (4 + 0.707*8)
226 * but we also gain 1 int bit (from MULSHIFT32 or from explicit >> 1)
227 */
228 x[ 6] = (tr >> 1) - xr;
229 x[14] = (tr >> 1) + xr;
230 x[ 7] = (ti >> 1) - xi;
231 x[15] = (ti >> 1) + xi;
232 x[ 2] = (vr >> 1) + zi;
233 x[10] = (vr >> 1) - zi;
234 x[ 3] = (vi >> 1) - zr;
235 x[11] = (vi >> 1) + zr;
236
237 x += 16;
238 }
239}
240
241/**************************************************************************************
242 * Function: R4Core
243 *
244 * Description: radix-4 pass for decimation-in-time FFT
245 *
246 * Inputs: buffer of samples
247 * number of R4 butterflies per group
248 * number of R4 groups per pass
249 * pointer to twiddle factors tables
250 *
251 * Outputs: processed samples in same buffer
252 *
253 * Return: none
254 *
255 * Notes: gain 2 integer bits per pass (see scaling comments in code)
256 * min 1 GB in
257 * gbOut = gbIn - 1 (short block) or gbIn - 2 (long block)
258 * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
259 **************************************************************************************/
260static void R4Core(int *x, int bg, int gp, int *wtab)
261{
262 int ar, ai, br, bi, cr, ci, dr, di, tr, ti;
263 int wd, ws, wi;
264 int i, j, step;
265 int *xptr, *wptr;
266
267 for (; bg != 0; gp <<= 2, bg >>= 2) {
268
269 step = 2 * gp;
270 xptr = x;
271
272 /* max per-sample gain, per group < 1 + 3*sqrt(2) ~= 5.25 if inputs x are full-scale
273 * do 3 groups for long block, 2 groups for short block (gain 2 int bits per group)
274 *
275 * very conservative scaling:
276 * group 1: max gain = 5.25, int bits gained = 2, gb used = 1 (2^3 = 8)
277 * group 2: max gain = 5.25^2 = 27.6, int bits gained = 4, gb used = 1 (2^5 = 32)
278 * group 3: max gain = 5.25^3 = 144.7, int bits gained = 6, gb used = 2 (2^8 = 256)
279 */
280 for (i = bg; i != 0; i--) {
281
282 wptr = wtab;
283
284 for (j = gp; j != 0; j--) {
285
286 ar = xptr[0];
287 ai = xptr[1];
288 xptr += step;
289
290 /* gain 2 int bits for br/bi, cr/ci, dr/di (MULSHIFT32 by Q30)
291 * gain 1 net GB
292 */
293 ws = wptr[0];
294 wi = wptr[1];
295 br = xptr[0];
296 bi = xptr[1];
297 wd = ws + 2 * wi;
298 tr = MULSHIFT32(wi, br + bi);
299 br = MULSHIFT32(wd, br) - tr; /* cos*br + sin*bi */
300 bi = MULSHIFT32(ws, bi) + tr; /* cos*bi - sin*br */
301 xptr += step;
302
303 ws = wptr[2];
304 wi = wptr[3];
305 cr = xptr[0];
306 ci = xptr[1];
307 wd = ws + 2 * wi;
308 tr = MULSHIFT32(wi, cr + ci);
309 cr = MULSHIFT32(wd, cr) - tr;
310 ci = MULSHIFT32(ws, ci) + tr;
311 xptr += step;
312
313 ws = wptr[4];
314 wi = wptr[5];
315 dr = xptr[0];
316 di = xptr[1];
317 wd = ws + 2 * wi;
318 tr = MULSHIFT32(wi, dr + di);
319 dr = MULSHIFT32(wd, dr) - tr;
320 di = MULSHIFT32(ws, di) + tr;
321 wptr += 6;
322
323 tr = ar;
324 ti = ai;
325 ar = (tr >> 2) - br;
326 ai = (ti >> 2) - bi;
327 br = (tr >> 2) + br;
328 bi = (ti >> 2) + bi;
329
330 tr = cr;
331 ti = ci;
332 cr = tr + dr;
333 ci = di - ti;
334 dr = tr - dr;
335 di = di + ti;
336
337 xptr[0] = ar + ci;
338 xptr[1] = ai + dr;
339 xptr -= step;
340 xptr[0] = br - cr;
341 xptr[1] = bi - di;
342 xptr -= step;
343 xptr[0] = ar - ci;
344 xptr[1] = ai - dr;
345 xptr -= step;
346 xptr[0] = br + cr;
347 xptr[1] = bi + di;
348 xptr += 2;
349 }
350 xptr += 3 * step;
351 }
352 wtab += 3 * step;
353 }
354}
355
356
357/**************************************************************************************
358 * Function: R4FFT
359 *
360 * Description: Ken's very fast in-place radix-4 decimation-in-time FFT
361 *
362 * Inputs: table index (for transform size)
363 * buffer of samples (non bit-reversed)
364 *
365 * Outputs: processed samples in same buffer
366 *
367 * Return: none
368 *
369 * Notes: assumes 5 guard bits in for nfft <= 512
370 * gbOut = gbIn - 4 (assuming input is from PreMultiply)
371 * gains log2(nfft) - 2 int bits total
372 * so gain 7 int bits (LONG), 4 int bits (SHORT)
373 **************************************************************************************/
374void R4FFT(int tabidx, int *x)
375{
376 int order = nfftlog2Tab[tabidx];
377 int nfft = nfftTab[tabidx];
378
379 /* decimation in time */
380 BitReverse(x, tabidx);
381
382 if (order & 0x1) {
383 /* long block: order = 9, nfft = 512 */
384 R8FirstPass(x, nfft >> 3); /* gain 1 int bit, lose 2 GB */
385 R4Core(x, nfft >> 5, 8, (int *)twidTabOdd); /* gain 6 int bits, lose 2 GB */
386 } else {
387 /* short block: order = 6, nfft = 64 */
388 R4FirstPass(x, nfft >> 2); /* gain 0 int bits, lose 2 GB */
389 R4Core(x, nfft >> 4, 4, (int *)twidTabEven); /* gain 4 int bits, lose 1 GB */
390 }
391}
392