summaryrefslogtreecommitdiff
path: root/audio_codec/libcook/ra_fft.c (plain)
blob: 6091de9c746d04b214f61d47ab356cc89f359ec8
1/* ***** BEGIN LICENSE BLOCK *****
2 * Source last modified: $Id: fft.c,v 1.4 2005/04/27 19:20:50 hubbe Exp $
3 *
4 * REALNETWORKS CONFIDENTIAL--NOT FOR DISTRIBUTION IN SOURCE CODE FORM
5 * Portions Copyright (c) 1995-2002 RealNetworks, Inc.
6 * All Rights Reserved.
7 *
8 * The contents of this file, and the files included with this file,
9 * are subject to the current version of the Real Format Source Code
10 * Porting and Optimization License, available at
11 * https://helixcommunity.org/2005/license/realformatsource (unless
12 * RealNetworks otherwise expressly agrees in writing that you are
13 * subject to a different license). You may also obtain the license
14 * terms directly from RealNetworks. You may not use this file except
15 * in compliance with the Real Format Source Code Porting and
16 * Optimization License. There are no redistribution rights for the
17 * source code of this file. Please see the Real Format Source Code
18 * Porting and Optimization License for the rights, obligations and
19 * limitations governing use of the contents of the file.
20 *
21 * RealNetworks is the developer of the Original Code and owns the
22 * copyrights in the portions it created.
23 *
24 * This file, and the files included with this file, is distributed and
25 * made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND,
26 * EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL
27 * SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF
28 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT
29 * OR NON-INFRINGEMENT.
30 *
31 * Technology Compatibility Kit Test Suite(s) Location:
32 * https://rarvcode-tck.helixcommunity.org
33 *
34 * Contributor(s):
35 *
36 * ***** END LICENSE BLOCK ***** */
37
38/**************************************************************************************
39 * Fixed-point RealAudio 8 decoder
40 * Ken Cooke (kenc@real.com), Jon Recker(jrecker@real.com)
41 * October 2003
42 *
43 * fft.c - Ken's highly optimized radix-4 DIT FFT, with optional radix-8 first pass
44 * for odd powers of 2
45 **************************************************************************************/
46
47#include "coder.h"
48#include "assembly.h"
49
50#define NUM_FFT_SIZES 3
51static const int nfftTab[NUM_FFT_SIZES] = {128, 256, 512};
52static const int nfftlog2Tab[NUM_FFT_SIZES] = {7, 8, 9};
53
54#define SQRT1_2 1518500250 /* sqrt(1/2) in Q31 */
55
56#define swapcplx(p0,p1) \
57 t = p0; t1 = *(&(p0)+1); p0 = p1; *(&(p0)+1) = *(&(p1)+1); p1 = t; *(&(p1)+1) = t1
58
59/**************************************************************************************
60 * Function: BitReverse
61 *
62 * Description: Ken's fast in-place bit reverse, using super-small table
63 *
64 * Inputs: buffer of samples
65 * table index (for transform size)
66 *
67 * Outputs: bit-reversed samples in same buffer
68 *
69 * Return: none
70 **************************************************************************************/
71static void BitReverse(int *inout, int tabidx)
72{
73 /* 1st part: non-id */
74 int *part0, *part1;
75 int a, b, t, t1;
76 const unsigned char* tab = bitrevtab + bitrevtabOffset[tabidx];
77 int nbits = nfftlog2Tab[tabidx];
78
79 part0 = inout;
80 part1 = inout + (1 << nbits);
81
82 while ((a = *tab++) != 0) {
83 b = *tab++;
84
85 swapcplx(part0[4 * a + 0], part0[4 * b + 0]); /* 0xxx0 <-> 0yyy0 */
86 swapcplx(part0[4 * a + 2], part1[4 * b + 0]); /* 0xxx1 <-> 1yyy0 */
87 swapcplx(part1[4 * a + 0], part0[4 * b + 2]); /* 1xxx0 <-> 0yyy1 */
88 swapcplx(part1[4 * a + 2], part1[4 * b + 2]); /* 1xxx1 <-> 1yyy1 */
89 }
90
91 do {
92 swapcplx(part0[4 * a + 2], part1[4 * a + 0]); /* 0xxx1 <-> 1xxx0 */
93 } while ((a = *tab++) != 0);
94}
95
96/**************************************************************************************
97 * Function: R4FirstPass
98 *
99 * Description: radix-4 trivial pass for decimation-in-time FFT
100 *
101 * Inputs: buffer of (bit-reversed) samples
102 * number of R4 butterflies per group (i.e. nfft / 4)
103 *
104 * Outputs: processed samples in same buffer
105 *
106 * Return: none
107 *
108 * Notes: assumes 3 guard bits, gains no integer bits,
109 * guard bits out = guard bits in - 3
110 **************************************************************************************/
111static void R4FirstPass(int *x, int bg)
112{
113 int ar, ai, br, bi, cr, ci, dr, di;
114
115 for (; bg != 0; bg--) {
116
117 ar = x[0] + x[2];
118 br = x[0] - x[2];
119 ai = x[1] + x[3];
120 bi = x[1] - x[3];
121 cr = x[4] + x[6];
122 dr = x[4] - x[6];
123 ci = x[5] + x[7];
124 di = x[5] - x[7];
125
126 x[0] = ar + cr;
127 x[4] = ar - cr;
128 x[1] = ai + ci;
129 x[5] = ai - ci;
130 x[2] = br + di;
131 x[6] = br - di;
132 x[3] = bi - dr;
133 x[7] = bi + dr;
134
135 x += 8;
136 }
137}
138
139/**************************************************************************************
140 * Function: R8FirstPass
141 *
142 * Description: radix-8 trivial pass for decimation-in-time FFT
143 *
144 * Inputs: buffer of (bit-reversed) samples
145 * number of R8 butterflies per group (i.e. nfft / 8)
146 *
147 * Outputs: processed samples in same buffer
148 *
149 * Return: none
150 *
151 * Notes: assumes 3 guard bits, gains 1 integer bit,
152 * guard bits out = guard bits in - 3
153 **************************************************************************************/
154static void R8FirstPass(int *x, int bg)
155{
156 int ar, ai, br, bi, cr, ci, dr, di;
157 int sr, si, tr, ti, ur, ui, vr, vi;
158 int wr, wi, xr, xi, yr, yi, zr, zi;
159
160 for (; bg != 0; bg--) {
161
162 ar = x[0] + x[2];
163 br = x[0] - x[2];
164 ai = x[1] + x[3];
165 bi = x[1] - x[3];
166 cr = x[4] + x[6];
167 dr = x[4] - x[6];
168 ci = x[5] + x[7];
169 di = x[5] - x[7];
170
171 sr = ar + cr;
172 ur = ar - cr;
173 si = ai + ci;
174 ui = ai - ci;
175 tr = br - di;
176 vr = br + di;
177 ti = bi + dr;
178 vi = bi - dr;
179
180 ar = x[ 8] + x[10];
181 br = x[ 8] - x[10];
182 ai = x[ 9] + x[11];
183 bi = x[ 9] - x[11];
184 cr = x[12] + x[14];
185 dr = x[12] - x[14];
186 ci = x[13] + x[15];
187 di = x[13] - x[15];
188
189 wr = (ar + cr) >> 1;
190 yr = (ar - cr) >> 1;
191 wi = (ai + ci) >> 1;
192 yi = (ai - ci) >> 1;
193
194 /* gain 1 bit per R8 pass */
195 x[ 0] = (sr >> 1) + wr;
196 x[ 8] = (sr >> 1) - wr;
197 x[ 1] = (si >> 1) + wi;
198 x[ 9] = (si >> 1) - wi;
199 x[ 4] = (ur >> 1) + yi;
200 x[12] = (ur >> 1) - yi;
201 x[ 5] = (ui >> 1) - yr;
202 x[13] = (ui >> 1) + yr;
203
204 ar = br - di;
205 cr = br + di;
206 ai = bi + dr;
207 ci = bi - dr;
208
209 /* gain 1 bit from mul by Q31 */
210 xr = MULSHIFT32(SQRT1_2, ar - ai);
211 xi = MULSHIFT32(SQRT1_2, ar + ai);
212 zr = MULSHIFT32(SQRT1_2, cr - ci);
213 zi = MULSHIFT32(SQRT1_2, cr + ci);
214
215 x[ 6] = (tr >> 1) - xr;
216 x[14] = (tr >> 1) + xr;
217 x[ 7] = (ti >> 1) - xi;
218 x[15] = (ti >> 1) + xi;
219 x[ 2] = (vr >> 1) + zi;
220 x[10] = (vr >> 1) - zi;
221 x[ 3] = (vi >> 1) - zr;
222 x[11] = (vi >> 1) + zr;
223
224 x += 16;
225 }
226}
227
228/**************************************************************************************
229 * Function: R4Core
230 *
231 * Description: radix-4 pass for decimation-in-time FFT
232 * number of R4 butterflies per group
233 * number of R4 groups per pass
234 * pointer to twiddle factors tables
235 *
236 * Inputs: buffer of samples
237 *
238 * Outputs: processed samples in same buffer
239 *
240 * Return: none
241 *
242 * Notes: gain 2 integer bits per pass (see notes on R4FFT for scaling info)
243 * for final multiply, can use SMLAL (if it pays off - needs extra
244 * mov rLO, #0 before SMLAL if using inline asm from C)
245 * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
246 **************************************************************************************/
247static void R4Core(int *x, int bg, int gp, int *wtab)
248{
249 int ar, ai, br, bi, cr, ci, dr, di, tr, ti;
250 int wd, ws, wi;
251 int i, j, step;
252 int *xptr, *wptr;
253
254 for (; bg != 0; gp <<= 2, bg >>= 2) {
255
256 step = 2 * gp;
257 xptr = x;
258
259 for (i = bg; i != 0; i--) {
260
261 wptr = wtab;
262
263 for (j = gp; j != 0; j--) {
264
265 ar = xptr[0];
266 ai = xptr[1];
267 xptr += step;
268
269 ws = wptr[0];
270 wi = wptr[1];
271 br = xptr[0];
272 bi = xptr[1];
273 wd = ws + 2 * wi;
274 tr = MULSHIFT32(wi, br + bi);
275 br = MULSHIFT32(wd, br) - tr;
276 bi = MULSHIFT32(ws, bi) + tr;
277 xptr += step;
278
279 ws = wptr[2];
280 wi = wptr[3];
281 cr = xptr[0];
282 ci = xptr[1];
283 wd = ws + 2 * wi;
284 tr = MULSHIFT32(wi, cr + ci);
285 cr = MULSHIFT32(wd, cr) - tr;
286 ci = MULSHIFT32(ws, ci) + tr;
287 xptr += step;
288
289 ws = wptr[4];
290 wi = wptr[5];
291 dr = xptr[0];
292 di = xptr[1];
293 wd = ws + 2 * wi;
294 tr = MULSHIFT32(wi, dr + di);
295 dr = MULSHIFT32(wd, dr) - tr;
296 di = MULSHIFT32(ws, di) + tr;
297 wptr += 6;
298
299 /* have now gained 1 bit for br,bi,cr,ci,dr,di
300 * gain 1 more bit for by skipping 2*br,bi,cr,ci,dr,di in additions
301 * manually >> 2 on ar,ai to normalize (trivial case, no mul)
302 * net gain = 2 int bits per x[i], per R4 pass
303 */
304 tr = ar;
305 ti = ai;
306 ar = (tr >> 2) - br;
307 ai = (ti >> 2) - bi;
308 br = (tr >> 2) + br;
309 bi = (ti >> 2) + bi;
310
311 tr = cr;
312 ti = ci;
313 cr = tr + dr;
314 ci = di - ti;
315 dr = tr - dr;
316 di = di + ti;
317
318 xptr[0] = ar + ci;
319 xptr[1] = ai + dr;
320 xptr -= step;
321 xptr[0] = br - cr;
322 xptr[1] = bi - di;
323 xptr -= step;
324 xptr[0] = ar - ci;
325 xptr[1] = ai - dr;
326 xptr -= step;
327 xptr[0] = br + cr;
328 xptr[1] = bi + di;
329 xptr += 2;
330 }
331 xptr += 3 * step;
332 }
333 wtab += 3 * step;
334 }
335}
336
337
338/**************************************************************************************
339 * Function: R4FFT
340 *
341 * Description: Ken's very fast in-place radix-4 decimation-in-time FFT
342 *
343 * Inputs: table index (for transform size)
344 * buffer of samples (non bit-reversed)
345 *
346 * Outputs: processed samples in same buffer
347 *
348 * Return: none
349 *
350 * Notes: assumes 4 guard bits in for nfft = [128, 256, 512]
351 * gains log2(nfft) - 2 int bits total
352 **************************************************************************************/
353void R4FFT(int tabidx, int *x)
354{
355 int order = nfftlog2Tab[tabidx];
356 int nfft = nfftTab[tabidx];
357
358 /* decimation in time */
359 BitReverse(x, tabidx);
360
361 if (order & 0x1) {
362 R8FirstPass(x, nfft >> 3);
363 R4Core(x, nfft >> 5, 8, (int *)twidTabOdd);
364 } else {
365 R4FirstPass(x, nfft >> 2);
366 R4Core(x, nfft >> 4, 4, (int *)twidTabEven);
367 }
368}
369