blob: 6091de9c746d04b214f61d47ab356cc89f359ec8
1 | /* ***** BEGIN LICENSE BLOCK ***** |
2 | * Source last modified: $Id: fft.c,v 1.4 2005/04/27 19:20:50 hubbe Exp $ |
3 | * |
4 | * REALNETWORKS CONFIDENTIAL--NOT FOR DISTRIBUTION IN SOURCE CODE FORM |
5 | * Portions Copyright (c) 1995-2002 RealNetworks, Inc. |
6 | * All Rights Reserved. |
7 | * |
8 | * The contents of this file, and the files included with this file, |
9 | * are subject to the current version of the Real Format Source Code |
10 | * Porting and Optimization License, available at |
11 | * https://helixcommunity.org/2005/license/realformatsource (unless |
12 | * RealNetworks otherwise expressly agrees in writing that you are |
13 | * subject to a different license). You may also obtain the license |
14 | * terms directly from RealNetworks. You may not use this file except |
15 | * in compliance with the Real Format Source Code Porting and |
16 | * Optimization License. There are no redistribution rights for the |
17 | * source code of this file. Please see the Real Format Source Code |
18 | * Porting and Optimization License for the rights, obligations and |
19 | * limitations governing use of the contents of the file. |
20 | * |
21 | * RealNetworks is the developer of the Original Code and owns the |
22 | * copyrights in the portions it created. |
23 | * |
24 | * This file, and the files included with this file, is distributed and |
25 | * made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, |
26 | * EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL |
27 | * SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF |
28 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT |
29 | * OR NON-INFRINGEMENT. |
30 | * |
31 | * Technology Compatibility Kit Test Suite(s) Location: |
32 | * https://rarvcode-tck.helixcommunity.org |
33 | * |
34 | * Contributor(s): |
35 | * |
36 | * ***** END LICENSE BLOCK ***** */ |
37 | |
38 | /************************************************************************************** |
39 | * Fixed-point RealAudio 8 decoder |
40 | * Ken Cooke (kenc@real.com), Jon Recker(jrecker@real.com) |
41 | * October 2003 |
42 | * |
43 | * fft.c - Ken's highly optimized radix-4 DIT FFT, with optional radix-8 first pass |
44 | * for odd powers of 2 |
45 | **************************************************************************************/ |
46 | |
47 | #include "coder.h" |
48 | #include "assembly.h" |
49 | |
50 | #define NUM_FFT_SIZES 3 |
51 | static const int nfftTab[NUM_FFT_SIZES] = {128, 256, 512}; |
52 | static const int nfftlog2Tab[NUM_FFT_SIZES] = {7, 8, 9}; |
53 | |
54 | #define SQRT1_2 1518500250 /* sqrt(1/2) in Q31 */ |
55 | |
56 | #define swapcplx(p0,p1) \ |
57 | t = p0; t1 = *(&(p0)+1); p0 = p1; *(&(p0)+1) = *(&(p1)+1); p1 = t; *(&(p1)+1) = t1 |
58 | |
59 | /************************************************************************************** |
60 | * Function: BitReverse |
61 | * |
62 | * Description: Ken's fast in-place bit reverse, using super-small table |
63 | * |
64 | * Inputs: buffer of samples |
65 | * table index (for transform size) |
66 | * |
67 | * Outputs: bit-reversed samples in same buffer |
68 | * |
69 | * Return: none |
70 | **************************************************************************************/ |
71 | static void BitReverse(int *inout, int tabidx) |
72 | { |
73 | /* 1st part: non-id */ |
74 | int *part0, *part1; |
75 | int a, b, t, t1; |
76 | const unsigned char* tab = bitrevtab + bitrevtabOffset[tabidx]; |
77 | int nbits = nfftlog2Tab[tabidx]; |
78 | |
79 | part0 = inout; |
80 | part1 = inout + (1 << nbits); |
81 | |
82 | while ((a = *tab++) != 0) { |
83 | b = *tab++; |
84 | |
85 | swapcplx(part0[4 * a + 0], part0[4 * b + 0]); /* 0xxx0 <-> 0yyy0 */ |
86 | swapcplx(part0[4 * a + 2], part1[4 * b + 0]); /* 0xxx1 <-> 1yyy0 */ |
87 | swapcplx(part1[4 * a + 0], part0[4 * b + 2]); /* 1xxx0 <-> 0yyy1 */ |
88 | swapcplx(part1[4 * a + 2], part1[4 * b + 2]); /* 1xxx1 <-> 1yyy1 */ |
89 | } |
90 | |
91 | do { |
92 | swapcplx(part0[4 * a + 2], part1[4 * a + 0]); /* 0xxx1 <-> 1xxx0 */ |
93 | } while ((a = *tab++) != 0); |
94 | } |
95 | |
96 | /************************************************************************************** |
97 | * Function: R4FirstPass |
98 | * |
99 | * Description: radix-4 trivial pass for decimation-in-time FFT |
100 | * |
101 | * Inputs: buffer of (bit-reversed) samples |
102 | * number of R4 butterflies per group (i.e. nfft / 4) |
103 | * |
104 | * Outputs: processed samples in same buffer |
105 | * |
106 | * Return: none |
107 | * |
108 | * Notes: assumes 3 guard bits, gains no integer bits, |
109 | * guard bits out = guard bits in - 3 |
110 | **************************************************************************************/ |
111 | static void R4FirstPass(int *x, int bg) |
112 | { |
113 | int ar, ai, br, bi, cr, ci, dr, di; |
114 | |
115 | for (; bg != 0; bg--) { |
116 | |
117 | ar = x[0] + x[2]; |
118 | br = x[0] - x[2]; |
119 | ai = x[1] + x[3]; |
120 | bi = x[1] - x[3]; |
121 | cr = x[4] + x[6]; |
122 | dr = x[4] - x[6]; |
123 | ci = x[5] + x[7]; |
124 | di = x[5] - x[7]; |
125 | |
126 | x[0] = ar + cr; |
127 | x[4] = ar - cr; |
128 | x[1] = ai + ci; |
129 | x[5] = ai - ci; |
130 | x[2] = br + di; |
131 | x[6] = br - di; |
132 | x[3] = bi - dr; |
133 | x[7] = bi + dr; |
134 | |
135 | x += 8; |
136 | } |
137 | } |
138 | |
139 | /************************************************************************************** |
140 | * Function: R8FirstPass |
141 | * |
142 | * Description: radix-8 trivial pass for decimation-in-time FFT |
143 | * |
144 | * Inputs: buffer of (bit-reversed) samples |
145 | * number of R8 butterflies per group (i.e. nfft / 8) |
146 | * |
147 | * Outputs: processed samples in same buffer |
148 | * |
149 | * Return: none |
150 | * |
151 | * Notes: assumes 3 guard bits, gains 1 integer bit, |
152 | * guard bits out = guard bits in - 3 |
153 | **************************************************************************************/ |
154 | static void R8FirstPass(int *x, int bg) |
155 | { |
156 | int ar, ai, br, bi, cr, ci, dr, di; |
157 | int sr, si, tr, ti, ur, ui, vr, vi; |
158 | int wr, wi, xr, xi, yr, yi, zr, zi; |
159 | |
160 | for (; bg != 0; bg--) { |
161 | |
162 | ar = x[0] + x[2]; |
163 | br = x[0] - x[2]; |
164 | ai = x[1] + x[3]; |
165 | bi = x[1] - x[3]; |
166 | cr = x[4] + x[6]; |
167 | dr = x[4] - x[6]; |
168 | ci = x[5] + x[7]; |
169 | di = x[5] - x[7]; |
170 | |
171 | sr = ar + cr; |
172 | ur = ar - cr; |
173 | si = ai + ci; |
174 | ui = ai - ci; |
175 | tr = br - di; |
176 | vr = br + di; |
177 | ti = bi + dr; |
178 | vi = bi - dr; |
179 | |
180 | ar = x[ 8] + x[10]; |
181 | br = x[ 8] - x[10]; |
182 | ai = x[ 9] + x[11]; |
183 | bi = x[ 9] - x[11]; |
184 | cr = x[12] + x[14]; |
185 | dr = x[12] - x[14]; |
186 | ci = x[13] + x[15]; |
187 | di = x[13] - x[15]; |
188 | |
189 | wr = (ar + cr) >> 1; |
190 | yr = (ar - cr) >> 1; |
191 | wi = (ai + ci) >> 1; |
192 | yi = (ai - ci) >> 1; |
193 | |
194 | /* gain 1 bit per R8 pass */ |
195 | x[ 0] = (sr >> 1) + wr; |
196 | x[ 8] = (sr >> 1) - wr; |
197 | x[ 1] = (si >> 1) + wi; |
198 | x[ 9] = (si >> 1) - wi; |
199 | x[ 4] = (ur >> 1) + yi; |
200 | x[12] = (ur >> 1) - yi; |
201 | x[ 5] = (ui >> 1) - yr; |
202 | x[13] = (ui >> 1) + yr; |
203 | |
204 | ar = br - di; |
205 | cr = br + di; |
206 | ai = bi + dr; |
207 | ci = bi - dr; |
208 | |
209 | /* gain 1 bit from mul by Q31 */ |
210 | xr = MULSHIFT32(SQRT1_2, ar - ai); |
211 | xi = MULSHIFT32(SQRT1_2, ar + ai); |
212 | zr = MULSHIFT32(SQRT1_2, cr - ci); |
213 | zi = MULSHIFT32(SQRT1_2, cr + ci); |
214 | |
215 | x[ 6] = (tr >> 1) - xr; |
216 | x[14] = (tr >> 1) + xr; |
217 | x[ 7] = (ti >> 1) - xi; |
218 | x[15] = (ti >> 1) + xi; |
219 | x[ 2] = (vr >> 1) + zi; |
220 | x[10] = (vr >> 1) - zi; |
221 | x[ 3] = (vi >> 1) - zr; |
222 | x[11] = (vi >> 1) + zr; |
223 | |
224 | x += 16; |
225 | } |
226 | } |
227 | |
228 | /************************************************************************************** |
229 | * Function: R4Core |
230 | * |
231 | * Description: radix-4 pass for decimation-in-time FFT |
232 | * number of R4 butterflies per group |
233 | * number of R4 groups per pass |
234 | * pointer to twiddle factors tables |
235 | * |
236 | * Inputs: buffer of samples |
237 | * |
238 | * Outputs: processed samples in same buffer |
239 | * |
240 | * Return: none |
241 | * |
242 | * Notes: gain 2 integer bits per pass (see notes on R4FFT for scaling info) |
243 | * for final multiply, can use SMLAL (if it pays off - needs extra |
244 | * mov rLO, #0 before SMLAL if using inline asm from C) |
245 | * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add |
246 | **************************************************************************************/ |
247 | static void R4Core(int *x, int bg, int gp, int *wtab) |
248 | { |
249 | int ar, ai, br, bi, cr, ci, dr, di, tr, ti; |
250 | int wd, ws, wi; |
251 | int i, j, step; |
252 | int *xptr, *wptr; |
253 | |
254 | for (; bg != 0; gp <<= 2, bg >>= 2) { |
255 | |
256 | step = 2 * gp; |
257 | xptr = x; |
258 | |
259 | for (i = bg; i != 0; i--) { |
260 | |
261 | wptr = wtab; |
262 | |
263 | for (j = gp; j != 0; j--) { |
264 | |
265 | ar = xptr[0]; |
266 | ai = xptr[1]; |
267 | xptr += step; |
268 | |
269 | ws = wptr[0]; |
270 | wi = wptr[1]; |
271 | br = xptr[0]; |
272 | bi = xptr[1]; |
273 | wd = ws + 2 * wi; |
274 | tr = MULSHIFT32(wi, br + bi); |
275 | br = MULSHIFT32(wd, br) - tr; |
276 | bi = MULSHIFT32(ws, bi) + tr; |
277 | xptr += step; |
278 | |
279 | ws = wptr[2]; |
280 | wi = wptr[3]; |
281 | cr = xptr[0]; |
282 | ci = xptr[1]; |
283 | wd = ws + 2 * wi; |
284 | tr = MULSHIFT32(wi, cr + ci); |
285 | cr = MULSHIFT32(wd, cr) - tr; |
286 | ci = MULSHIFT32(ws, ci) + tr; |
287 | xptr += step; |
288 | |
289 | ws = wptr[4]; |
290 | wi = wptr[5]; |
291 | dr = xptr[0]; |
292 | di = xptr[1]; |
293 | wd = ws + 2 * wi; |
294 | tr = MULSHIFT32(wi, dr + di); |
295 | dr = MULSHIFT32(wd, dr) - tr; |
296 | di = MULSHIFT32(ws, di) + tr; |
297 | wptr += 6; |
298 | |
299 | /* have now gained 1 bit for br,bi,cr,ci,dr,di |
300 | * gain 1 more bit for by skipping 2*br,bi,cr,ci,dr,di in additions |
301 | * manually >> 2 on ar,ai to normalize (trivial case, no mul) |
302 | * net gain = 2 int bits per x[i], per R4 pass |
303 | */ |
304 | tr = ar; |
305 | ti = ai; |
306 | ar = (tr >> 2) - br; |
307 | ai = (ti >> 2) - bi; |
308 | br = (tr >> 2) + br; |
309 | bi = (ti >> 2) + bi; |
310 | |
311 | tr = cr; |
312 | ti = ci; |
313 | cr = tr + dr; |
314 | ci = di - ti; |
315 | dr = tr - dr; |
316 | di = di + ti; |
317 | |
318 | xptr[0] = ar + ci; |
319 | xptr[1] = ai + dr; |
320 | xptr -= step; |
321 | xptr[0] = br - cr; |
322 | xptr[1] = bi - di; |
323 | xptr -= step; |
324 | xptr[0] = ar - ci; |
325 | xptr[1] = ai - dr; |
326 | xptr -= step; |
327 | xptr[0] = br + cr; |
328 | xptr[1] = bi + di; |
329 | xptr += 2; |
330 | } |
331 | xptr += 3 * step; |
332 | } |
333 | wtab += 3 * step; |
334 | } |
335 | } |
336 | |
337 | |
338 | /************************************************************************************** |
339 | * Function: R4FFT |
340 | * |
341 | * Description: Ken's very fast in-place radix-4 decimation-in-time FFT |
342 | * |
343 | * Inputs: table index (for transform size) |
344 | * buffer of samples (non bit-reversed) |
345 | * |
346 | * Outputs: processed samples in same buffer |
347 | * |
348 | * Return: none |
349 | * |
350 | * Notes: assumes 4 guard bits in for nfft = [128, 256, 512] |
351 | * gains log2(nfft) - 2 int bits total |
352 | **************************************************************************************/ |
353 | void R4FFT(int tabidx, int *x) |
354 | { |
355 | int order = nfftlog2Tab[tabidx]; |
356 | int nfft = nfftTab[tabidx]; |
357 | |
358 | /* decimation in time */ |
359 | BitReverse(x, tabidx); |
360 | |
361 | if (order & 0x1) { |
362 | R8FirstPass(x, nfft >> 3); |
363 | R4Core(x, nfft >> 5, 8, (int *)twidTabOdd); |
364 | } else { |
365 | R4FirstPass(x, nfft >> 2); |
366 | R4Core(x, nfft >> 4, 4, (int *)twidTabEven); |
367 | } |
368 | } |
369 |