platform/hardware/amlogic/LibAudio.git - Unnamed repository; edit this file 'description' to name the repository.

1 /* ***** BEGIN LICENSE BLOCK *****
2  * Source last modified: $Id: fft.c,v 1.4 2005/04/27 19:20:50 hubbe Exp $
3  *
4  * REALNETWORKS CONFIDENTIAL--NOT FOR DISTRIBUTION IN SOURCE CODE FORM
5  * Portions Copyright (c) 1995-2002 RealNetworks, Inc.
6  * All Rights Reserved.
7  *
8  * The contents of this file, and the files included with this file,
9  * are subject to the current version of the Real Format Source Code
10  * Porting and Optimization License, available at
11  * https://helixcommunity.org/2005/license/realformatsource (unless
12  * RealNetworks otherwise expressly agrees in writing that you are
13  * subject to a different license).  You may also obtain the license
14  * terms directly from RealNetworks.  You may not use this file except
15  * in compliance with the Real Format Source Code Porting and
16  * Optimization License. There are no redistribution rights for the
17  * source code of this file. Please see the Real Format Source Code
18  * Porting and Optimization License for the rights, obligations and
19  * limitations governing use of the contents of the file.
20  *
21  * RealNetworks is the developer of the Original Code and owns the
22  * copyrights in the portions it created.
23  *
24  * This file, and the files included with this file, is distributed and
25  * made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND,
26  * EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL
27  * SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF
28  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT
29  * OR NON-INFRINGEMENT.
30  *
31  * Technology Compatibility Kit Test Suite(s) Location:
32  * https://rarvcode-tck.helixcommunity.org
33  *
34  * Contributor(s):
35  *
36  * ***** END LICENSE BLOCK ***** */
37
38 /**************************************************************************************
39  * Fixed-point RealAudio 8 decoder
40  * Ken Cooke (kenc@real.com), Jon Recker(jrecker@real.com)
41  * October 2003
42  *
43  * fft.c - Ken's highly optimized radix-4 DIT FFT, with optional radix-8 first pass
44  *           for odd powers of 2
45  **************************************************************************************/
46
47 #include "coder.h"
48 #include "assembly.h"
49
50 #define NUM_FFT_SIZES   3
51 static const int nfftTab[NUM_FFT_SIZES] =       {128, 256, 512};
52 static const int nfftlog2Tab[NUM_FFT_SIZES] =   {7, 8, 9};
53
54 #define SQRT1_2 1518500250  /* sqrt(1/2) in Q31 */
55
56 #define swapcplx(p0,p1) \
57     t = p0; t1 = *(&(p0)+1); p0 = p1; *(&(p0)+1) = *(&(p1)+1); p1 = t; *(&(p1)+1) = t1
58
59 /**************************************************************************************
60  * Function:    BitReverse
61  *
62  * Description: Ken's fast in-place bit reverse, using super-small table
63  *
64  * Inputs:      buffer of samples
65  *              table index (for transform size)
66  *
67  * Outputs:     bit-reversed samples in same buffer
68  *
69  * Return:      none
70  **************************************************************************************/
71 static void BitReverse(int *inout, int tabidx)
72 {
73     /* 1st part: non-id */
74     int *part0, *part1;
75     int a, b, t, t1;
76     const unsigned char* tab = bitrevtab + bitrevtabOffset[tabidx];
77     int nbits = nfftlog2Tab[tabidx];
78
79     part0 = inout;
80     part1 = inout + (1 << nbits);
81
82     while ((a = *tab++) != 0) {
83         b = *tab++;
84
85         swapcplx(part0[4 * a + 0], part0[4 * b + 0]); /* 0xxx0 <-> 0yyy0 */
86         swapcplx(part0[4 * a + 2], part1[4 * b + 0]); /* 0xxx1 <-> 1yyy0 */
87         swapcplx(part1[4 * a + 0], part0[4 * b + 2]); /* 1xxx0 <-> 0yyy1 */
88         swapcplx(part1[4 * a + 2], part1[4 * b + 2]); /* 1xxx1 <-> 1yyy1 */
89     }
90
91     do {
92         swapcplx(part0[4 * a + 2], part1[4 * a + 0]); /* 0xxx1 <-> 1xxx0 */
93     } while ((a = *tab++) != 0);
94 }
95
96 /**************************************************************************************
97  * Function:    R4FirstPass
98  *
99  * Description: radix-4 trivial pass for decimation-in-time FFT
100  *
101  * Inputs:      buffer of (bit-reversed) samples
102  *              number of R4 butterflies per group (i.e. nfft / 4)
103  *
104  * Outputs:     processed samples in same buffer
105  *
106  * Return:      none
107  *
108  * Notes:       assumes 3 guard bits, gains no integer bits,
109  *                guard bits out = guard bits in - 3
110  **************************************************************************************/
111 static void R4FirstPass(int *x, int bg)
112 {
113     int ar, ai, br, bi, cr, ci, dr, di;
114
115     for (; bg != 0; bg--) {
116
117         ar = x[0] + x[2];
118         br = x[0] - x[2];
119         ai = x[1] + x[3];
120         bi = x[1] - x[3];
121         cr = x[4] + x[6];
122         dr = x[4] - x[6];
123         ci = x[5] + x[7];
124         di = x[5] - x[7];
125
126         x[0] = ar + cr;
127         x[4] = ar - cr;
128         x[1] = ai + ci;
129         x[5] = ai - ci;
130         x[2] = br + di;
131         x[6] = br - di;
132         x[3] = bi - dr;
133         x[7] = bi + dr;
134
135         x += 8;
136     }
137 }
138
139 /**************************************************************************************
140  * Function:    R8FirstPass
141  *
142  * Description: radix-8 trivial pass for decimation-in-time FFT
143  *
144  * Inputs:      buffer of (bit-reversed) samples
145  *              number of R8 butterflies per group (i.e. nfft / 8)
146  *
147  * Outputs:     processed samples in same buffer
148  *
149  * Return:      none
150  *
151  * Notes:       assumes 3 guard bits, gains 1 integer bit,
152  *                guard bits out = guard bits in - 3
153  **************************************************************************************/
154 static void R8FirstPass(int *x, int bg)
155 {
156     int ar, ai, br, bi, cr, ci, dr, di;
157     int sr, si, tr, ti, ur, ui, vr, vi;
158     int wr, wi, xr, xi, yr, yi, zr, zi;
159
160     for (; bg != 0; bg--) {
161
162         ar = x[0] + x[2];
163         br = x[0] - x[2];
164         ai = x[1] + x[3];
165         bi = x[1] - x[3];
166         cr = x[4] + x[6];
167         dr = x[4] - x[6];
168         ci = x[5] + x[7];
169         di = x[5] - x[7];
170
171         sr = ar + cr;
172         ur = ar - cr;
173         si = ai + ci;
174         ui = ai - ci;
175         tr = br - di;
176         vr = br + di;
177         ti = bi + dr;
178         vi = bi - dr;
179
180         ar = x[ 8] + x[10];
181         br = x[ 8] - x[10];
182         ai = x[ 9] + x[11];
183         bi = x[ 9] - x[11];
184         cr = x[12] + x[14];
185         dr = x[12] - x[14];
186         ci = x[13] + x[15];
187         di = x[13] - x[15];
188
189         wr = (ar + cr) >> 1;
190         yr = (ar - cr) >> 1;
191         wi = (ai + ci) >> 1;
192         yi = (ai - ci) >> 1;
193
194         /* gain 1 bit per R8 pass */
195         x[ 0] = (sr >> 1) + wr;
196         x[ 8] = (sr >> 1) - wr;
197         x[ 1] = (si >> 1) + wi;
198         x[ 9] = (si >> 1) - wi;
199         x[ 4] = (ur >> 1) + yi;
200         x[12] = (ur >> 1) - yi;
201         x[ 5] = (ui >> 1) - yr;
202         x[13] = (ui >> 1) + yr;
203
204         ar = br - di;
205         cr = br + di;
206         ai = bi + dr;
207         ci = bi - dr;
208
209         /* gain 1 bit from mul by Q31 */
210         xr = MULSHIFT32(SQRT1_2, ar - ai);
211         xi = MULSHIFT32(SQRT1_2, ar + ai);
212         zr = MULSHIFT32(SQRT1_2, cr - ci);
213         zi = MULSHIFT32(SQRT1_2, cr + ci);
214
215         x[ 6] = (tr >> 1) - xr;
216         x[14] = (tr >> 1) + xr;
217         x[ 7] = (ti >> 1) - xi;
218         x[15] = (ti >> 1) + xi;
219         x[ 2] = (vr >> 1) + zi;
220         x[10] = (vr >> 1) - zi;
221         x[ 3] = (vi >> 1) - zr;
222         x[11] = (vi >> 1) + zr;
223
224         x += 16;
225     }
226 }
227
228 /**************************************************************************************
229  * Function:    R4Core
230  *
231  * Description: radix-4 pass for decimation-in-time FFT
232  *              number of R4 butterflies per group
233  *              number of R4 groups per pass
234  *              pointer to twiddle factors tables
235  *
236  * Inputs:      buffer of samples
237  *
238  * Outputs:     processed samples in same buffer
239  *
240  * Return:      none
241  *
242  * Notes:       gain 2 integer bits per pass (see notes on R4FFT for scaling info)
243  *              for final multiply, can use SMLAL (if it pays off - needs extra
244  *                mov rLO, #0 before SMLAL if using inline asm from C)
245  *              uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
246  **************************************************************************************/
247 static void R4Core(int *x, int bg, int gp, int *wtab)
248 {
249     int ar, ai, br, bi, cr, ci, dr, di, tr, ti;
250     int wd, ws, wi;
251     int i, j, step;
252     int *xptr, *wptr;
253
254     for (; bg != 0; gp <<= 2, bg >>= 2) {
255
256         step = 2 * gp;
257         xptr = x;
258
259         for (i = bg; i != 0; i--) {
260
261             wptr = wtab;
262
263             for (j = gp; j != 0; j--) {
264
265                 ar = xptr[0];
266                 ai = xptr[1];
267                 xptr += step;
268
269                 ws = wptr[0];
270                 wi = wptr[1];
271                 br = xptr[0];
272                 bi = xptr[1];
273                 wd = ws + 2 * wi;
274                 tr = MULSHIFT32(wi, br + bi);
275                 br = MULSHIFT32(wd, br) - tr;
276                 bi = MULSHIFT32(ws, bi) + tr;
277                 xptr += step;
278
279                 ws = wptr[2];
280                 wi = wptr[3];
281                 cr = xptr[0];
282                 ci = xptr[1];
283                 wd = ws + 2 * wi;
284                 tr = MULSHIFT32(wi, cr + ci);
285                 cr = MULSHIFT32(wd, cr) - tr;
286                 ci = MULSHIFT32(ws, ci) + tr;
287                 xptr += step;
288
289                 ws = wptr[4];
290                 wi = wptr[5];
291                 dr = xptr[0];
292                 di = xptr[1];
293                 wd = ws + 2 * wi;
294                 tr = MULSHIFT32(wi, dr + di);
295                 dr = MULSHIFT32(wd, dr) - tr;
296                 di = MULSHIFT32(ws, di) + tr;
297                 wptr += 6;
298
299                 /* have now gained 1 bit for br,bi,cr,ci,dr,di
300                  *   gain 1 more bit for by skipping 2*br,bi,cr,ci,dr,di in additions
301                  * manually >> 2 on ar,ai to normalize (trivial case, no mul)
302                  * net gain = 2 int bits per x[i], per R4 pass
303                  */
304                 tr = ar;
305                 ti = ai;
306                 ar = (tr >> 2) - br;
307                 ai = (ti >> 2) - bi;
308                 br = (tr >> 2) + br;
309                 bi = (ti >> 2) + bi;
310
311                 tr = cr;
312                 ti = ci;
313                 cr = tr + dr;
314                 ci = di - ti;
315                 dr = tr - dr;
316                 di = di + ti;
317
318                 xptr[0] = ar + ci;
319                 xptr[1] = ai + dr;
320                 xptr -= step;
321                 xptr[0] = br - cr;
322                 xptr[1] = bi - di;
323                 xptr -= step;
324                 xptr[0] = ar - ci;
325                 xptr[1] = ai - dr;
326                 xptr -= step;
327                 xptr[0] = br + cr;
328                 xptr[1] = bi + di;
329                 xptr += 2;
330             }
331             xptr += 3 * step;
332         }
333         wtab += 3 * step;
334     }
335 }
336
337
338 /**************************************************************************************
339  * Function:    R4FFT
340  *
341  * Description: Ken's very fast in-place radix-4 decimation-in-time FFT
342  *
343  * Inputs:      table index (for transform size)
344  *              buffer of samples (non bit-reversed)
345  *
346  * Outputs:     processed samples in same buffer
347  *
348  * Return:      none
349  *
350  * Notes:       assumes 4 guard bits in for nfft = [128, 256, 512]
351  *              gains log2(nfft) - 2 int bits total
352  **************************************************************************************/
353 void R4FFT(int tabidx, int *x)
354 {
355     int order = nfftlog2Tab[tabidx];
356     int nfft = nfftTab[tabidx];
357
358     /* decimation in time */
359     BitReverse(x, tabidx);
360
361     if (order & 0x1) {
362         R8FirstPass(x, nfft >> 3);
363         R4Core(x, nfft >> 5, 8, (int *)twidTabOdd);
364     } else {
365         R4FirstPass(x, nfft >> 2);
366         R4Core(x, nfft >> 4, 4, (int *)twidTabEven);
367     }
368 }
369
1	/* *** BEGIN LICENSE BLOCK ***
2	* Source last modified: $Id: fft.c,v 1.4 2005/04/27 19:20:50 hubbe Exp $
3	*
4	* REALNETWORKS CONFIDENTIAL--NOT FOR DISTRIBUTION IN SOURCE CODE FORM
5	* Portions Copyright (c) 1995-2002 RealNetworks, Inc.
6	* All Rights Reserved.
7	*
8	* The contents of this file, and the files included with this file,
9	* are subject to the current version of the Real Format Source Code
10	* Porting and Optimization License, available at
11	* https://helixcommunity.org/2005/license/realformatsource (unless
12	* RealNetworks otherwise expressly agrees in writing that you are
13	* subject to a different license). You may also obtain the license
14	* terms directly from RealNetworks. You may not use this file except
15	* in compliance with the Real Format Source Code Porting and
16	* Optimization License. There are no redistribution rights for the
17	* source code of this file. Please see the Real Format Source Code
18	* Porting and Optimization License for the rights, obligations and
19	* limitations governing use of the contents of the file.
20	*
21	* RealNetworks is the developer of the Original Code and owns the
22	* copyrights in the portions it created.
23	*
24	* This file, and the files included with this file, is distributed and
25	* made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND,
26	* EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL
27	* SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF
28	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT
29	* OR NON-INFRINGEMENT.
30	*
31	* Technology Compatibility Kit Test Suite(s) Location:
32	* https://rarvcode-tck.helixcommunity.org
33	*
34	* Contributor(s):
35	*
36	* *** END LICENSE BLOCK *** */
37
38	/**************************************************************************************
39	* Fixed-point RealAudio 8 decoder
40	* Ken Cooke (kenc@real.com), Jon Recker(jrecker@real.com)
41	* October 2003
42	*
43	* fft.c - Ken's highly optimized radix-4 DIT FFT, with optional radix-8 first pass
44	* for odd powers of 2
45	**************************************************************************************/
46
47	#include "coder.h"
48	#include "assembly.h"
49
50	#define NUM_FFT_SIZES 3
51	static const int nfftTab[NUM_FFT_SIZES] = {128, 256, 512};
52	static const int nfftlog2Tab[NUM_FFT_SIZES] = {7, 8, 9};
53
54	#define SQRT1_2 1518500250 /* sqrt(1/2) in Q31 */
55
56	#define swapcplx(p0,p1) \
57	t = p0; t1 = (&(p0)+1); p0 = p1; (&(p0)+1) = (&(p1)+1); p1 = t; (&(p1)+1) = t1
58
59	/**************************************************************************************
60	* Function: BitReverse
61	*
62	* Description: Ken's fast in-place bit reverse, using super-small table
63	*
64	* Inputs: buffer of samples
65	* table index (for transform size)
66	*
67	* Outputs: bit-reversed samples in same buffer
68	*
69	* Return: none
70	**************************************************************************************/
71	static void BitReverse(int *inout, int tabidx)
72	{
73	/* 1st part: non-id */
74	int part0, part1;
75	int a, b, t, t1;
76	const unsigned char* tab = bitrevtab + bitrevtabOffset[tabidx];
77	int nbits = nfftlog2Tab[tabidx];
78
79	part0 = inout;
80	part1 = inout + (1 << nbits);
81
82	while ((a = *tab++) != 0) {
83	b = *tab++;
84
85	swapcplx(part0[4 * a + 0], part0[4 * b + 0]); /* 0xxx0 <-> 0yyy0 */
86	swapcplx(part0[4 * a + 2], part1[4 * b + 0]); /* 0xxx1 <-> 1yyy0 */
87	swapcplx(part1[4 * a + 0], part0[4 * b + 2]); /* 1xxx0 <-> 0yyy1 */
88	swapcplx(part1[4 * a + 2], part1[4 * b + 2]); /* 1xxx1 <-> 1yyy1 */
89	}
90
91	do {
92	swapcplx(part0[4 * a + 2], part1[4 * a + 0]); /* 0xxx1 <-> 1xxx0 */
93	} while ((a = *tab++) != 0);
94	}
95
96	/**************************************************************************************
97	* Function: R4FirstPass
98	*
99	* Description: radix-4 trivial pass for decimation-in-time FFT
100	*
101	* Inputs: buffer of (bit-reversed) samples
102	* number of R4 butterflies per group (i.e. nfft / 4)
103	*
104	* Outputs: processed samples in same buffer
105	*
106	* Return: none
107	*
108	* Notes: assumes 3 guard bits, gains no integer bits,
109	* guard bits out = guard bits in - 3
110	**************************************************************************************/
111	static void R4FirstPass(int *x, int bg)
112	{
113	int ar, ai, br, bi, cr, ci, dr, di;
114
115	for (; bg != 0; bg--) {
116
117	ar = x[0] + x[2];
118	br = x[0] - x[2];
119	ai = x[1] + x[3];
120	bi = x[1] - x[3];
121	cr = x[4] + x[6];
122	dr = x[4] - x[6];
123	ci = x[5] + x[7];
124	di = x[5] - x[7];
125
126	x[0] = ar + cr;
127	x[4] = ar - cr;
128	x[1] = ai + ci;
129	x[5] = ai - ci;
130	x[2] = br + di;
131	x[6] = br - di;
132	x[3] = bi - dr;
133	x[7] = bi + dr;
134
135	x += 8;
136	}
137	}
138
139	/**************************************************************************************
140	* Function: R8FirstPass
141	*
142	* Description: radix-8 trivial pass for decimation-in-time FFT
143	*
144	* Inputs: buffer of (bit-reversed) samples
145	* number of R8 butterflies per group (i.e. nfft / 8)
146	*
147	* Outputs: processed samples in same buffer
148	*
149	* Return: none
150	*
151	* Notes: assumes 3 guard bits, gains 1 integer bit,
152	* guard bits out = guard bits in - 3
153	**************************************************************************************/
154	static void R8FirstPass(int *x, int bg)
155	{
156	int ar, ai, br, bi, cr, ci, dr, di;
157	int sr, si, tr, ti, ur, ui, vr, vi;
158	int wr, wi, xr, xi, yr, yi, zr, zi;
159
160	for (; bg != 0; bg--) {
161
162	ar = x[0] + x[2];
163	br = x[0] - x[2];
164	ai = x[1] + x[3];
165	bi = x[1] - x[3];
166	cr = x[4] + x[6];
167	dr = x[4] - x[6];
168	ci = x[5] + x[7];
169	di = x[5] - x[7];
170
171	sr = ar + cr;
172	ur = ar - cr;
173	si = ai + ci;
174	ui = ai - ci;
175	tr = br - di;
176	vr = br + di;
177	ti = bi + dr;
178	vi = bi - dr;
179
180	ar = x[ 8] + x[10];
181	br = x[ 8] - x[10];
182	ai = x[ 9] + x[11];
183	bi = x[ 9] - x[11];
184	cr = x[12] + x[14];
185	dr = x[12] - x[14];
186	ci = x[13] + x[15];
187	di = x[13] - x[15];
188
189	wr = (ar + cr) >> 1;
190	yr = (ar - cr) >> 1;
191	wi = (ai + ci) >> 1;
192	yi = (ai - ci) >> 1;
193
194	/* gain 1 bit per R8 pass */
195	x[ 0] = (sr >> 1) + wr;
196	x[ 8] = (sr >> 1) - wr;
197	x[ 1] = (si >> 1) + wi;
198	x[ 9] = (si >> 1) - wi;
199	x[ 4] = (ur >> 1) + yi;
200	x[12] = (ur >> 1) - yi;
201	x[ 5] = (ui >> 1) - yr;
202	x[13] = (ui >> 1) + yr;
203
204	ar = br - di;
205	cr = br + di;
206	ai = bi + dr;
207	ci = bi - dr;
208
209	/* gain 1 bit from mul by Q31 */
210	xr = MULSHIFT32(SQRT1_2, ar - ai);
211	xi = MULSHIFT32(SQRT1_2, ar + ai);
212	zr = MULSHIFT32(SQRT1_2, cr - ci);
213	zi = MULSHIFT32(SQRT1_2, cr + ci);
214
215	x[ 6] = (tr >> 1) - xr;
216	x[14] = (tr >> 1) + xr;
217	x[ 7] = (ti >> 1) - xi;
218	x[15] = (ti >> 1) + xi;
219	x[ 2] = (vr >> 1) + zi;
220	x[10] = (vr >> 1) - zi;
221	x[ 3] = (vi >> 1) - zr;
222	x[11] = (vi >> 1) + zr;
223
224	x += 16;
225	}
226	}
227
228	/**************************************************************************************
229	* Function: R4Core
230	*
231	* Description: radix-4 pass for decimation-in-time FFT
232	* number of R4 butterflies per group
233	* number of R4 groups per pass
234	* pointer to twiddle factors tables
235	*
236	* Inputs: buffer of samples
237	*
238	* Outputs: processed samples in same buffer
239	*
240	* Return: none
241	*
242	* Notes: gain 2 integer bits per pass (see notes on R4FFT for scaling info)
243	* for final multiply, can use SMLAL (if it pays off - needs extra
244	* mov rLO, #0 before SMLAL if using inline asm from C)
245	* uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
246	**************************************************************************************/
247	static void R4Core(int x, int bg, int gp, int wtab)
248	{
249	int ar, ai, br, bi, cr, ci, dr, di, tr, ti;
250	int wd, ws, wi;
251	int i, j, step;
252	int xptr, wptr;
253
254	for (; bg != 0; gp <<= 2, bg >>= 2) {
255
256	step = 2 * gp;
257	xptr = x;
258
259	for (i = bg; i != 0; i--) {
260
261	wptr = wtab;
262
263	for (j = gp; j != 0; j--) {
264
265	ar = xptr[0];
266	ai = xptr[1];
267	xptr += step;
268
269	ws = wptr[0];
270	wi = wptr[1];
271	br = xptr[0];
272	bi = xptr[1];
273	wd = ws + 2 * wi;
274	tr = MULSHIFT32(wi, br + bi);
275	br = MULSHIFT32(wd, br) - tr;
276	bi = MULSHIFT32(ws, bi) + tr;
277	xptr += step;
278
279	ws = wptr[2];
280	wi = wptr[3];
281	cr = xptr[0];
282	ci = xptr[1];
283	wd = ws + 2 * wi;
284	tr = MULSHIFT32(wi, cr + ci);
285	cr = MULSHIFT32(wd, cr) - tr;
286	ci = MULSHIFT32(ws, ci) + tr;
287	xptr += step;
288
289	ws = wptr[4];
290	wi = wptr[5];
291	dr = xptr[0];
292	di = xptr[1];
293	wd = ws + 2 * wi;
294	tr = MULSHIFT32(wi, dr + di);
295	dr = MULSHIFT32(wd, dr) - tr;
296	di = MULSHIFT32(ws, di) + tr;
297	wptr += 6;
298
299	/* have now gained 1 bit for br,bi,cr,ci,dr,di
300	* gain 1 more bit for by skipping 2*br,bi,cr,ci,dr,di in additions
301	* manually >> 2 on ar,ai to normalize (trivial case, no mul)
302	* net gain = 2 int bits per x[i], per R4 pass
303	*/
304	tr = ar;
305	ti = ai;
306	ar = (tr >> 2) - br;
307	ai = (ti >> 2) - bi;
308	br = (tr >> 2) + br;
309	bi = (ti >> 2) + bi;
310
311	tr = cr;
312	ti = ci;
313	cr = tr + dr;
314	ci = di - ti;
315	dr = tr - dr;
316	di = di + ti;
317
318	xptr[0] = ar + ci;
319	xptr[1] = ai + dr;
320	xptr -= step;
321	xptr[0] = br - cr;
322	xptr[1] = bi - di;
323	xptr -= step;
324	xptr[0] = ar - ci;
325	xptr[1] = ai - dr;
326	xptr -= step;
327	xptr[0] = br + cr;
328	xptr[1] = bi + di;
329	xptr += 2;
330	}
331	xptr += 3 * step;
332	}
333	wtab += 3 * step;
334	}
335	}
336
337
338	/**************************************************************************************
339	* Function: R4FFT
340	*
341	* Description: Ken's very fast in-place radix-4 decimation-in-time FFT
342	*
343	* Inputs: table index (for transform size)
344	* buffer of samples (non bit-reversed)
345	*
346	* Outputs: processed samples in same buffer
347	*
348	* Return: none
349	*
350	* Notes: assumes 4 guard bits in for nfft = [128, 256, 512]
351	* gains log2(nfft) - 2 int bits total
352	**************************************************************************************/
353	void R4FFT(int tabidx, int *x)
354	{
355	int order = nfftlog2Tab[tabidx];
356	int nfft = nfftTab[tabidx];
357
358	/* decimation in time */
359	BitReverse(x, tabidx);
360
361	if (order & 0x1) {
362	R8FirstPass(x, nfft >> 3);
363	R4Core(x, nfft >> 5, 8, (int *)twidTabOdd);
364	} else {
365	R4FirstPass(x, nfft >> 2);
366	R4Core(x, nfft >> 4, 4, (int *)twidTabEven);
367	}
368	}
369