platform/hardware/amlogic/LibAudio.git - Unnamed repository; edit this file 'description' to name the repository.

1 /* ***** BEGIN LICENSE BLOCK *****
2  * Source last modified: $Id: fft.c,v 1.1.2.1 2005/02/26 02:05:12 jrecker Exp $
3  *
4  * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
5  *
6  * The contents of this file, and the files included with this file,
7  * are subject to the current version of the RealNetworks Public
8  * Source License (the "RPSL") available at
9  * http://www.helixcommunity.org/content/rpsl unless you have licensed
10  * the file under the current version of the RealNetworks Community
11  * Source License (the "RCSL") available at
12  * http://www.helixcommunity.org/content/rcsl, in which case the RCSL
13  * will apply. You may also obtain the license terms directly from
14  * RealNetworks.  You may not use this file except in compliance with
15  * the RPSL or, if you have a valid RCSL with RealNetworks applicable
16  * to this file, the RCSL.  Please see the applicable RPSL or RCSL for
17  * the rights, obligations and limitations governing use of the
18  * contents of the file.
19  *
20  * This file is part of the Helix DNA Technology. RealNetworks is the
21  * developer of the Original Code and owns the copyrights in the
22  * portions it created.
23  *
24  * This file, and the files included with this file, is distributed
25  * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
26  * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
27  * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
28  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
29  * ENJOYMENT OR NON-INFRINGEMENT.
30  *
31  * Technology Compatibility Kit Test Suite(s) Location:
32  *    http://www.helixcommunity.org/content/tck
33  *
34  * Contributor(s):
35  *
36  * ***** END LICENSE BLOCK ***** */
37
38 /**************************************************************************************
39  * Fixed-point HE-AAC decoder
40  * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
41  * February 2005
42  *
43  * fft.c - Ken's optimized radix-4 DIT FFT, optional radix-8 first pass for odd log2(N)
44  **************************************************************************************/
45
46 #include "coder.h"
47 #include "assembly.h"
48
49 #define NUM_FFT_SIZES   2
50 static const int nfftTab[NUM_FFT_SIZES] =       {64, 512};
51 static const int nfftlog2Tab[NUM_FFT_SIZES] =   {6, 9};
52
53 #define SQRT1_2 0x5a82799a  /* sqrt(1/2) in Q31 */
54
55 #define swapcplx(p0,p1) \
56     t = p0; t1 = *(&(p0)+1); p0 = p1; *(&(p0)+1) = *(&(p1)+1); p1 = t; *(&(p1)+1) = t1
57
58 /**************************************************************************************
59  * Function:    BitReverse
60  *
61  * Description: Ken's fast in-place bit reverse, using super-small table
62  *
63  * Inputs:      buffer of samples
64  *              table index (for transform size)
65  *
66  * Outputs:     bit-reversed samples in same buffer
67  *
68  * Return:      none
69  **************************************************************************************/
70 static void BitReverse(int *inout, int tabidx)
71 {
72     int *part0, *part1;
73     int a, b, t, t1;
74     const unsigned char* tab = bitrevtab + bitrevtabOffset[tabidx];
75     int nbits = nfftlog2Tab[tabidx];
76
77     part0 = inout;
78     part1 = inout + (1 << nbits);
79
80     while ((a = *tab++) != 0) {
81         b = *tab++;
82
83         swapcplx(part0[4 * a + 0], part0[4 * b + 0]); /* 0xxx0 <-> 0yyy0 */
84         swapcplx(part0[4 * a + 2], part1[4 * b + 0]); /* 0xxx1 <-> 1yyy0 */
85         swapcplx(part1[4 * a + 0], part0[4 * b + 2]); /* 1xxx0 <-> 0yyy1 */
86         swapcplx(part1[4 * a + 2], part1[4 * b + 2]); /* 1xxx1 <-> 1yyy1 */
87     }
88
89     do {
90         swapcplx(part0[4 * a + 2], part1[4 * a + 0]); /* 0xxx1 <-> 1xxx0 */
91     } while ((a = *tab++) != 0);
92 }
93
94 /**************************************************************************************
95  * Function:    R4FirstPass
96  *
97  * Description: radix-4 trivial pass for decimation-in-time FFT
98  *
99  * Inputs:      buffer of (bit-reversed) samples
100  *              number of R4 butterflies per group (i.e. nfft / 4)
101  *
102  * Outputs:     processed samples in same buffer
103  *
104  * Return:      none
105  *
106  * Notes:       assumes 2 guard bits, gains no integer bits,
107  *                guard bits out = guard bits in - 2
108  **************************************************************************************/
109 static void R4FirstPass(int *x, int bg)
110 {
111     int ar, ai, br, bi, cr, ci, dr, di;
112
113     for (; bg != 0; bg--) {
114
115         ar = x[0] + x[2];
116         br = x[0] - x[2];
117         ai = x[1] + x[3];
118         bi = x[1] - x[3];
119         cr = x[4] + x[6];
120         dr = x[4] - x[6];
121         ci = x[5] + x[7];
122         di = x[5] - x[7];
123
124         /* max per-sample gain = 4.0 (adding 4 inputs together) */
125         x[0] = ar + cr;
126         x[4] = ar - cr;
127         x[1] = ai + ci;
128         x[5] = ai - ci;
129         x[2] = br + di;
130         x[6] = br - di;
131         x[3] = bi - dr;
132         x[7] = bi + dr;
133
134         x += 8;
135     }
136 }
137
138 /**************************************************************************************
139  * Function:    R8FirstPass
140  *
141  * Description: radix-8 trivial pass for decimation-in-time FFT
142  *
143  * Inputs:      buffer of (bit-reversed) samples
144  *              number of R8 butterflies per group (i.e. nfft / 8)
145  *
146  * Outputs:     processed samples in same buffer
147  *
148  * Return:      none
149  *
150  * Notes:       assumes 3 guard bits, gains 1 integer bit
151  *              guard bits out = guard bits in - 3 (if inputs are full scale)
152  *                or guard bits in - 2 (if inputs bounded to +/- sqrt(2)/2)
153  *              see scaling comments in code
154  **************************************************************************************/
155 static void R8FirstPass(int *x, int bg)
156 {
157     int ar, ai, br, bi, cr, ci, dr, di;
158     int sr, si, tr, ti, ur, ui, vr, vi;
159     int wr, wi, xr, xi, yr, yi, zr, zi;
160
161     for (; bg != 0; bg--) {
162
163         ar = x[0] + x[2];
164         br = x[0] - x[2];
165         ai = x[1] + x[3];
166         bi = x[1] - x[3];
167         cr = x[4] + x[6];
168         dr = x[4] - x[6];
169         ci = x[5] + x[7];
170         di = x[5] - x[7];
171
172         sr = ar + cr;
173         ur = ar - cr;
174         si = ai + ci;
175         ui = ai - ci;
176         tr = br - di;
177         vr = br + di;
178         ti = bi + dr;
179         vi = bi - dr;
180
181         ar = x[ 8] + x[10];
182         br = x[ 8] - x[10];
183         ai = x[ 9] + x[11];
184         bi = x[ 9] - x[11];
185         cr = x[12] + x[14];
186         dr = x[12] - x[14];
187         ci = x[13] + x[15];
188         di = x[13] - x[15];
189
190         /* max gain of wr/wi/yr/yi vs input = 2
191          *  (sum of 4 samples >> 1)
192          */
193         wr = (ar + cr) >> 1;
194         yr = (ar - cr) >> 1;
195         wi = (ai + ci) >> 1;
196         yi = (ai - ci) >> 1;
197
198         /* max gain of output vs input = 4
199          *  (sum of 4 samples >> 1 + sum of 4 samples >> 1)
200          */
201         x[ 0] = (sr >> 1) + wr;
202         x[ 8] = (sr >> 1) - wr;
203         x[ 1] = (si >> 1) + wi;
204         x[ 9] = (si >> 1) - wi;
205         x[ 4] = (ur >> 1) + yi;
206         x[12] = (ur >> 1) - yi;
207         x[ 5] = (ui >> 1) - yr;
208         x[13] = (ui >> 1) + yr;
209
210         ar = br - di;
211         cr = br + di;
212         ai = bi + dr;
213         ci = bi - dr;
214
215         /* max gain of xr/xi/zr/zi vs input = 4*sqrt(2)/2 = 2*sqrt(2)
216          *  (sum of 8 samples, multiply by sqrt(2)/2, implicit >> 1 from Q31)
217          */
218         xr = MULSHIFT32(SQRT1_2, ar - ai);
219         xi = MULSHIFT32(SQRT1_2, ar + ai);
220         zr = MULSHIFT32(SQRT1_2, cr - ci);
221         zi = MULSHIFT32(SQRT1_2, cr + ci);
222
223         /* max gain of output vs input = (2 + 2*sqrt(2) ~= 4.83)
224          *  (sum of 4 samples >> 1, plus xr/xi/zr/zi with gain of 2*sqrt(2))
225          * in absolute terms, we have max gain of appx 9.656 (4 + 0.707*8)
226          *  but we also gain 1 int bit (from MULSHIFT32 or from explicit >> 1)
227          */
228         x[ 6] = (tr >> 1) - xr;
229         x[14] = (tr >> 1) + xr;
230         x[ 7] = (ti >> 1) - xi;
231         x[15] = (ti >> 1) + xi;
232         x[ 2] = (vr >> 1) + zi;
233         x[10] = (vr >> 1) - zi;
234         x[ 3] = (vi >> 1) - zr;
235         x[11] = (vi >> 1) + zr;
236
237         x += 16;
238     }
239 }
240
241 /**************************************************************************************
242  * Function:    R4Core
243  *
244  * Description: radix-4 pass for decimation-in-time FFT
245  *
246  * Inputs:      buffer of samples
247  *              number of R4 butterflies per group
248  *              number of R4 groups per pass
249  *              pointer to twiddle factors tables
250  *
251  * Outputs:     processed samples in same buffer
252  *
253  * Return:      none
254  *
255  * Notes:       gain 2 integer bits per pass (see scaling comments in code)
256  *              min 1 GB in
257  *              gbOut = gbIn - 1 (short block) or gbIn - 2 (long block)
258  *              uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
259  **************************************************************************************/
260 static void R4Core(int *x, int bg, int gp, int *wtab)
261 {
262     int ar, ai, br, bi, cr, ci, dr, di, tr, ti;
263     int wd, ws, wi;
264     int i, j, step;
265     int *xptr, *wptr;
266
267     for (; bg != 0; gp <<= 2, bg >>= 2) {
268
269         step = 2 * gp;
270         xptr = x;
271
272         /* max per-sample gain, per group < 1 + 3*sqrt(2) ~= 5.25 if inputs x are full-scale
273          * do 3 groups for long block, 2 groups for short block (gain 2 int bits per group)
274          *
275          * very conservative scaling:
276          *   group 1: max gain = 5.25,           int bits gained = 2, gb used = 1 (2^3 = 8)
277          *   group 2: max gain = 5.25^2 = 27.6,  int bits gained = 4, gb used = 1 (2^5 = 32)
278          *   group 3: max gain = 5.25^3 = 144.7, int bits gained = 6, gb used = 2 (2^8 = 256)
279          */
280         for (i = bg; i != 0; i--) {
281
282             wptr = wtab;
283
284             for (j = gp; j != 0; j--) {
285
286                 ar = xptr[0];
287                 ai = xptr[1];
288                 xptr += step;
289
290                 /* gain 2 int bits for br/bi, cr/ci, dr/di (MULSHIFT32 by Q30)
291                  * gain 1 net GB
292                  */
293                 ws = wptr[0];
294                 wi = wptr[1];
295                 br = xptr[0];
296                 bi = xptr[1];
297                 wd = ws + 2 * wi;
298                 tr = MULSHIFT32(wi, br + bi);
299                 br = MULSHIFT32(wd, br) - tr;   /* cos*br + sin*bi */
300                 bi = MULSHIFT32(ws, bi) + tr;   /* cos*bi - sin*br */
301                 xptr += step;
302
303                 ws = wptr[2];
304                 wi = wptr[3];
305                 cr = xptr[0];
306                 ci = xptr[1];
307                 wd = ws + 2 * wi;
308                 tr = MULSHIFT32(wi, cr + ci);
309                 cr = MULSHIFT32(wd, cr) - tr;
310                 ci = MULSHIFT32(ws, ci) + tr;
311                 xptr += step;
312
313                 ws = wptr[4];
314                 wi = wptr[5];
315                 dr = xptr[0];
316                 di = xptr[1];
317                 wd = ws + 2 * wi;
318                 tr = MULSHIFT32(wi, dr + di);
319                 dr = MULSHIFT32(wd, dr) - tr;
320                 di = MULSHIFT32(ws, di) + tr;
321                 wptr += 6;
322
323                 tr = ar;
324                 ti = ai;
325                 ar = (tr >> 2) - br;
326                 ai = (ti >> 2) - bi;
327                 br = (tr >> 2) + br;
328                 bi = (ti >> 2) + bi;
329
330                 tr = cr;
331                 ti = ci;
332                 cr = tr + dr;
333                 ci = di - ti;
334                 dr = tr - dr;
335                 di = di + ti;
336
337                 xptr[0] = ar + ci;
338                 xptr[1] = ai + dr;
339                 xptr -= step;
340                 xptr[0] = br - cr;
341                 xptr[1] = bi - di;
342                 xptr -= step;
343                 xptr[0] = ar - ci;
344                 xptr[1] = ai - dr;
345                 xptr -= step;
346                 xptr[0] = br + cr;
347                 xptr[1] = bi + di;
348                 xptr += 2;
349             }
350             xptr += 3 * step;
351         }
352         wtab += 3 * step;
353     }
354 }
355
356
357 /**************************************************************************************
358  * Function:    R4FFT
359  *
360  * Description: Ken's very fast in-place radix-4 decimation-in-time FFT
361  *
362  * Inputs:      table index (for transform size)
363  *              buffer of samples (non bit-reversed)
364  *
365  * Outputs:     processed samples in same buffer
366  *
367  * Return:      none
368  *
369  * Notes:       assumes 5 guard bits in for nfft <= 512
370  *              gbOut = gbIn - 4 (assuming input is from PreMultiply)
371  *              gains log2(nfft) - 2 int bits total
372  *                so gain 7 int bits (LONG), 4 int bits (SHORT)
373  **************************************************************************************/
374 void R4FFT(int tabidx, int *x)
375 {
376     int order = nfftlog2Tab[tabidx];
377     int nfft = nfftTab[tabidx];
378
379     /* decimation in time */
380     BitReverse(x, tabidx);
381
382     if (order & 0x1) {
383         /* long block: order = 9, nfft = 512 */
384         R8FirstPass(x, nfft >> 3);                      /* gain 1 int bit,  lose 2 GB */
385         R4Core(x, nfft >> 5, 8, (int *)twidTabOdd);     /* gain 6 int bits, lose 2 GB */
386     } else {
387         /* short block: order = 6, nfft = 64 */
388         R4FirstPass(x, nfft >> 2);                      /* gain 0 int bits, lose 2 GB */
389         R4Core(x, nfft >> 4, 4, (int *)twidTabEven);    /* gain 4 int bits, lose 1 GB */
390     }
391 }
392
1	/* *** BEGIN LICENSE BLOCK ***
2	* Source last modified: $Id: fft.c,v 1.1.2.1 2005/02/26 02:05:12 jrecker Exp $
3	*
4	* Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
5	*
6	* The contents of this file, and the files included with this file,
7	* are subject to the current version of the RealNetworks Public
8	* Source License (the "RPSL") available at
9	* http://www.helixcommunity.org/content/rpsl unless you have licensed
10	* the file under the current version of the RealNetworks Community
11	* Source License (the "RCSL") available at
12	* http://www.helixcommunity.org/content/rcsl, in which case the RCSL
13	* will apply. You may also obtain the license terms directly from
14	* RealNetworks. You may not use this file except in compliance with
15	* the RPSL or, if you have a valid RCSL with RealNetworks applicable
16	* to this file, the RCSL. Please see the applicable RPSL or RCSL for
17	* the rights, obligations and limitations governing use of the
18	* contents of the file.
19	*
20	* This file is part of the Helix DNA Technology. RealNetworks is the
21	* developer of the Original Code and owns the copyrights in the
22	* portions it created.
23	*
24	* This file, and the files included with this file, is distributed
25	* and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
26	* KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
27	* ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
28	* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
29	* ENJOYMENT OR NON-INFRINGEMENT.
30	*
31	* Technology Compatibility Kit Test Suite(s) Location:
32	* http://www.helixcommunity.org/content/tck
33	*
34	* Contributor(s):
35	*
36	* *** END LICENSE BLOCK *** */
37
38	/**************************************************************************************
39	* Fixed-point HE-AAC decoder
40	* Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
41	* February 2005
42	*
43	* fft.c - Ken's optimized radix-4 DIT FFT, optional radix-8 first pass for odd log2(N)
44	**************************************************************************************/
45
46	#include "coder.h"
47	#include "assembly.h"
48
49	#define NUM_FFT_SIZES 2
50	static const int nfftTab[NUM_FFT_SIZES] = {64, 512};
51	static const int nfftlog2Tab[NUM_FFT_SIZES] = {6, 9};
52
53	#define SQRT1_2 0x5a82799a /* sqrt(1/2) in Q31 */
54
55	#define swapcplx(p0,p1) \
56	t = p0; t1 = (&(p0)+1); p0 = p1; (&(p0)+1) = (&(p1)+1); p1 = t; (&(p1)+1) = t1
57
58	/**************************************************************************************
59	* Function: BitReverse
60	*
61	* Description: Ken's fast in-place bit reverse, using super-small table
62	*
63	* Inputs: buffer of samples
64	* table index (for transform size)
65	*
66	* Outputs: bit-reversed samples in same buffer
67	*
68	* Return: none
69	**************************************************************************************/
70	static void BitReverse(int *inout, int tabidx)
71	{
72	int part0, part1;
73	int a, b, t, t1;
74	const unsigned char* tab = bitrevtab + bitrevtabOffset[tabidx];
75	int nbits = nfftlog2Tab[tabidx];
76
77	part0 = inout;
78	part1 = inout + (1 << nbits);
79
80	while ((a = *tab++) != 0) {
81	b = *tab++;
82
83	swapcplx(part0[4 * a + 0], part0[4 * b + 0]); /* 0xxx0 <-> 0yyy0 */
84	swapcplx(part0[4 * a + 2], part1[4 * b + 0]); /* 0xxx1 <-> 1yyy0 */
85	swapcplx(part1[4 * a + 0], part0[4 * b + 2]); /* 1xxx0 <-> 0yyy1 */
86	swapcplx(part1[4 * a + 2], part1[4 * b + 2]); /* 1xxx1 <-> 1yyy1 */
87	}
88
89	do {
90	swapcplx(part0[4 * a + 2], part1[4 * a + 0]); /* 0xxx1 <-> 1xxx0 */
91	} while ((a = *tab++) != 0);
92	}
93
94	/**************************************************************************************
95	* Function: R4FirstPass
96	*
97	* Description: radix-4 trivial pass for decimation-in-time FFT
98	*
99	* Inputs: buffer of (bit-reversed) samples
100	* number of R4 butterflies per group (i.e. nfft / 4)
101	*
102	* Outputs: processed samples in same buffer
103	*
104	* Return: none
105	*
106	* Notes: assumes 2 guard bits, gains no integer bits,
107	* guard bits out = guard bits in - 2
108	**************************************************************************************/
109	static void R4FirstPass(int *x, int bg)
110	{
111	int ar, ai, br, bi, cr, ci, dr, di;
112
113	for (; bg != 0; bg--) {
114
115	ar = x[0] + x[2];
116	br = x[0] - x[2];
117	ai = x[1] + x[3];
118	bi = x[1] - x[3];
119	cr = x[4] + x[6];
120	dr = x[4] - x[6];
121	ci = x[5] + x[7];
122	di = x[5] - x[7];
123
124	/* max per-sample gain = 4.0 (adding 4 inputs together) */
125	x[0] = ar + cr;
126	x[4] = ar - cr;
127	x[1] = ai + ci;
128	x[5] = ai - ci;
129	x[2] = br + di;
130	x[6] = br - di;
131	x[3] = bi - dr;
132	x[7] = bi + dr;
133
134	x += 8;
135	}
136	}
137
138	/**************************************************************************************
139	* Function: R8FirstPass
140	*
141	* Description: radix-8 trivial pass for decimation-in-time FFT
142	*
143	* Inputs: buffer of (bit-reversed) samples
144	* number of R8 butterflies per group (i.e. nfft / 8)
145	*
146	* Outputs: processed samples in same buffer
147	*
148	* Return: none
149	*
150	* Notes: assumes 3 guard bits, gains 1 integer bit
151	* guard bits out = guard bits in - 3 (if inputs are full scale)
152	* or guard bits in - 2 (if inputs bounded to +/- sqrt(2)/2)
153	* see scaling comments in code
154	**************************************************************************************/
155	static void R8FirstPass(int *x, int bg)
156	{
157	int ar, ai, br, bi, cr, ci, dr, di;
158	int sr, si, tr, ti, ur, ui, vr, vi;
159	int wr, wi, xr, xi, yr, yi, zr, zi;
160
161	for (; bg != 0; bg--) {
162
163	ar = x[0] + x[2];
164	br = x[0] - x[2];
165	ai = x[1] + x[3];
166	bi = x[1] - x[3];
167	cr = x[4] + x[6];
168	dr = x[4] - x[6];
169	ci = x[5] + x[7];
170	di = x[5] - x[7];
171
172	sr = ar + cr;
173	ur = ar - cr;
174	si = ai + ci;
175	ui = ai - ci;
176	tr = br - di;
177	vr = br + di;
178	ti = bi + dr;
179	vi = bi - dr;
180
181	ar = x[ 8] + x[10];
182	br = x[ 8] - x[10];
183	ai = x[ 9] + x[11];
184	bi = x[ 9] - x[11];
185	cr = x[12] + x[14];
186	dr = x[12] - x[14];
187	ci = x[13] + x[15];
188	di = x[13] - x[15];
189
190	/* max gain of wr/wi/yr/yi vs input = 2
191	* (sum of 4 samples >> 1)
192	*/
193	wr = (ar + cr) >> 1;
194	yr = (ar - cr) >> 1;
195	wi = (ai + ci) >> 1;
196	yi = (ai - ci) >> 1;
197
198	/* max gain of output vs input = 4
199	* (sum of 4 samples >> 1 + sum of 4 samples >> 1)
200	*/
201	x[ 0] = (sr >> 1) + wr;
202	x[ 8] = (sr >> 1) - wr;
203	x[ 1] = (si >> 1) + wi;
204	x[ 9] = (si >> 1) - wi;
205	x[ 4] = (ur >> 1) + yi;
206	x[12] = (ur >> 1) - yi;
207	x[ 5] = (ui >> 1) - yr;
208	x[13] = (ui >> 1) + yr;
209
210	ar = br - di;
211	cr = br + di;
212	ai = bi + dr;
213	ci = bi - dr;
214
215	/* max gain of xr/xi/zr/zi vs input = 4sqrt(2)/2 = 2sqrt(2)
216	* (sum of 8 samples, multiply by sqrt(2)/2, implicit >> 1 from Q31)
217	*/
218	xr = MULSHIFT32(SQRT1_2, ar - ai);
219	xi = MULSHIFT32(SQRT1_2, ar + ai);
220	zr = MULSHIFT32(SQRT1_2, cr - ci);
221	zi = MULSHIFT32(SQRT1_2, cr + ci);
222
223	/* max gain of output vs input = (2 + 2*sqrt(2) ~= 4.83)
224	* (sum of 4 samples >> 1, plus xr/xi/zr/zi with gain of 2*sqrt(2))
225	* in absolute terms, we have max gain of appx 9.656 (4 + 0.707*8)
226	* but we also gain 1 int bit (from MULSHIFT32 or from explicit >> 1)
227	*/
228	x[ 6] = (tr >> 1) - xr;
229	x[14] = (tr >> 1) + xr;
230	x[ 7] = (ti >> 1) - xi;
231	x[15] = (ti >> 1) + xi;
232	x[ 2] = (vr >> 1) + zi;
233	x[10] = (vr >> 1) - zi;
234	x[ 3] = (vi >> 1) - zr;
235	x[11] = (vi >> 1) + zr;
236
237	x += 16;
238	}
239	}
240
241	/**************************************************************************************
242	* Function: R4Core
243	*
244	* Description: radix-4 pass for decimation-in-time FFT
245	*
246	* Inputs: buffer of samples
247	* number of R4 butterflies per group
248	* number of R4 groups per pass
249	* pointer to twiddle factors tables
250	*
251	* Outputs: processed samples in same buffer
252	*
253	* Return: none
254	*
255	* Notes: gain 2 integer bits per pass (see scaling comments in code)
256	* min 1 GB in
257	* gbOut = gbIn - 1 (short block) or gbIn - 2 (long block)
258	* uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
259	**************************************************************************************/
260	static void R4Core(int x, int bg, int gp, int wtab)
261	{
262	int ar, ai, br, bi, cr, ci, dr, di, tr, ti;
263	int wd, ws, wi;
264	int i, j, step;
265	int xptr, wptr;
266
267	for (; bg != 0; gp <<= 2, bg >>= 2) {
268
269	step = 2 * gp;
270	xptr = x;
271
272	/* max per-sample gain, per group < 1 + 3*sqrt(2) ~= 5.25 if inputs x are full-scale
273	* do 3 groups for long block, 2 groups for short block (gain 2 int bits per group)
274	*
275	* very conservative scaling:
276	* group 1: max gain = 5.25, int bits gained = 2, gb used = 1 (2^3 = 8)
277	* group 2: max gain = 5.25^2 = 27.6, int bits gained = 4, gb used = 1 (2^5 = 32)
278	* group 3: max gain = 5.25^3 = 144.7, int bits gained = 6, gb used = 2 (2^8 = 256)
279	*/
280	for (i = bg; i != 0; i--) {
281
282	wptr = wtab;
283
284	for (j = gp; j != 0; j--) {
285
286	ar = xptr[0];
287	ai = xptr[1];
288	xptr += step;
289
290	/* gain 2 int bits for br/bi, cr/ci, dr/di (MULSHIFT32 by Q30)
291	* gain 1 net GB
292	*/
293	ws = wptr[0];
294	wi = wptr[1];
295	br = xptr[0];
296	bi = xptr[1];
297	wd = ws + 2 * wi;
298	tr = MULSHIFT32(wi, br + bi);
299	br = MULSHIFT32(wd, br) - tr; /* cosbr + sinbi */
300	bi = MULSHIFT32(ws, bi) + tr; /* cosbi - sinbr */
301	xptr += step;
302
303	ws = wptr[2];
304	wi = wptr[3];
305	cr = xptr[0];
306	ci = xptr[1];
307	wd = ws + 2 * wi;
308	tr = MULSHIFT32(wi, cr + ci);
309	cr = MULSHIFT32(wd, cr) - tr;
310	ci = MULSHIFT32(ws, ci) + tr;
311	xptr += step;
312
313	ws = wptr[4];
314	wi = wptr[5];
315	dr = xptr[0];
316	di = xptr[1];
317	wd = ws + 2 * wi;
318	tr = MULSHIFT32(wi, dr + di);
319	dr = MULSHIFT32(wd, dr) - tr;
320	di = MULSHIFT32(ws, di) + tr;
321	wptr += 6;
322
323	tr = ar;
324	ti = ai;
325	ar = (tr >> 2) - br;
326	ai = (ti >> 2) - bi;
327	br = (tr >> 2) + br;
328	bi = (ti >> 2) + bi;
329
330	tr = cr;
331	ti = ci;
332	cr = tr + dr;
333	ci = di - ti;
334	dr = tr - dr;
335	di = di + ti;
336
337	xptr[0] = ar + ci;
338	xptr[1] = ai + dr;
339	xptr -= step;
340	xptr[0] = br - cr;
341	xptr[1] = bi - di;
342	xptr -= step;
343	xptr[0] = ar - ci;
344	xptr[1] = ai - dr;
345	xptr -= step;
346	xptr[0] = br + cr;
347	xptr[1] = bi + di;
348	xptr += 2;
349	}
350	xptr += 3 * step;
351	}
352	wtab += 3 * step;
353	}
354	}
355
356
357	/**************************************************************************************
358	* Function: R4FFT
359	*
360	* Description: Ken's very fast in-place radix-4 decimation-in-time FFT
361	*
362	* Inputs: table index (for transform size)
363	* buffer of samples (non bit-reversed)
364	*
365	* Outputs: processed samples in same buffer
366	*
367	* Return: none
368	*
369	* Notes: assumes 5 guard bits in for nfft <= 512
370	* gbOut = gbIn - 4 (assuming input is from PreMultiply)
371	* gains log2(nfft) - 2 int bits total
372	* so gain 7 int bits (LONG), 4 int bits (SHORT)
373	**************************************************************************************/
374	void R4FFT(int tabidx, int *x)
375	{
376	int order = nfftlog2Tab[tabidx];
377	int nfft = nfftTab[tabidx];
378
379	/* decimation in time */
380	BitReverse(x, tabidx);
381
382	if (order & 0x1) {
383	/* long block: order = 9, nfft = 512 */
384	R8FirstPass(x, nfft >> 3); /* gain 1 int bit, lose 2 GB */
385	R4Core(x, nfft >> 5, 8, (int )twidTabOdd); / gain 6 int bits, lose 2 GB */
386	} else {
387	/* short block: order = 6, nfft = 64 */
388	R4FirstPass(x, nfft >> 2); /* gain 0 int bits, lose 2 GB */
389	R4Core(x, nfft >> 4, 4, (int )twidTabEven); / gain 4 int bits, lose 1 GB */
390	}
391	}
392