platform/hardware/amlogic/LibAudio.git - Unnamed repository; edit this file 'description' to name the repository.

1 /* ***** BEGIN LICENSE BLOCK *****
2  * Source last modified: $Id: sbrfft.c,v 1.1 2005/02/26 01:47:35 jrecker Exp $
3  *
4  * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
5  *
6  * The contents of this file, and the files included with this file,
7  * are subject to the current version of the RealNetworks Public
8  * Source License (the "RPSL") available at
9  * http://www.helixcommunity.org/content/rpsl unless you have licensed
10  * the file under the current version of the RealNetworks Community
11  * Source License (the "RCSL") available at
12  * http://www.helixcommunity.org/content/rcsl, in which case the RCSL
13  * will apply. You may also obtain the license terms directly from
14  * RealNetworks.  You may not use this file except in compliance with
15  * the RPSL or, if you have a valid RCSL with RealNetworks applicable
16  * to this file, the RCSL.  Please see the applicable RPSL or RCSL for
17  * the rights, obligations and limitations governing use of the
18  * contents of the file.
19  *
20  * This file is part of the Helix DNA Technology. RealNetworks is the
21  * developer of the Original Code and owns the copyrights in the
22  * portions it created.
23  *
24  * This file, and the files included with this file, is distributed
25  * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
26  * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
27  * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
28  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
29  * ENJOYMENT OR NON-INFRINGEMENT.
30  *
31  * Technology Compatibility Kit Test Suite(s) Location:
32  *    http://www.helixcommunity.org/content/tck
33  *
34  * Contributor(s):
35  *
36  * ***** END LICENSE BLOCK ***** */
37
38 /**************************************************************************************
39  * Fixed-point HE-AAC decoder
40  * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
41  * February 2005
42  *
43  * sbrfft.c - optimized FFT for SBR QMF filters
44  **************************************************************************************/
45
46 #include "sbr.h"
47 #include "assembly.h"
48
49 #define SQRT1_2 0x5a82799a
50
51 /* swap RE{p0} with RE{p1} and IM{P0} with IM{P1} */
52 #define swapcplx(p0,p1) \
53     t = p0; t1 = *(&(p0)+1); p0 = p1; *(&(p0)+1) = *(&(p1)+1); p1 = t; *(&(p1)+1) = t1
54
55 /* nfft = 32, hard coded since small, fixed size FFT
56 static const unsigned char bitrevtab32[9] = {
57     0x01, 0x04, 0x03, 0x06, 0x00, 0x02, 0x05, 0x07, 0x00,
58 };
59 */
60
61 /* twiddle table for radix 4 pass, format = Q31 */
62 static const int twidTabOdd32[8 * 6] = {
63     0x40000000, 0x00000000, 0x40000000, 0x00000000, 0x40000000, 0x00000000, 0x539eba45, 0xe7821d59,
64     0x4b418bbe, 0xf383a3e2, 0x58c542c5, 0xdc71898d, 0x5a82799a, 0xd2bec333, 0x539eba45, 0xe7821d59,
65     0x539eba45, 0xc4df2862, 0x539eba45, 0xc4df2862, 0x58c542c5, 0xdc71898d, 0x3248d382, 0xc13ad060,
66     0x40000000, 0xc0000000, 0x5a82799a, 0xd2bec333, 0x00000000, 0xd2bec333, 0x22a2f4f8, 0xc4df2862,
67     0x58c542c5, 0xcac933ae, 0xcdb72c7e, 0xf383a3e2, 0x00000000, 0xd2bec333, 0x539eba45, 0xc4df2862,
68     0xac6145bb, 0x187de2a7, 0xdd5d0b08, 0xe7821d59, 0x4b418bbe, 0xc13ad060, 0xa73abd3b, 0x3536cc52,
69 };
70
71 /**************************************************************************************
72  * Function:    BitReverse32
73  *
74  * Description: Ken's fast in-place bit reverse
75  *
76  * Inputs:      buffer of 32 complex samples
77  *
78  * Outputs:     bit-reversed samples in same buffer
79  *
80  * Return:      none
81 **************************************************************************************/
82 static void BitReverse32(int *inout)
83 {
84     int t, t1;
85
86     swapcplx(inout[2],  inout[32]);
87     swapcplx(inout[4],  inout[16]);
88     swapcplx(inout[6],  inout[48]);
89     swapcplx(inout[10], inout[40]);
90     swapcplx(inout[12], inout[24]);
91     swapcplx(inout[14], inout[56]);
92     swapcplx(inout[18], inout[36]);
93     swapcplx(inout[22], inout[52]);
94     swapcplx(inout[26], inout[44]);
95     swapcplx(inout[30], inout[60]);
96     swapcplx(inout[38], inout[50]);
97     swapcplx(inout[46], inout[58]);
98 }
99
100 /**************************************************************************************
101  * Function:    R8FirstPass32
102  *
103  * Description: radix-8 trivial pass for decimation-in-time FFT (log2(N) = 5)
104  *
105  * Inputs:      buffer of (bit-reversed) samples
106  *
107  * Outputs:     processed samples in same buffer
108  *
109  * Return:      none
110  *
111  * Notes:       assumes 3 guard bits, gains 1 integer bit
112  *              guard bits out = guard bits in - 3 (if inputs are full scale)
113  *                or guard bits in - 2 (if inputs bounded to +/- sqrt(2)/2)
114  *              see scaling comments in fft.c for base AAC
115  *              should compile with no stack spills on ARM (verify compiled output)
116  *              current instruction count (per pass): 16 LDR, 16 STR, 4 SMULL, 61 ALU
117  **************************************************************************************/
118 static void R8FirstPass32(int *r0)
119 {
120     int r1, r2, r3, r4, r5, r6, r7;
121     int r8, r9, r10, r11, r12, r14;
122
123     /* number of passes = fft size / 8 = 32 / 8 = 4 */
124     r1 = (32 >> 3);
125     do {
126
127         r2 = r0[8];
128         r3 = r0[9];
129         r4 = r0[10];
130         r5 = r0[11];
131         r6 = r0[12];
132         r7 = r0[13];
133         r8 = r0[14];
134         r9 = r0[15];
135
136         r10 = r2 + r4;
137         r11 = r3 + r5;
138         r12 = r6 + r8;
139         r14 = r7 + r9;
140
141         r2 -= r4;
142         r3 -= r5;
143         r6 -= r8;
144         r7 -= r9;
145
146         r4 = r2 - r7;
147         r5 = r2 + r7;
148         r8 = r3 - r6;
149         r9 = r3 + r6;
150
151         r2 = r4 - r9;
152         r3 = r4 + r9;
153         r6 = r5 - r8;
154         r7 = r5 + r8;
155
156         r2 = MULSHIFT32(SQRT1_2, r2);   /* can use r4, r5, r8, or r9 for constant and lo32 scratch reg */
157         r3 = MULSHIFT32(SQRT1_2, r3);
158         r6 = MULSHIFT32(SQRT1_2, r6);
159         r7 = MULSHIFT32(SQRT1_2, r7);
160
161         r4 = r10 + r12;
162         r5 = r10 - r12;
163         r8 = r11 + r14;
164         r9 = r11 - r14;
165
166         r10 = r0[0];
167         r11 = r0[2];
168         r12 = r0[4];
169         r14 = r0[6];
170
171         r10 += r11;
172         r12 += r14;
173
174         r4 >>= 1;
175         r10 += r12;
176         r4 += (r10 >> 1);
177         r0[ 0] = r4;
178         r4 -= (r10 >> 1);
179         r4 = (r10 >> 1) - r4;
180         r0[ 8] = r4;
181
182         r9 >>= 1;
183         r10 -= 2 * r12;
184         r4 = (r10 >> 1) + r9;
185         r0[ 4] = r4;
186         r4 = (r10 >> 1) - r9;
187         r0[12] = r4;
188         r10 += r12;
189
190         r10 -= 2 * r11;
191         r12 -= 2 * r14;
192
193         r4 =  r0[1];
194         r9 =  r0[3];
195         r11 = r0[5];
196         r14 = r0[7];
197
198         r4 += r9;
199         r11 += r14;
200
201         r8 >>= 1;
202         r4 += r11;
203         r8 += (r4 >> 1);
204         r0[ 1] = r8;
205         r8 -= (r4 >> 1);
206         r8 = (r4 >> 1) - r8;
207         r0[ 9] = r8;
208
209         r5 >>= 1;
210         r4 -= 2 * r11;
211         r8 = (r4 >> 1) - r5;
212         r0[ 5] = r8;
213         r8 = (r4 >> 1) + r5;
214         r0[13] = r8;
215         r4 += r11;
216
217         r4 -= 2 * r9;
218         r11 -= 2 * r14;
219
220         r9 = r10 - r11;
221         r10 += r11;
222         r14 = r4 + r12;
223         r4 -= r12;
224
225         r5 = (r10 >> 1) + r7;
226         r8 = (r4 >> 1) - r6;
227         r0[ 2] = r5;
228         r0[ 3] = r8;
229
230         r5 = (r9 >> 1) - r2;
231         r8 = (r14 >> 1) - r3;
232         r0[ 6] = r5;
233         r0[ 7] = r8;
234
235         r5 = (r10 >> 1) - r7;
236         r8 = (r4 >> 1) + r6;
237         r0[10] = r5;
238         r0[11] = r8;
239
240         r5 = (r9 >> 1) + r2;
241         r8 = (r14 >> 1) + r3;
242         r0[14] = r5;
243         r0[15] = r8;
244
245         r0 += 16;
246         r1--;
247     } while (r1 != 0);
248 }
249
250 /**************************************************************************************
251  * Function:    R4Core32
252  *
253  * Description: radix-4 pass for 32-point decimation-in-time FFT
254  *
255  * Inputs:      buffer of samples
256  *
257  * Outputs:     processed samples in same buffer
258  *
259  * Return:      none
260  *
261  * Notes:       gain 2 integer bits
262  *              guard bits out = guard bits in - 1 (if inputs are full scale)
263  *              see scaling comments in fft.c for base AAC
264  *              uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
265  *              should compile with no stack spills on ARM (verify compiled output)
266  *              current instruction count (per pass): 16 LDR, 16 STR, 4 SMULL, 61 ALU
267  **************************************************************************************/
268 static void R4Core32(int *r0)
269 {
270     int r2, r3, r4, r5, r6, r7;
271     int r8, r9, r10, r12, r14;
272     int *r1;
273
274     r1 = (int *)twidTabOdd32;
275     r10 = 8;
276     do {
277         /* can use r14 for lo32 scratch register in all MULSHIFT32 */
278         r2 = r1[0];
279         r3 = r1[1];
280         r4 = r0[16];
281         r5 = r0[17];
282         r12 = r4 + r5;
283         r12 = MULSHIFT32(r3, r12);
284         r5  = MULSHIFT32(r2, r5) + r12;
285         r2 += 2 * r3;
286         r4  = MULSHIFT32(r2, r4) - r12;
287
288         r2 = r1[2];
289         r3 = r1[3];
290         r6 = r0[32];
291         r7 = r0[33];
292         r12 = r6 + r7;
293         r12 = MULSHIFT32(r3, r12);
294         r7  = MULSHIFT32(r2, r7) + r12;
295         r2 += 2 * r3;
296         r6  = MULSHIFT32(r2, r6) - r12;
297
298         r2 = r1[4];
299         r3 = r1[5];
300         r8 = r0[48];
301         r9 = r0[49];
302         r12 = r8 + r9;
303         r12 = MULSHIFT32(r3, r12);
304         r9  = MULSHIFT32(r2, r9) + r12;
305         r2 += 2 * r3;
306         r8  = MULSHIFT32(r2, r8) - r12;
307
308         r2 = r0[0];
309         r3 = r0[1];
310
311         r12 = r6 + r8;
312         r8  = r6 - r8;
313         r14 = r9 - r7;
314         r9  = r9 + r7;
315
316         r6 = (r2 >> 2) - r4;
317         r7 = (r3 >> 2) - r5;
318         r4 += (r2 >> 2);
319         r5 += (r3 >> 2);
320
321         r2 = r4 + r12;
322         r3 = r5 + r9;
323         r0[0] = r2;
324         r0[1] = r3;
325         r2 = r6 - r14;
326         r3 = r7 - r8;
327         r0[16] = r2;
328         r0[17] = r3;
329         r2 = r4 - r12;
330         r3 = r5 - r9;
331         r0[32] = r2;
332         r0[33] = r3;
333         r2 = r6 + r14;
334         r3 = r7 + r8;
335         r0[48] = r2;
336         r0[49] = r3;
337
338         r0 += 2;
339         r1 += 6;
340         r10--;
341     } while (r10 != 0);
342 }
343
344 /**************************************************************************************
345  * Function:    FFT32C
346  *
347  * Description: Ken's very fast in-place radix-4 decimation-in-time FFT
348  *
349  * Inputs:      buffer of 32 complex samples (before bit-reversal)
350  *
351  * Outputs:     processed samples in same buffer
352  *
353  * Return:      none
354  *
355  * Notes:       assumes 3 guard bits in, gains 3 integer bits
356  *              guard bits out = guard bits in - 2
357  *              (guard bit analysis includes assumptions about steps immediately
358  *               before and after, i.e. PreMul and PostMul for DCT)
359  **************************************************************************************/
360 void FFT32C(int *x)
361 {
362     /* decimation in time */
363     BitReverse32(x);
364
365     /* 32-point complex FFT */
366     R8FirstPass32(x);   /* gain 1 int bit,  lose 2 GB (making assumptions about input) */
367     R4Core32(x);        /* gain 2 int bits, lose 0 GB (making assumptions about input) */
368 }
369
1	/* *** BEGIN LICENSE BLOCK ***
2	* Source last modified: $Id: sbrfft.c,v 1.1 2005/02/26 01:47:35 jrecker Exp $
3	*
4	* Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
5	*
6	* The contents of this file, and the files included with this file,
7	* are subject to the current version of the RealNetworks Public
8	* Source License (the "RPSL") available at
9	* http://www.helixcommunity.org/content/rpsl unless you have licensed
10	* the file under the current version of the RealNetworks Community
11	* Source License (the "RCSL") available at
12	* http://www.helixcommunity.org/content/rcsl, in which case the RCSL
13	* will apply. You may also obtain the license terms directly from
14	* RealNetworks. You may not use this file except in compliance with
15	* the RPSL or, if you have a valid RCSL with RealNetworks applicable
16	* to this file, the RCSL. Please see the applicable RPSL or RCSL for
17	* the rights, obligations and limitations governing use of the
18	* contents of the file.
19	*
20	* This file is part of the Helix DNA Technology. RealNetworks is the
21	* developer of the Original Code and owns the copyrights in the
22	* portions it created.
23	*
24	* This file, and the files included with this file, is distributed
25	* and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
26	* KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
27	* ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
28	* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
29	* ENJOYMENT OR NON-INFRINGEMENT.
30	*
31	* Technology Compatibility Kit Test Suite(s) Location:
32	* http://www.helixcommunity.org/content/tck
33	*
34	* Contributor(s):
35	*
36	* *** END LICENSE BLOCK *** */
37
38	/**************************************************************************************
39	* Fixed-point HE-AAC decoder
40	* Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
41	* February 2005
42	*
43	* sbrfft.c - optimized FFT for SBR QMF filters
44	**************************************************************************************/
45
46	#include "sbr.h"
47	#include "assembly.h"
48
49	#define SQRT1_2 0x5a82799a
50
51	/* swap RE{p0} with RE{p1} and IM{P0} with IM{P1} */
52	#define swapcplx(p0,p1) \
53	t = p0; t1 = (&(p0)+1); p0 = p1; (&(p0)+1) = (&(p1)+1); p1 = t; (&(p1)+1) = t1
54
55	/* nfft = 32, hard coded since small, fixed size FFT
56	static const unsigned char bitrevtab32[9] = {
57	0x01, 0x04, 0x03, 0x06, 0x00, 0x02, 0x05, 0x07, 0x00,
58	};
59	*/
60
61	/* twiddle table for radix 4 pass, format = Q31 */
62	static const int twidTabOdd32[8 * 6] = {
63	0x40000000, 0x00000000, 0x40000000, 0x00000000, 0x40000000, 0x00000000, 0x539eba45, 0xe7821d59,
64	0x4b418bbe, 0xf383a3e2, 0x58c542c5, 0xdc71898d, 0x5a82799a, 0xd2bec333, 0x539eba45, 0xe7821d59,
65	0x539eba45, 0xc4df2862, 0x539eba45, 0xc4df2862, 0x58c542c5, 0xdc71898d, 0x3248d382, 0xc13ad060,
66	0x40000000, 0xc0000000, 0x5a82799a, 0xd2bec333, 0x00000000, 0xd2bec333, 0x22a2f4f8, 0xc4df2862,
67	0x58c542c5, 0xcac933ae, 0xcdb72c7e, 0xf383a3e2, 0x00000000, 0xd2bec333, 0x539eba45, 0xc4df2862,
68	0xac6145bb, 0x187de2a7, 0xdd5d0b08, 0xe7821d59, 0x4b418bbe, 0xc13ad060, 0xa73abd3b, 0x3536cc52,
69	};
70
71	/**************************************************************************************
72	* Function: BitReverse32
73	*
74	* Description: Ken's fast in-place bit reverse
75	*
76	* Inputs: buffer of 32 complex samples
77	*
78	* Outputs: bit-reversed samples in same buffer
79	*
80	* Return: none
81	**************************************************************************************/
82	static void BitReverse32(int *inout)
83	{
84	int t, t1;
85
86	swapcplx(inout[2], inout[32]);
87	swapcplx(inout[4], inout[16]);
88	swapcplx(inout[6], inout[48]);
89	swapcplx(inout[10], inout[40]);
90	swapcplx(inout[12], inout[24]);
91	swapcplx(inout[14], inout[56]);
92	swapcplx(inout[18], inout[36]);
93	swapcplx(inout[22], inout[52]);
94	swapcplx(inout[26], inout[44]);
95	swapcplx(inout[30], inout[60]);
96	swapcplx(inout[38], inout[50]);
97	swapcplx(inout[46], inout[58]);
98	}
99
100	/**************************************************************************************
101	* Function: R8FirstPass32
102	*
103	* Description: radix-8 trivial pass for decimation-in-time FFT (log2(N) = 5)
104	*
105	* Inputs: buffer of (bit-reversed) samples
106	*
107	* Outputs: processed samples in same buffer
108	*
109	* Return: none
110	*
111	* Notes: assumes 3 guard bits, gains 1 integer bit
112	* guard bits out = guard bits in - 3 (if inputs are full scale)
113	* or guard bits in - 2 (if inputs bounded to +/- sqrt(2)/2)
114	* see scaling comments in fft.c for base AAC
115	* should compile with no stack spills on ARM (verify compiled output)
116	* current instruction count (per pass): 16 LDR, 16 STR, 4 SMULL, 61 ALU
117	**************************************************************************************/
118	static void R8FirstPass32(int *r0)
119	{
120	int r1, r2, r3, r4, r5, r6, r7;
121	int r8, r9, r10, r11, r12, r14;
122
123	/* number of passes = fft size / 8 = 32 / 8 = 4 */
124	r1 = (32 >> 3);
125	do {
126
127	r2 = r0[8];
128	r3 = r0[9];
129	r4 = r0[10];
130	r5 = r0[11];
131	r6 = r0[12];
132	r7 = r0[13];
133	r8 = r0[14];
134	r9 = r0[15];
135
136	r10 = r2 + r4;
137	r11 = r3 + r5;
138	r12 = r6 + r8;
139	r14 = r7 + r9;
140
141	r2 -= r4;
142	r3 -= r5;
143	r6 -= r8;
144	r7 -= r9;
145
146	r4 = r2 - r7;
147	r5 = r2 + r7;
148	r8 = r3 - r6;
149	r9 = r3 + r6;
150
151	r2 = r4 - r9;
152	r3 = r4 + r9;
153	r6 = r5 - r8;
154	r7 = r5 + r8;
155
156	r2 = MULSHIFT32(SQRT1_2, r2); /* can use r4, r5, r8, or r9 for constant and lo32 scratch reg */
157	r3 = MULSHIFT32(SQRT1_2, r3);
158	r6 = MULSHIFT32(SQRT1_2, r6);
159	r7 = MULSHIFT32(SQRT1_2, r7);
160
161	r4 = r10 + r12;
162	r5 = r10 - r12;
163	r8 = r11 + r14;
164	r9 = r11 - r14;
165
166	r10 = r0[0];
167	r11 = r0[2];
168	r12 = r0[4];
169	r14 = r0[6];
170
171	r10 += r11;
172	r12 += r14;
173
174	r4 >>= 1;
175	r10 += r12;
176	r4 += (r10 >> 1);
177	r0[ 0] = r4;
178	r4 -= (r10 >> 1);
179	r4 = (r10 >> 1) - r4;
180	r0[ 8] = r4;
181
182	r9 >>= 1;
183	r10 -= 2 * r12;
184	r4 = (r10 >> 1) + r9;
185	r0[ 4] = r4;
186	r4 = (r10 >> 1) - r9;
187	r0[12] = r4;
188	r10 += r12;
189
190	r10 -= 2 * r11;
191	r12 -= 2 * r14;
192
193	r4 = r0[1];
194	r9 = r0[3];
195	r11 = r0[5];
196	r14 = r0[7];
197
198	r4 += r9;
199	r11 += r14;
200
201	r8 >>= 1;
202	r4 += r11;
203	r8 += (r4 >> 1);
204	r0[ 1] = r8;
205	r8 -= (r4 >> 1);
206	r8 = (r4 >> 1) - r8;
207	r0[ 9] = r8;
208
209	r5 >>= 1;
210	r4 -= 2 * r11;
211	r8 = (r4 >> 1) - r5;
212	r0[ 5] = r8;
213	r8 = (r4 >> 1) + r5;
214	r0[13] = r8;
215	r4 += r11;
216
217	r4 -= 2 * r9;
218	r11 -= 2 * r14;
219
220	r9 = r10 - r11;
221	r10 += r11;
222	r14 = r4 + r12;
223	r4 -= r12;
224
225	r5 = (r10 >> 1) + r7;
226	r8 = (r4 >> 1) - r6;
227	r0[ 2] = r5;
228	r0[ 3] = r8;
229
230	r5 = (r9 >> 1) - r2;
231	r8 = (r14 >> 1) - r3;
232	r0[ 6] = r5;
233	r0[ 7] = r8;
234
235	r5 = (r10 >> 1) - r7;
236	r8 = (r4 >> 1) + r6;
237	r0[10] = r5;
238	r0[11] = r8;
239
240	r5 = (r9 >> 1) + r2;
241	r8 = (r14 >> 1) + r3;
242	r0[14] = r5;
243	r0[15] = r8;
244
245	r0 += 16;
246	r1--;
247	} while (r1 != 0);
248	}
249
250	/**************************************************************************************
251	* Function: R4Core32
252	*
253	* Description: radix-4 pass for 32-point decimation-in-time FFT
254	*
255	* Inputs: buffer of samples
256	*
257	* Outputs: processed samples in same buffer
258	*
259	* Return: none
260	*
261	* Notes: gain 2 integer bits
262	* guard bits out = guard bits in - 1 (if inputs are full scale)
263	* see scaling comments in fft.c for base AAC
264	* uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
265	* should compile with no stack spills on ARM (verify compiled output)
266	* current instruction count (per pass): 16 LDR, 16 STR, 4 SMULL, 61 ALU
267	**************************************************************************************/
268	static void R4Core32(int *r0)
269	{
270	int r2, r3, r4, r5, r6, r7;
271	int r8, r9, r10, r12, r14;
272	int *r1;
273
274	r1 = (int *)twidTabOdd32;
275	r10 = 8;
276	do {
277	/* can use r14 for lo32 scratch register in all MULSHIFT32 */
278	r2 = r1[0];
279	r3 = r1[1];
280	r4 = r0[16];
281	r5 = r0[17];
282	r12 = r4 + r5;
283	r12 = MULSHIFT32(r3, r12);
284	r5 = MULSHIFT32(r2, r5) + r12;
285	r2 += 2 * r3;
286	r4 = MULSHIFT32(r2, r4) - r12;
287
288	r2 = r1[2];
289	r3 = r1[3];
290	r6 = r0[32];
291	r7 = r0[33];
292	r12 = r6 + r7;
293	r12 = MULSHIFT32(r3, r12);
294	r7 = MULSHIFT32(r2, r7) + r12;
295	r2 += 2 * r3;
296	r6 = MULSHIFT32(r2, r6) - r12;
297
298	r2 = r1[4];
299	r3 = r1[5];
300	r8 = r0[48];
301	r9 = r0[49];
302	r12 = r8 + r9;
303	r12 = MULSHIFT32(r3, r12);
304	r9 = MULSHIFT32(r2, r9) + r12;
305	r2 += 2 * r3;
306	r8 = MULSHIFT32(r2, r8) - r12;
307
308	r2 = r0[0];
309	r3 = r0[1];
310
311	r12 = r6 + r8;
312	r8 = r6 - r8;
313	r14 = r9 - r7;
314	r9 = r9 + r7;
315
316	r6 = (r2 >> 2) - r4;
317	r7 = (r3 >> 2) - r5;
318	r4 += (r2 >> 2);
319	r5 += (r3 >> 2);
320
321	r2 = r4 + r12;
322	r3 = r5 + r9;
323	r0[0] = r2;
324	r0[1] = r3;
325	r2 = r6 - r14;
326	r3 = r7 - r8;
327	r0[16] = r2;
328	r0[17] = r3;
329	r2 = r4 - r12;
330	r3 = r5 - r9;
331	r0[32] = r2;
332	r0[33] = r3;
333	r2 = r6 + r14;
334	r3 = r7 + r8;
335	r0[48] = r2;
336	r0[49] = r3;
337
338	r0 += 2;
339	r1 += 6;
340	r10--;
341	} while (r10 != 0);
342	}
343
344	/**************************************************************************************
345	* Function: FFT32C
346	*
347	* Description: Ken's very fast in-place radix-4 decimation-in-time FFT
348	*
349	* Inputs: buffer of 32 complex samples (before bit-reversal)
350	*
351	* Outputs: processed samples in same buffer
352	*
353	* Return: none
354	*
355	* Notes: assumes 3 guard bits in, gains 3 integer bits
356	* guard bits out = guard bits in - 2
357	* (guard bit analysis includes assumptions about steps immediately
358	* before and after, i.e. PreMul and PostMul for DCT)
359	**************************************************************************************/
360	void FFT32C(int *x)
361	{
362	/* decimation in time */
363	BitReverse32(x);
364
365	/* 32-point complex FFT */
366	R8FirstPass32(x); /* gain 1 int bit, lose 2 GB (making assumptions about input) */
367	R4Core32(x); /* gain 2 int bits, lose 0 GB (making assumptions about input) */
368	}
369