platform/hardware/amlogic/LibAudio.git - Unnamed repository; edit this file 'description' to name the repository.

1 /* ***** BEGIN LICENSE BLOCK *****
2  * Source last modified: $Id: dct4.c,v 1.1.2.1 2005/02/26 02:05:12 jrecker Exp $
3  *
4  * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
5  *
6  * The contents of this file, and the files included with this file,
7  * are subject to the current version of the RealNetworks Public
8  * Source License (the "RPSL") available at
9  * http://www.helixcommunity.org/content/rpsl unless you have licensed
10  * the file under the current version of the RealNetworks Community
11  * Source License (the "RCSL") available at
12  * http://www.helixcommunity.org/content/rcsl, in which case the RCSL
13  * will apply. You may also obtain the license terms directly from
14  * RealNetworks.  You may not use this file except in compliance with
15  * the RPSL or, if you have a valid RCSL with RealNetworks applicable
16  * to this file, the RCSL.  Please see the applicable RPSL or RCSL for
17  * the rights, obligations and limitations governing use of the
18  * contents of the file.
19  *
20  * This file is part of the Helix DNA Technology. RealNetworks is the
21  * developer of the Original Code and owns the copyrights in the
22  * portions it created.
23  *
24  * This file, and the files included with this file, is distributed
25  * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
26  * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
27  * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
28  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
29  * ENJOYMENT OR NON-INFRINGEMENT.
30  *
31  * Technology Compatibility Kit Test Suite(s) Location:
32  *    http://www.helixcommunity.org/content/tck
33  *
34  * Contributor(s):
35  *
36  * ***** END LICENSE BLOCK ***** */
37
38 /**************************************************************************************
39  * Fixed-point HE-AAC decoder
40  * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
41  * February 2005
42  *
43  * dct4.c - optimized DCT-IV
44  **************************************************************************************/
45
46 #include "coder.h"
47 #include "assembly.h"
48
49 static const int nmdctTab[NUM_IMDCT_SIZES] = {128, 1024};
50 static const int postSkip[NUM_IMDCT_SIZES] = {15, 1};
51
52 /**************************************************************************************
53  * Function:    PreMultiply
54  *
55  * Description: pre-twiddle stage of DCT4
56  *
57  * Inputs:      table index (for transform size)
58  *              buffer of nmdct samples
59  *
60  * Outputs:     processed samples in same buffer
61  *
62  * Return:      none
63  *
64  * Notes:       minimum 1 GB in, 2 GB out, gains 5 (short) or 8 (long) frac bits
65  *              i.e. gains 2-7= -5 int bits (short) or 2-10 = -8 int bits (long)
66  *              normalization by -1/N is rolled into tables here (see trigtabs.c)
67  *              uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
68  **************************************************************************************/
69 static void PreMultiply(int tabidx, int *zbuf1)
70 {
71     int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
72     int t, cms2, cps2a, sin2a, cps2b, sin2b;
73     int *zbuf2;
74     const int *csptr;
75
76     nmdct = nmdctTab[tabidx];
77     zbuf2 = zbuf1 + nmdct - 1;
78     csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
79
80     /* whole thing should fit in registers - verify that compiler does this */
81     for (i = nmdct >> 2; i != 0; i--) {
82         /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
83         cps2a = *csptr++;
84         sin2a = *csptr++;
85         cps2b = *csptr++;
86         sin2b = *csptr++;
87
88         ar1 = *(zbuf1 + 0);
89         ai2 = *(zbuf1 + 1);
90         ai1 = *(zbuf2 + 0);
91         ar2 = *(zbuf2 - 1);
92
93         /* gain 2 ints bit from MULSHIFT32 by Q30, but drop 7 or 10 int bits from table scaling of 1/M
94          * max per-sample gain (ignoring implicit scaling) = MAX(sin(angle)+cos(angle)) = 1.414
95          * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
96          *   extra sign bits, and eat one in adding
97          */
98         t  = MULSHIFT32(sin2a, ar1 + ai1);
99         z2 = MULSHIFT32(cps2a, ai1) - t;
100         cms2 = cps2a - 2 * sin2a;
101         z1 = MULSHIFT32(cms2, ar1) + t;
102         *zbuf1++ = z1;  /* cos*ar1 + sin*ai1 */
103         *zbuf1++ = z2;  /* cos*ai1 - sin*ar1 */
104
105         t  = MULSHIFT32(sin2b, ar2 + ai2);
106         z2 = MULSHIFT32(cps2b, ai2) - t;
107         cms2 = cps2b - 2 * sin2b;
108         z1 = MULSHIFT32(cms2, ar2) + t;
109         *zbuf2-- = z2;  /* cos*ai2 - sin*ar2 */
110         *zbuf2-- = z1;  /* cos*ar2 + sin*ai2 */
111     }
112 }
113
114 /**************************************************************************************
115  * Function:    PostMultiply
116  *
117  * Description: post-twiddle stage of DCT4
118  *
119  * Inputs:      table index (for transform size)
120  *              buffer of nmdct samples
121  *
122  * Outputs:     processed samples in same buffer
123  *
124  * Return:      none
125  *
126  * Notes:       minimum 1 GB in, 2 GB out - gains 2 int bits
127  *              uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
128  **************************************************************************************/
129 static void PostMultiply(int tabidx, int *fft1)
130 {
131     int i, nmdct, ar1, ai1, ar2, ai2, skipFactor;
132     int t, cms2, cps2, sin2;
133     int *fft2;
134     const int *csptr;
135
136     nmdct = nmdctTab[tabidx];
137     csptr = cos1sin1tab;
138     skipFactor = postSkip[tabidx];
139     fft2 = fft1 + nmdct - 1;
140
141     /* load coeffs for first pass
142      * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
143      */
144     cps2 = *csptr++;
145     sin2 = *csptr;
146     csptr += skipFactor;
147     cms2 = cps2 - 2 * sin2;
148
149     for (i = nmdct >> 2; i != 0; i--) {
150         ar1 = *(fft1 + 0);
151         ai1 = *(fft1 + 1);
152         ar2 = *(fft2 - 1);
153         ai2 = *(fft2 + 0);
154
155         /* gain 2 ints bit from MULSHIFT32 by Q30
156          * max per-sample gain = MAX(sin(angle)+cos(angle)) = 1.414
157          * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
158          *   extra sign bits, and eat one in adding
159          */
160         t = MULSHIFT32(sin2, ar1 + ai1);
161         *fft2-- = t - MULSHIFT32(cps2, ai1);    /* sin*ar1 - cos*ai1 */
162         *fft1++ = t + MULSHIFT32(cms2, ar1);    /* cos*ar1 + sin*ai1 */
163         cps2 = *csptr++;
164         sin2 = *csptr;
165         csptr += skipFactor;
166
167         ai2 = -ai2;
168         t = MULSHIFT32(sin2, ar2 + ai2);
169         *fft2-- = t - MULSHIFT32(cps2, ai2);    /* sin*ar1 - cos*ai1 */
170         cms2 = cps2 - 2 * sin2;
171         *fft1++ = t + MULSHIFT32(cms2, ar2);    /* cos*ar1 + sin*ai1 */
172     }
173 }
174
175 /**************************************************************************************
176  * Function:    PreMultiplyRescale
177  *
178  * Description: pre-twiddle stage of DCT4, with rescaling for extra guard bits
179  *
180  * Inputs:      table index (for transform size)
181  *              buffer of nmdct samples
182  *              number of guard bits to add to input before processing
183  *
184  * Outputs:     processed samples in same buffer
185  *
186  * Return:      none
187  *
188  * Notes:       see notes on PreMultiply(), above
189  **************************************************************************************/
190 static void PreMultiplyRescale(int tabidx, int *zbuf1, int es)
191 {
192     int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
193     int t, cms2, cps2a, sin2a, cps2b, sin2b;
194     int *zbuf2;
195     const int *csptr;
196
197     nmdct = nmdctTab[tabidx];
198     zbuf2 = zbuf1 + nmdct - 1;
199     csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
200
201     /* whole thing should fit in registers - verify that compiler does this */
202     for (i = nmdct >> 2; i != 0; i--) {
203         /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
204         cps2a = *csptr++;
205         sin2a = *csptr++;
206         cps2b = *csptr++;
207         sin2b = *csptr++;
208
209         ar1 = *(zbuf1 + 0) >> es;
210         ai1 = *(zbuf2 + 0) >> es;
211         ai2 = *(zbuf1 + 1) >> es;
212
213         t  = MULSHIFT32(sin2a, ar1 + ai1);
214         z2 = MULSHIFT32(cps2a, ai1) - t;
215         cms2 = cps2a - 2 * sin2a;
216         z1 = MULSHIFT32(cms2, ar1) + t;
217         *zbuf1++ = z1;
218         *zbuf1++ = z2;
219
220         ar2 = *(zbuf2 - 1) >> es;   /* do here to free up register used for es */
221
222         t  = MULSHIFT32(sin2b, ar2 + ai2);
223         z2 = MULSHIFT32(cps2b, ai2) - t;
224         cms2 = cps2b - 2 * sin2b;
225         z1 = MULSHIFT32(cms2, ar2) + t;
226         *zbuf2-- = z2;
227         *zbuf2-- = z1;
228
229     }
230 }
231
232 /**************************************************************************************
233  * Function:    PostMultiplyRescale
234  *
235  * Description: post-twiddle stage of DCT4, with rescaling for extra guard bits
236  *
237  * Inputs:      table index (for transform size)
238  *              buffer of nmdct samples
239  *              number of guard bits to remove from output
240  *
241  * Outputs:     processed samples in same buffer
242  *
243  * Return:      none
244  *
245  * Notes:       clips output to [-2^30, 2^30 - 1], guaranteeing at least 1 guard bit
246  *              see notes on PostMultiply(), above
247  **************************************************************************************/
248 static void PostMultiplyRescale(int tabidx, int *fft1, int es)
249 {
250     int i, nmdct, ar1, ai1, ar2, ai2, skipFactor, z;
251     int t, cs2, sin2;
252     int *fft2;
253     const int *csptr;
254
255     nmdct = nmdctTab[tabidx];
256     csptr = cos1sin1tab;
257     skipFactor = postSkip[tabidx];
258     fft2 = fft1 + nmdct - 1;
259
260     /* load coeffs for first pass
261      * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
262      */
263     cs2 = *csptr++;
264     sin2 = *csptr;
265     csptr += skipFactor;
266
267     for (i = nmdct >> 2; i != 0; i--) {
268         ar1 = *(fft1 + 0);
269         ai1 = *(fft1 + 1);
270         ai2 = *(fft2 + 0);
271
272         t = MULSHIFT32(sin2, ar1 + ai1);
273         z = t - MULSHIFT32(cs2, ai1);
274         CLIP_2N_SHIFT(z, es);
275         *fft2-- = z;
276         cs2 -= 2 * sin2;
277         z = t + MULSHIFT32(cs2, ar1);
278         CLIP_2N_SHIFT(z, es);
279         *fft1++ = z;
280
281         cs2 = *csptr++;
282         sin2 = *csptr;
283         csptr += skipFactor;
284
285         ar2 = *fft2;
286         ai2 = -ai2;
287         t = MULSHIFT32(sin2, ar2 + ai2);
288         z = t - MULSHIFT32(cs2, ai2);
289         CLIP_2N_SHIFT(z, es);
290         *fft2-- = z;
291         cs2 -= 2 * sin2;
292         z = t + MULSHIFT32(cs2, ar2);
293         CLIP_2N_SHIFT(z, es);
294         *fft1++ = z;
295         cs2 += 2 * sin2;
296     }
297 }
298
299 /**************************************************************************************
300  * Function:    DCT4
301  *
302  * Description: type-IV DCT
303  *
304  * Inputs:      table index (for transform size)
305  *              buffer of nmdct samples
306  *              number of guard bits in the input buffer
307  *
308  * Outputs:     processed samples in same buffer
309  *
310  * Return:      none
311  *
312  * Notes:       operates in-place
313  *              if number of guard bits in input is < GBITS_IN_DCT4, the input is
314  *                scaled (>>) before the DCT4 and rescaled (<<, with clipping) after
315  *                the DCT4 (rare)
316  *              the output has FBITS_LOST_DCT4 fewer fraction bits than the input
317  *              the output will always have at least 1 guard bit (GBITS_IN_DCT4 >= 4)
318  *              int bits gained per stage (PreMul + FFT + PostMul)
319  *                 short blocks = (-5 + 4 + 2) = 1 total
320  *                 long blocks =  (-8 + 7 + 2) = 1 total
321  **************************************************************************************/
322 void DCT4(int tabidx, int *coef, int gb)
323 {
324     int es;
325
326     /* fast in-place DCT-IV - adds guard bits if necessary */
327     if (gb < GBITS_IN_DCT4) {
328         es = GBITS_IN_DCT4 - gb;
329         PreMultiplyRescale(tabidx, coef, es);
330         R4FFT(tabidx, coef);
331         PostMultiplyRescale(tabidx, coef, es);
332     } else {
333         PreMultiply(tabidx, coef);
334         R4FFT(tabidx, coef);
335         PostMultiply(tabidx, coef);
336     }
337 }
338
1	/* *** BEGIN LICENSE BLOCK ***
2	* Source last modified: $Id: dct4.c,v 1.1.2.1 2005/02/26 02:05:12 jrecker Exp $
3	*
4	* Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
5	*
6	* The contents of this file, and the files included with this file,
7	* are subject to the current version of the RealNetworks Public
8	* Source License (the "RPSL") available at
9	* http://www.helixcommunity.org/content/rpsl unless you have licensed
10	* the file under the current version of the RealNetworks Community
11	* Source License (the "RCSL") available at
12	* http://www.helixcommunity.org/content/rcsl, in which case the RCSL
13	* will apply. You may also obtain the license terms directly from
14	* RealNetworks. You may not use this file except in compliance with
15	* the RPSL or, if you have a valid RCSL with RealNetworks applicable
16	* to this file, the RCSL. Please see the applicable RPSL or RCSL for
17	* the rights, obligations and limitations governing use of the
18	* contents of the file.
19	*
20	* This file is part of the Helix DNA Technology. RealNetworks is the
21	* developer of the Original Code and owns the copyrights in the
22	* portions it created.
23	*
24	* This file, and the files included with this file, is distributed
25	* and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
26	* KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
27	* ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
28	* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
29	* ENJOYMENT OR NON-INFRINGEMENT.
30	*
31	* Technology Compatibility Kit Test Suite(s) Location:
32	* http://www.helixcommunity.org/content/tck
33	*
34	* Contributor(s):
35	*
36	* *** END LICENSE BLOCK *** */
37
38	/**************************************************************************************
39	* Fixed-point HE-AAC decoder
40	* Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
41	* February 2005
42	*
43	* dct4.c - optimized DCT-IV
44	**************************************************************************************/
45
46	#include "coder.h"
47	#include "assembly.h"
48
49	static const int nmdctTab[NUM_IMDCT_SIZES] = {128, 1024};
50	static const int postSkip[NUM_IMDCT_SIZES] = {15, 1};
51
52	/**************************************************************************************
53	* Function: PreMultiply
54	*
55	* Description: pre-twiddle stage of DCT4
56	*
57	* Inputs: table index (for transform size)
58	* buffer of nmdct samples
59	*
60	* Outputs: processed samples in same buffer
61	*
62	* Return: none
63	*
64	* Notes: minimum 1 GB in, 2 GB out, gains 5 (short) or 8 (long) frac bits
65	* i.e. gains 2-7= -5 int bits (short) or 2-10 = -8 int bits (long)
66	* normalization by -1/N is rolled into tables here (see trigtabs.c)
67	* uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
68	**************************************************************************************/
69	static void PreMultiply(int tabidx, int *zbuf1)
70	{
71	int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
72	int t, cms2, cps2a, sin2a, cps2b, sin2b;
73	int *zbuf2;
74	const int *csptr;
75
76	nmdct = nmdctTab[tabidx];
77	zbuf2 = zbuf1 + nmdct - 1;
78	csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
79
80	/* whole thing should fit in registers - verify that compiler does this */
81	for (i = nmdct >> 2; i != 0; i--) {
82	/* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
83	cps2a = *csptr++;
84	sin2a = *csptr++;
85	cps2b = *csptr++;
86	sin2b = *csptr++;
87
88	ar1 = *(zbuf1 + 0);
89	ai2 = *(zbuf1 + 1);
90	ai1 = *(zbuf2 + 0);
91	ar2 = *(zbuf2 - 1);
92
93	/* gain 2 ints bit from MULSHIFT32 by Q30, but drop 7 or 10 int bits from table scaling of 1/M
94	* max per-sample gain (ignoring implicit scaling) = MAX(sin(angle)+cos(angle)) = 1.414
95	* i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
96	* extra sign bits, and eat one in adding
97	*/
98	t = MULSHIFT32(sin2a, ar1 + ai1);
99	z2 = MULSHIFT32(cps2a, ai1) - t;
100	cms2 = cps2a - 2 * sin2a;
101	z1 = MULSHIFT32(cms2, ar1) + t;
102	zbuf1++ = z1; / cosar1 + sinai1 */
103	zbuf1++ = z2; / cosai1 - sinar1 */
104
105	t = MULSHIFT32(sin2b, ar2 + ai2);
106	z2 = MULSHIFT32(cps2b, ai2) - t;
107	cms2 = cps2b - 2 * sin2b;
108	z1 = MULSHIFT32(cms2, ar2) + t;
109	zbuf2-- = z2; / cosai2 - sinar2 */
110	zbuf2-- = z1; / cosar2 + sinai2 */
111	}
112	}
113
114	/**************************************************************************************
115	* Function: PostMultiply
116	*
117	* Description: post-twiddle stage of DCT4
118	*
119	* Inputs: table index (for transform size)
120	* buffer of nmdct samples
121	*
122	* Outputs: processed samples in same buffer
123	*
124	* Return: none
125	*
126	* Notes: minimum 1 GB in, 2 GB out - gains 2 int bits
127	* uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
128	**************************************************************************************/
129	static void PostMultiply(int tabidx, int *fft1)
130	{
131	int i, nmdct, ar1, ai1, ar2, ai2, skipFactor;
132	int t, cms2, cps2, sin2;
133	int *fft2;
134	const int *csptr;
135
136	nmdct = nmdctTab[tabidx];
137	csptr = cos1sin1tab;
138	skipFactor = postSkip[tabidx];
139	fft2 = fft1 + nmdct - 1;
140
141	/* load coeffs for first pass
142	* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
143	*/
144	cps2 = *csptr++;
145	sin2 = *csptr;
146	csptr += skipFactor;
147	cms2 = cps2 - 2 * sin2;
148
149	for (i = nmdct >> 2; i != 0; i--) {
150	ar1 = *(fft1 + 0);
151	ai1 = *(fft1 + 1);
152	ar2 = *(fft2 - 1);
153	ai2 = *(fft2 + 0);
154
155	/* gain 2 ints bit from MULSHIFT32 by Q30
156	* max per-sample gain = MAX(sin(angle)+cos(angle)) = 1.414
157	* i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
158	* extra sign bits, and eat one in adding
159	*/
160	t = MULSHIFT32(sin2, ar1 + ai1);
161	fft2-- = t - MULSHIFT32(cps2, ai1); / sinar1 - cosai1 */
162	fft1++ = t + MULSHIFT32(cms2, ar1); / cosar1 + sinai1 */
163	cps2 = *csptr++;
164	sin2 = *csptr;
165	csptr += skipFactor;
166
167	ai2 = -ai2;
168	t = MULSHIFT32(sin2, ar2 + ai2);
169	fft2-- = t - MULSHIFT32(cps2, ai2); / sinar1 - cosai1 */
170	cms2 = cps2 - 2 * sin2;
171	fft1++ = t + MULSHIFT32(cms2, ar2); / cosar1 + sinai1 */
172	}
173	}
174
175	/**************************************************************************************
176	* Function: PreMultiplyRescale
177	*
178	* Description: pre-twiddle stage of DCT4, with rescaling for extra guard bits
179	*
180	* Inputs: table index (for transform size)
181	* buffer of nmdct samples
182	* number of guard bits to add to input before processing
183	*
184	* Outputs: processed samples in same buffer
185	*
186	* Return: none
187	*
188	* Notes: see notes on PreMultiply(), above
189	**************************************************************************************/
190	static void PreMultiplyRescale(int tabidx, int *zbuf1, int es)
191	{
192	int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
193	int t, cms2, cps2a, sin2a, cps2b, sin2b;
194	int *zbuf2;
195	const int *csptr;
196
197	nmdct = nmdctTab[tabidx];
198	zbuf2 = zbuf1 + nmdct - 1;
199	csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
200
201	/* whole thing should fit in registers - verify that compiler does this */
202	for (i = nmdct >> 2; i != 0; i--) {
203	/* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
204	cps2a = *csptr++;
205	sin2a = *csptr++;
206	cps2b = *csptr++;
207	sin2b = *csptr++;
208
209	ar1 = *(zbuf1 + 0) >> es;
210	ai1 = *(zbuf2 + 0) >> es;
211	ai2 = *(zbuf1 + 1) >> es;
212
213	t = MULSHIFT32(sin2a, ar1 + ai1);
214	z2 = MULSHIFT32(cps2a, ai1) - t;
215	cms2 = cps2a - 2 * sin2a;
216	z1 = MULSHIFT32(cms2, ar1) + t;
217	*zbuf1++ = z1;
218	*zbuf1++ = z2;
219
220	ar2 = (zbuf2 - 1) >> es; / do here to free up register used for es */
221
222	t = MULSHIFT32(sin2b, ar2 + ai2);
223	z2 = MULSHIFT32(cps2b, ai2) - t;
224	cms2 = cps2b - 2 * sin2b;
225	z1 = MULSHIFT32(cms2, ar2) + t;
226	*zbuf2-- = z2;
227	*zbuf2-- = z1;
228
229	}
230	}
231
232	/**************************************************************************************
233	* Function: PostMultiplyRescale
234	*
235	* Description: post-twiddle stage of DCT4, with rescaling for extra guard bits
236	*
237	* Inputs: table index (for transform size)
238	* buffer of nmdct samples
239	* number of guard bits to remove from output
240	*
241	* Outputs: processed samples in same buffer
242	*
243	* Return: none
244	*
245	* Notes: clips output to [-2^30, 2^30 - 1], guaranteeing at least 1 guard bit
246	* see notes on PostMultiply(), above
247	**************************************************************************************/
248	static void PostMultiplyRescale(int tabidx, int *fft1, int es)
249	{
250	int i, nmdct, ar1, ai1, ar2, ai2, skipFactor, z;
251	int t, cs2, sin2;
252	int *fft2;
253	const int *csptr;
254
255	nmdct = nmdctTab[tabidx];
256	csptr = cos1sin1tab;
257	skipFactor = postSkip[tabidx];
258	fft2 = fft1 + nmdct - 1;
259
260	/* load coeffs for first pass
261	* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
262	*/
263	cs2 = *csptr++;
264	sin2 = *csptr;
265	csptr += skipFactor;
266
267	for (i = nmdct >> 2; i != 0; i--) {
268	ar1 = *(fft1 + 0);
269	ai1 = *(fft1 + 1);
270	ai2 = *(fft2 + 0);
271
272	t = MULSHIFT32(sin2, ar1 + ai1);
273	z = t - MULSHIFT32(cs2, ai1);
274	CLIP_2N_SHIFT(z, es);
275	*fft2-- = z;
276	cs2 -= 2 * sin2;
277	z = t + MULSHIFT32(cs2, ar1);
278	CLIP_2N_SHIFT(z, es);
279	*fft1++ = z;
280
281	cs2 = *csptr++;
282	sin2 = *csptr;
283	csptr += skipFactor;
284
285	ar2 = *fft2;
286	ai2 = -ai2;
287	t = MULSHIFT32(sin2, ar2 + ai2);
288	z = t - MULSHIFT32(cs2, ai2);
289	CLIP_2N_SHIFT(z, es);
290	*fft2-- = z;
291	cs2 -= 2 * sin2;
292	z = t + MULSHIFT32(cs2, ar2);
293	CLIP_2N_SHIFT(z, es);
294	*fft1++ = z;
295	cs2 += 2 * sin2;
296	}
297	}
298
299	/**************************************************************************************
300	* Function: DCT4
301	*
302	* Description: type-IV DCT
303	*
304	* Inputs: table index (for transform size)
305	* buffer of nmdct samples
306	* number of guard bits in the input buffer
307	*
308	* Outputs: processed samples in same buffer
309	*
310	* Return: none
311	*
312	* Notes: operates in-place
313	* if number of guard bits in input is < GBITS_IN_DCT4, the input is
314	* scaled (>>) before the DCT4 and rescaled (<<, with clipping) after
315	* the DCT4 (rare)
316	* the output has FBITS_LOST_DCT4 fewer fraction bits than the input
317	* the output will always have at least 1 guard bit (GBITS_IN_DCT4 >= 4)
318	* int bits gained per stage (PreMul + FFT + PostMul)
319	* short blocks = (-5 + 4 + 2) = 1 total
320	* long blocks = (-8 + 7 + 2) = 1 total
321	**************************************************************************************/
322	void DCT4(int tabidx, int *coef, int gb)
323	{
324	int es;
325
326	/* fast in-place DCT-IV - adds guard bits if necessary */
327	if (gb < GBITS_IN_DCT4) {
328	es = GBITS_IN_DCT4 - gb;
329	PreMultiplyRescale(tabidx, coef, es);
330	R4FFT(tabidx, coef);
331	PostMultiplyRescale(tabidx, coef, es);
332	} else {
333	PreMultiply(tabidx, coef);
334	R4FFT(tabidx, coef);
335	PostMultiply(tabidx, coef);
336	}
337	}
338