platform/hardware/amlogic/LibAudio.git - Unnamed repository; edit this file 'description' to name the repository.

1 /* ***** BEGIN LICENSE BLOCK *****
2  * Source last modified: $Id: dct4.c,v 1.1 2005/02/26 01:47:34 jrecker Exp $
3  *
4  * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
5  *
6  * The contents of this file, and the files included with this file,
7  * are subject to the current version of the RealNetworks Public
8  * Source License (the "RPSL") available at
9  * http://www.helixcommunity.org/content/rpsl unless you have licensed
10  * the file under the current version of the RealNetworks Community
11  * Source License (the "RCSL") available at
12  * http://www.helixcommunity.org/content/rcsl, in which case the RCSL
13  * will apply. You may also obtain the license terms directly from
14  * RealNetworks.  You may not use this file except in compliance with
15  * the RPSL or, if you have a valid RCSL with RealNetworks applicable
16  * to this file, the RCSL.  Please see the applicable RPSL or RCSL for
17  * the rights, obligations and limitations governing use of the
18  * contents of the file.
19  *
20  * This file is part of the Helix DNA Technology. RealNetworks is the
21  * developer of the Original Code and owns the copyrights in the
22  * portions it created.
23  *
24  * This file, and the files included with this file, is distributed
25  * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
26  * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
27  * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
28  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
29  * ENJOYMENT OR NON-INFRINGEMENT.
30  *
31  * Technology Compatibility Kit Test Suite(s) Location:
32  *    http://www.helixcommunity.org/content/tck
33  *
34  * Contributor(s):
35  *
36  * ***** END LICENSE BLOCK ***** */
37
38 /**************************************************************************************
39  * Fixed-point HE-AAC decoder
40  * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
41  * February 2005
42  *
43  * dct4.c - optimized DCT-IV
44  **************************************************************************************/
45
46 #include "coder.h"
47
48 #include "assembly.h"
49
50
51 static const int nmdctTab[NUM_IMDCT_SIZES] = {128, 1024};
52 static const int postSkip[NUM_IMDCT_SIZES] = {15, 1};
53
54 /**************************************************************************************
55  * Function:    PreMultiply
56  *
57  * Description: pre-twiddle stage of DCT4
58  *
59  * Inputs:      table index (for transform size)
60  *              buffer of nmdct samples
61  *
62  * Outputs:     processed samples in same buffer
63  *
64  * Return:      none
65  *
66  * Notes:       minimum 1 GB in, 2 GB out, gains 5 (short) or 8 (long) frac bits
67  *              i.e. gains 2-7= -5 int bits (short) or 2-10 = -8 int bits (long)
68  *              normalization by -1/N is rolled into tables here (see trigtabs.c)
69  *              uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
70  **************************************************************************************/
71 static void PreMultiply(int tabidx, int *zbuf1)
72 {
73     int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
74     int t, cms2, cps2a, sin2a, cps2b, sin2b;
75     int *zbuf2;
76     const int *csptr;
77
78     nmdct = nmdctTab[tabidx];
79     zbuf2 = zbuf1 + nmdct - 1;
80     csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
81
82     /* whole thing should fit in registers - verify that compiler does this */
83     for (i = nmdct >> 2; i != 0; i--) {
84         /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
85         cps2a = *csptr++;
86         sin2a = *csptr++;
87         cps2b = *csptr++;
88         sin2b = *csptr++;
89
90         ar1 = *(zbuf1 + 0);
91         ai2 = *(zbuf1 + 1);
92         ai1 = *(zbuf2 + 0);
93         ar2 = *(zbuf2 - 1);
94
95         /* gain 2 ints bit from MULSHIFT32 by Q30, but drop 7 or 10 int bits from table scaling of 1/M
96          * max per-sample gain (ignoring implicit scaling) = MAX(sin(angle)+cos(angle)) = 1.414
97          * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
98          *   extra sign bits, and eat one in adding
99          */
100         t  = MULSHIFT32(sin2a, ar1 + ai1);
101         z2 = MULSHIFT32(cps2a, ai1) - t;
102         cms2 = cps2a - 2 * sin2a;
103         z1 = MULSHIFT32(cms2, ar1) + t;
104         *zbuf1++ = z1;  /* cos*ar1 + sin*ai1 */
105         *zbuf1++ = z2;  /* cos*ai1 - sin*ar1 */
106
107         t  = MULSHIFT32(sin2b, ar2 + ai2);
108         z2 = MULSHIFT32(cps2b, ai2) - t;
109         cms2 = cps2b - 2 * sin2b;
110         z1 = MULSHIFT32(cms2, ar2) + t;
111         *zbuf2-- = z2;  /* cos*ai2 - sin*ar2 */
112         *zbuf2-- = z1;  /* cos*ar2 + sin*ai2 */
113     }
114 }
115
116 /**************************************************************************************
117  * Function:    PostMultiply
118  *
119  * Description: post-twiddle stage of DCT4
120  *
121  * Inputs:      table index (for transform size)
122  *              buffer of nmdct samples
123  *
124  * Outputs:     processed samples in same buffer
125  *
126  * Return:      none
127  *
128  * Notes:       minimum 1 GB in, 2 GB out - gains 2 int bits
129  *              uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
130  **************************************************************************************/
131 static void PostMultiply(int tabidx, int *fft1)
132 {
133     int i, nmdct, ar1, ai1, ar2, ai2, skipFactor;
134     int t, cms2, cps2, sin2;
135     int *fft2;
136     const int *csptr;
137
138     nmdct = nmdctTab[tabidx];
139     csptr = cos1sin1tab;
140     skipFactor = postSkip[tabidx];
141     fft2 = fft1 + nmdct - 1;
142
143     /* load coeffs for first pass
144      * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
145      */
146     cps2 = *csptr++;
147     sin2 = *csptr;
148     csptr += skipFactor;
149     cms2 = cps2 - 2 * sin2;
150
151     for (i = nmdct >> 2; i != 0; i--) {
152         ar1 = *(fft1 + 0);
153         ai1 = *(fft1 + 1);
154         ar2 = *(fft2 - 1);
155         ai2 = *(fft2 + 0);
156
157         /* gain 2 ints bit from MULSHIFT32 by Q30
158          * max per-sample gain = MAX(sin(angle)+cos(angle)) = 1.414
159          * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
160          *   extra sign bits, and eat one in adding
161          */
162         t = MULSHIFT32(sin2, ar1 + ai1);
163         *fft2-- = t - MULSHIFT32(cps2, ai1);    /* sin*ar1 - cos*ai1 */
164         *fft1++ = t + MULSHIFT32(cms2, ar1);    /* cos*ar1 + sin*ai1 */
165         cps2 = *csptr++;
166         sin2 = *csptr;
167         csptr += skipFactor;
168
169         ai2 = -ai2;
170         t = MULSHIFT32(sin2, ar2 + ai2);
171         *fft2-- = t - MULSHIFT32(cps2, ai2);    /* sin*ar1 - cos*ai1 */
172         cms2 = cps2 - 2 * sin2;
173         *fft1++ = t + MULSHIFT32(cms2, ar2);    /* cos*ar1 + sin*ai1 */
174     }
175 }
176
177 /**************************************************************************************
178  * Function:    PreMultiplyRescale
179  *
180  * Description: pre-twiddle stage of DCT4, with rescaling for extra guard bits
181  *
182  * Inputs:      table index (for transform size)
183  *              buffer of nmdct samples
184  *              number of guard bits to add to input before processing
185  *
186  * Outputs:     processed samples in same buffer
187  *
188  * Return:      none
189  *
190  * Notes:       see notes on PreMultiply(), above
191  **************************************************************************************/
192 static void PreMultiplyRescale(int tabidx, int *zbuf1, int es)
193 {
194     int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
195     int t, cms2, cps2a, sin2a, cps2b, sin2b;
196     int *zbuf2;
197     const int *csptr;
198
199     nmdct = nmdctTab[tabidx];
200     zbuf2 = zbuf1 + nmdct - 1;
201     csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
202
203     /* whole thing should fit in registers - verify that compiler does this */
204     for (i = nmdct >> 2; i != 0; i--) {
205         /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
206         cps2a = *csptr++;
207         sin2a = *csptr++;
208         cps2b = *csptr++;
209         sin2b = *csptr++;
210
211         ar1 = *(zbuf1 + 0) >> es;
212         ai1 = *(zbuf2 + 0) >> es;
213         ai2 = *(zbuf1 + 1) >> es;
214
215         t  = MULSHIFT32(sin2a, ar1 + ai1);
216         z2 = MULSHIFT32(cps2a, ai1) - t;
217         cms2 = cps2a - 2 * sin2a;
218         z1 = MULSHIFT32(cms2, ar1) + t;
219         *zbuf1++ = z1;
220         *zbuf1++ = z2;
221
222         ar2 = *(zbuf2 - 1) >> es;   /* do here to free up register used for es */
223
224         t  = MULSHIFT32(sin2b, ar2 + ai2);
225         z2 = MULSHIFT32(cps2b, ai2) - t;
226         cms2 = cps2b - 2 * sin2b;
227         z1 = MULSHIFT32(cms2, ar2) + t;
228         *zbuf2-- = z2;
229         *zbuf2-- = z1;
230
231     }
232 }
233
234 /**************************************************************************************
235  * Function:    PostMultiplyRescale
236  *
237  * Description: post-twiddle stage of DCT4, with rescaling for extra guard bits
238  *
239  * Inputs:      table index (for transform size)
240  *              buffer of nmdct samples
241  *              number of guard bits to remove from output
242  *
243  * Outputs:     processed samples in same buffer
244  *
245  * Return:      none
246  *
247  * Notes:       clips output to [-2^30, 2^30 - 1], guaranteeing at least 1 guard bit
248  *              see notes on PostMultiply(), above
249  **************************************************************************************/
250 static void PostMultiplyRescale(int tabidx, int *fft1, int es)
251 {
252     int i, nmdct, ar1, ai1, ar2, ai2, skipFactor, z;
253     int t, cs2, sin2;
254     int *fft2;
255     const int *csptr;
256
257     nmdct = nmdctTab[tabidx];
258     csptr = cos1sin1tab;
259     skipFactor = postSkip[tabidx];
260     fft2 = fft1 + nmdct - 1;
261
262     /* load coeffs for first pass
263      * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
264      */
265     cs2 = *csptr++;
266     sin2 = *csptr;
267     csptr += skipFactor;
268
269     for (i = nmdct >> 2; i != 0; i--) {
270         ar1 = *(fft1 + 0);
271         ai1 = *(fft1 + 1);
272         ai2 = *(fft2 + 0);
273
274         t = MULSHIFT32(sin2, ar1 + ai1);
275         z = t - MULSHIFT32(cs2, ai1);
276         CLIP_2N_SHIFT(z, es);
277         *fft2-- = z;
278         cs2 -= 2 * sin2;
279         z = t + MULSHIFT32(cs2, ar1);
280         CLIP_2N_SHIFT(z, es);
281         *fft1++ = z;
282
283         cs2 = *csptr++;
284         sin2 = *csptr;
285         csptr += skipFactor;
286
287         ar2 = *fft2;
288         ai2 = -ai2;
289         t = MULSHIFT32(sin2, ar2 + ai2);
290         z = t - MULSHIFT32(cs2, ai2);
291         CLIP_2N_SHIFT(z, es);
292         *fft2-- = z;
293         cs2 -= 2 * sin2;
294         z = t + MULSHIFT32(cs2, ar2);
295         CLIP_2N_SHIFT(z, es);
296         *fft1++ = z;
297         cs2 += 2 * sin2;
298     }
299 }
300
301 /**************************************************************************************
302  * Function:    DCT4
303  *
304  * Description: type-IV DCT
305  *
306  * Inputs:      table index (for transform size)
307  *              buffer of nmdct samples
308  *              number of guard bits in the input buffer
309  *
310  * Outputs:     processed samples in same buffer
311  *
312  * Return:      none
313  *
314  * Notes:       operates in-place
315  *              if number of guard bits in input is < GBITS_IN_DCT4, the input is
316  *                scaled (>>) before the DCT4 and rescaled (<<, with clipping) after
317  *                the DCT4 (rare)
318  *              the output has FBITS_LOST_DCT4 fewer fraction bits than the input
319  *              the output will always have at least 1 guard bit (GBITS_IN_DCT4 >= 4)
320  *              int bits gained per stage (PreMul + FFT + PostMul)
321  *                 short blocks = (-5 + 4 + 2) = 1 total
322  *                 long blocks =  (-8 + 7 + 2) = 1 total
323  **************************************************************************************/
324 void DCT4(int tabidx, int *coef, int gb)
325 {
326     int es;
327
328     /* fast in-place DCT-IV - adds guard bits if necessary */
329     if (gb < GBITS_IN_DCT4) {
330         es = GBITS_IN_DCT4 - gb;
331         PreMultiplyRescale(tabidx, coef, es);
332         R4FFT(tabidx, coef);
333         PostMultiplyRescale(tabidx, coef, es);
334     } else {
335         PreMultiply(tabidx, coef);
336         R4FFT(tabidx, coef);
337         PostMultiply(tabidx, coef);
338     }
339 }
340
1	/* *** BEGIN LICENSE BLOCK ***
2	* Source last modified: $Id: dct4.c,v 1.1 2005/02/26 01:47:34 jrecker Exp $
3	*
4	* Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
5	*
6	* The contents of this file, and the files included with this file,
7	* are subject to the current version of the RealNetworks Public
8	* Source License (the "RPSL") available at
9	* http://www.helixcommunity.org/content/rpsl unless you have licensed
10	* the file under the current version of the RealNetworks Community
11	* Source License (the "RCSL") available at
12	* http://www.helixcommunity.org/content/rcsl, in which case the RCSL
13	* will apply. You may also obtain the license terms directly from
14	* RealNetworks. You may not use this file except in compliance with
15	* the RPSL or, if you have a valid RCSL with RealNetworks applicable
16	* to this file, the RCSL. Please see the applicable RPSL or RCSL for
17	* the rights, obligations and limitations governing use of the
18	* contents of the file.
19	*
20	* This file is part of the Helix DNA Technology. RealNetworks is the
21	* developer of the Original Code and owns the copyrights in the
22	* portions it created.
23	*
24	* This file, and the files included with this file, is distributed
25	* and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
26	* KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
27	* ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
28	* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
29	* ENJOYMENT OR NON-INFRINGEMENT.
30	*
31	* Technology Compatibility Kit Test Suite(s) Location:
32	* http://www.helixcommunity.org/content/tck
33	*
34	* Contributor(s):
35	*
36	* *** END LICENSE BLOCK *** */
37
38	/**************************************************************************************
39	* Fixed-point HE-AAC decoder
40	* Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
41	* February 2005
42	*
43	* dct4.c - optimized DCT-IV
44	**************************************************************************************/
45
46	#include "coder.h"
47
48	#include "assembly.h"
49
50
51	static const int nmdctTab[NUM_IMDCT_SIZES] = {128, 1024};
52	static const int postSkip[NUM_IMDCT_SIZES] = {15, 1};
53
54	/**************************************************************************************
55	* Function: PreMultiply
56	*
57	* Description: pre-twiddle stage of DCT4
58	*
59	* Inputs: table index (for transform size)
60	* buffer of nmdct samples
61	*
62	* Outputs: processed samples in same buffer
63	*
64	* Return: none
65	*
66	* Notes: minimum 1 GB in, 2 GB out, gains 5 (short) or 8 (long) frac bits
67	* i.e. gains 2-7= -5 int bits (short) or 2-10 = -8 int bits (long)
68	* normalization by -1/N is rolled into tables here (see trigtabs.c)
69	* uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
70	**************************************************************************************/
71	static void PreMultiply(int tabidx, int *zbuf1)
72	{
73	int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
74	int t, cms2, cps2a, sin2a, cps2b, sin2b;
75	int *zbuf2;
76	const int *csptr;
77
78	nmdct = nmdctTab[tabidx];
79	zbuf2 = zbuf1 + nmdct - 1;
80	csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
81
82	/* whole thing should fit in registers - verify that compiler does this */
83	for (i = nmdct >> 2; i != 0; i--) {
84	/* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
85	cps2a = *csptr++;
86	sin2a = *csptr++;
87	cps2b = *csptr++;
88	sin2b = *csptr++;
89
90	ar1 = *(zbuf1 + 0);
91	ai2 = *(zbuf1 + 1);
92	ai1 = *(zbuf2 + 0);
93	ar2 = *(zbuf2 - 1);
94
95	/* gain 2 ints bit from MULSHIFT32 by Q30, but drop 7 or 10 int bits from table scaling of 1/M
96	* max per-sample gain (ignoring implicit scaling) = MAX(sin(angle)+cos(angle)) = 1.414
97	* i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
98	* extra sign bits, and eat one in adding
99	*/
100	t = MULSHIFT32(sin2a, ar1 + ai1);
101	z2 = MULSHIFT32(cps2a, ai1) - t;
102	cms2 = cps2a - 2 * sin2a;
103	z1 = MULSHIFT32(cms2, ar1) + t;
104	zbuf1++ = z1; / cosar1 + sinai1 */
105	zbuf1++ = z2; / cosai1 - sinar1 */
106
107	t = MULSHIFT32(sin2b, ar2 + ai2);
108	z2 = MULSHIFT32(cps2b, ai2) - t;
109	cms2 = cps2b - 2 * sin2b;
110	z1 = MULSHIFT32(cms2, ar2) + t;
111	zbuf2-- = z2; / cosai2 - sinar2 */
112	zbuf2-- = z1; / cosar2 + sinai2 */
113	}
114	}
115
116	/**************************************************************************************
117	* Function: PostMultiply
118	*
119	* Description: post-twiddle stage of DCT4
120	*
121	* Inputs: table index (for transform size)
122	* buffer of nmdct samples
123	*
124	* Outputs: processed samples in same buffer
125	*
126	* Return: none
127	*
128	* Notes: minimum 1 GB in, 2 GB out - gains 2 int bits
129	* uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
130	**************************************************************************************/
131	static void PostMultiply(int tabidx, int *fft1)
132	{
133	int i, nmdct, ar1, ai1, ar2, ai2, skipFactor;
134	int t, cms2, cps2, sin2;
135	int *fft2;
136	const int *csptr;
137
138	nmdct = nmdctTab[tabidx];
139	csptr = cos1sin1tab;
140	skipFactor = postSkip[tabidx];
141	fft2 = fft1 + nmdct - 1;
142
143	/* load coeffs for first pass
144	* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
145	*/
146	cps2 = *csptr++;
147	sin2 = *csptr;
148	csptr += skipFactor;
149	cms2 = cps2 - 2 * sin2;
150
151	for (i = nmdct >> 2; i != 0; i--) {
152	ar1 = *(fft1 + 0);
153	ai1 = *(fft1 + 1);
154	ar2 = *(fft2 - 1);
155	ai2 = *(fft2 + 0);
156
157	/* gain 2 ints bit from MULSHIFT32 by Q30
158	* max per-sample gain = MAX(sin(angle)+cos(angle)) = 1.414
159	* i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
160	* extra sign bits, and eat one in adding
161	*/
162	t = MULSHIFT32(sin2, ar1 + ai1);
163	fft2-- = t - MULSHIFT32(cps2, ai1); / sinar1 - cosai1 */
164	fft1++ = t + MULSHIFT32(cms2, ar1); / cosar1 + sinai1 */
165	cps2 = *csptr++;
166	sin2 = *csptr;
167	csptr += skipFactor;
168
169	ai2 = -ai2;
170	t = MULSHIFT32(sin2, ar2 + ai2);
171	fft2-- = t - MULSHIFT32(cps2, ai2); / sinar1 - cosai1 */
172	cms2 = cps2 - 2 * sin2;
173	fft1++ = t + MULSHIFT32(cms2, ar2); / cosar1 + sinai1 */
174	}
175	}
176
177	/**************************************************************************************
178	* Function: PreMultiplyRescale
179	*
180	* Description: pre-twiddle stage of DCT4, with rescaling for extra guard bits
181	*
182	* Inputs: table index (for transform size)
183	* buffer of nmdct samples
184	* number of guard bits to add to input before processing
185	*
186	* Outputs: processed samples in same buffer
187	*
188	* Return: none
189	*
190	* Notes: see notes on PreMultiply(), above
191	**************************************************************************************/
192	static void PreMultiplyRescale(int tabidx, int *zbuf1, int es)
193	{
194	int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
195	int t, cms2, cps2a, sin2a, cps2b, sin2b;
196	int *zbuf2;
197	const int *csptr;
198
199	nmdct = nmdctTab[tabidx];
200	zbuf2 = zbuf1 + nmdct - 1;
201	csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
202
203	/* whole thing should fit in registers - verify that compiler does this */
204	for (i = nmdct >> 2; i != 0; i--) {
205	/* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
206	cps2a = *csptr++;
207	sin2a = *csptr++;
208	cps2b = *csptr++;
209	sin2b = *csptr++;
210
211	ar1 = *(zbuf1 + 0) >> es;
212	ai1 = *(zbuf2 + 0) >> es;
213	ai2 = *(zbuf1 + 1) >> es;
214
215	t = MULSHIFT32(sin2a, ar1 + ai1);
216	z2 = MULSHIFT32(cps2a, ai1) - t;
217	cms2 = cps2a - 2 * sin2a;
218	z1 = MULSHIFT32(cms2, ar1) + t;
219	*zbuf1++ = z1;
220	*zbuf1++ = z2;
221
222	ar2 = (zbuf2 - 1) >> es; / do here to free up register used for es */
223
224	t = MULSHIFT32(sin2b, ar2 + ai2);
225	z2 = MULSHIFT32(cps2b, ai2) - t;
226	cms2 = cps2b - 2 * sin2b;
227	z1 = MULSHIFT32(cms2, ar2) + t;
228	*zbuf2-- = z2;
229	*zbuf2-- = z1;
230
231	}
232	}
233
234	/**************************************************************************************
235	* Function: PostMultiplyRescale
236	*
237	* Description: post-twiddle stage of DCT4, with rescaling for extra guard bits
238	*
239	* Inputs: table index (for transform size)
240	* buffer of nmdct samples
241	* number of guard bits to remove from output
242	*
243	* Outputs: processed samples in same buffer
244	*
245	* Return: none
246	*
247	* Notes: clips output to [-2^30, 2^30 - 1], guaranteeing at least 1 guard bit
248	* see notes on PostMultiply(), above
249	**************************************************************************************/
250	static void PostMultiplyRescale(int tabidx, int *fft1, int es)
251	{
252	int i, nmdct, ar1, ai1, ar2, ai2, skipFactor, z;
253	int t, cs2, sin2;
254	int *fft2;
255	const int *csptr;
256
257	nmdct = nmdctTab[tabidx];
258	csptr = cos1sin1tab;
259	skipFactor = postSkip[tabidx];
260	fft2 = fft1 + nmdct - 1;
261
262	/* load coeffs for first pass
263	* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
264	*/
265	cs2 = *csptr++;
266	sin2 = *csptr;
267	csptr += skipFactor;
268
269	for (i = nmdct >> 2; i != 0; i--) {
270	ar1 = *(fft1 + 0);
271	ai1 = *(fft1 + 1);
272	ai2 = *(fft2 + 0);
273
274	t = MULSHIFT32(sin2, ar1 + ai1);
275	z = t - MULSHIFT32(cs2, ai1);
276	CLIP_2N_SHIFT(z, es);
277	*fft2-- = z;
278	cs2 -= 2 * sin2;
279	z = t + MULSHIFT32(cs2, ar1);
280	CLIP_2N_SHIFT(z, es);
281	*fft1++ = z;
282
283	cs2 = *csptr++;
284	sin2 = *csptr;
285	csptr += skipFactor;
286
287	ar2 = *fft2;
288	ai2 = -ai2;
289	t = MULSHIFT32(sin2, ar2 + ai2);
290	z = t - MULSHIFT32(cs2, ai2);
291	CLIP_2N_SHIFT(z, es);
292	*fft2-- = z;
293	cs2 -= 2 * sin2;
294	z = t + MULSHIFT32(cs2, ar2);
295	CLIP_2N_SHIFT(z, es);
296	*fft1++ = z;
297	cs2 += 2 * sin2;
298	}
299	}
300
301	/**************************************************************************************
302	* Function: DCT4
303	*
304	* Description: type-IV DCT
305	*
306	* Inputs: table index (for transform size)
307	* buffer of nmdct samples
308	* number of guard bits in the input buffer
309	*
310	* Outputs: processed samples in same buffer
311	*
312	* Return: none
313	*
314	* Notes: operates in-place
315	* if number of guard bits in input is < GBITS_IN_DCT4, the input is
316	* scaled (>>) before the DCT4 and rescaled (<<, with clipping) after
317	* the DCT4 (rare)
318	* the output has FBITS_LOST_DCT4 fewer fraction bits than the input
319	* the output will always have at least 1 guard bit (GBITS_IN_DCT4 >= 4)
320	* int bits gained per stage (PreMul + FFT + PostMul)
321	* short blocks = (-5 + 4 + 2) = 1 total
322	* long blocks = (-8 + 7 + 2) = 1 total
323	**************************************************************************************/
324	void DCT4(int tabidx, int *coef, int gb)
325	{
326	int es;
327
328	/* fast in-place DCT-IV - adds guard bits if necessary */
329	if (gb < GBITS_IN_DCT4) {
330	es = GBITS_IN_DCT4 - gb;
331	PreMultiplyRescale(tabidx, coef, es);
332	R4FFT(tabidx, coef);
333	PostMultiplyRescale(tabidx, coef, es);
334	} else {
335	PreMultiply(tabidx, coef);
336	R4FFT(tabidx, coef);
337	PostMultiply(tabidx, coef);
338	}
339	}
340