summaryrefslogtreecommitdiff
path: root/audio_codec/wfd_aac_decoder/dct4.c (plain)
blob: 3182479f56e4e66fb726841a2e44899743ce30c6
1/* ***** BEGIN LICENSE BLOCK *****
2 * Source last modified: $Id: dct4.c,v 1.1.2.1 2005/02/26 02:05:12 jrecker Exp $
3 *
4 * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved.
5 *
6 * The contents of this file, and the files included with this file,
7 * are subject to the current version of the RealNetworks Public
8 * Source License (the "RPSL") available at
9 * http://www.helixcommunity.org/content/rpsl unless you have licensed
10 * the file under the current version of the RealNetworks Community
11 * Source License (the "RCSL") available at
12 * http://www.helixcommunity.org/content/rcsl, in which case the RCSL
13 * will apply. You may also obtain the license terms directly from
14 * RealNetworks. You may not use this file except in compliance with
15 * the RPSL or, if you have a valid RCSL with RealNetworks applicable
16 * to this file, the RCSL. Please see the applicable RPSL or RCSL for
17 * the rights, obligations and limitations governing use of the
18 * contents of the file.
19 *
20 * This file is part of the Helix DNA Technology. RealNetworks is the
21 * developer of the Original Code and owns the copyrights in the
22 * portions it created.
23 *
24 * This file, and the files included with this file, is distributed
25 * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY
26 * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS
27 * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES
28 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET
29 * ENJOYMENT OR NON-INFRINGEMENT.
30 *
31 * Technology Compatibility Kit Test Suite(s) Location:
32 * http://www.helixcommunity.org/content/tck
33 *
34 * Contributor(s):
35 *
36 * ***** END LICENSE BLOCK ***** */
37
38/**************************************************************************************
39 * Fixed-point HE-AAC decoder
40 * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
41 * February 2005
42 *
43 * dct4.c - optimized DCT-IV
44 **************************************************************************************/
45
46#include "coder.h"
47#include "assembly.h"
48
49static const int nmdctTab[NUM_IMDCT_SIZES] = {128, 1024};
50static const int postSkip[NUM_IMDCT_SIZES] = {15, 1};
51
52/**************************************************************************************
53 * Function: PreMultiply
54 *
55 * Description: pre-twiddle stage of DCT4
56 *
57 * Inputs: table index (for transform size)
58 * buffer of nmdct samples
59 *
60 * Outputs: processed samples in same buffer
61 *
62 * Return: none
63 *
64 * Notes: minimum 1 GB in, 2 GB out, gains 5 (short) or 8 (long) frac bits
65 * i.e. gains 2-7= -5 int bits (short) or 2-10 = -8 int bits (long)
66 * normalization by -1/N is rolled into tables here (see trigtabs.c)
67 * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
68 **************************************************************************************/
69static void PreMultiply(int tabidx, int *zbuf1)
70{
71 int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
72 int t, cms2, cps2a, sin2a, cps2b, sin2b;
73 int *zbuf2;
74 const int *csptr;
75
76 nmdct = nmdctTab[tabidx];
77 zbuf2 = zbuf1 + nmdct - 1;
78 csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
79
80 /* whole thing should fit in registers - verify that compiler does this */
81 for (i = nmdct >> 2; i != 0; i--) {
82 /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
83 cps2a = *csptr++;
84 sin2a = *csptr++;
85 cps2b = *csptr++;
86 sin2b = *csptr++;
87
88 ar1 = *(zbuf1 + 0);
89 ai2 = *(zbuf1 + 1);
90 ai1 = *(zbuf2 + 0);
91 ar2 = *(zbuf2 - 1);
92
93 /* gain 2 ints bit from MULSHIFT32 by Q30, but drop 7 or 10 int bits from table scaling of 1/M
94 * max per-sample gain (ignoring implicit scaling) = MAX(sin(angle)+cos(angle)) = 1.414
95 * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
96 * extra sign bits, and eat one in adding
97 */
98 t = MULSHIFT32(sin2a, ar1 + ai1);
99 z2 = MULSHIFT32(cps2a, ai1) - t;
100 cms2 = cps2a - 2 * sin2a;
101 z1 = MULSHIFT32(cms2, ar1) + t;
102 *zbuf1++ = z1; /* cos*ar1 + sin*ai1 */
103 *zbuf1++ = z2; /* cos*ai1 - sin*ar1 */
104
105 t = MULSHIFT32(sin2b, ar2 + ai2);
106 z2 = MULSHIFT32(cps2b, ai2) - t;
107 cms2 = cps2b - 2 * sin2b;
108 z1 = MULSHIFT32(cms2, ar2) + t;
109 *zbuf2-- = z2; /* cos*ai2 - sin*ar2 */
110 *zbuf2-- = z1; /* cos*ar2 + sin*ai2 */
111 }
112}
113
114/**************************************************************************************
115 * Function: PostMultiply
116 *
117 * Description: post-twiddle stage of DCT4
118 *
119 * Inputs: table index (for transform size)
120 * buffer of nmdct samples
121 *
122 * Outputs: processed samples in same buffer
123 *
124 * Return: none
125 *
126 * Notes: minimum 1 GB in, 2 GB out - gains 2 int bits
127 * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add
128 **************************************************************************************/
129static void PostMultiply(int tabidx, int *fft1)
130{
131 int i, nmdct, ar1, ai1, ar2, ai2, skipFactor;
132 int t, cms2, cps2, sin2;
133 int *fft2;
134 const int *csptr;
135
136 nmdct = nmdctTab[tabidx];
137 csptr = cos1sin1tab;
138 skipFactor = postSkip[tabidx];
139 fft2 = fft1 + nmdct - 1;
140
141 /* load coeffs for first pass
142 * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
143 */
144 cps2 = *csptr++;
145 sin2 = *csptr;
146 csptr += skipFactor;
147 cms2 = cps2 - 2 * sin2;
148
149 for (i = nmdct >> 2; i != 0; i--) {
150 ar1 = *(fft1 + 0);
151 ai1 = *(fft1 + 1);
152 ar2 = *(fft2 - 1);
153 ai2 = *(fft2 + 0);
154
155 /* gain 2 ints bit from MULSHIFT32 by Q30
156 * max per-sample gain = MAX(sin(angle)+cos(angle)) = 1.414
157 * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from
158 * extra sign bits, and eat one in adding
159 */
160 t = MULSHIFT32(sin2, ar1 + ai1);
161 *fft2-- = t - MULSHIFT32(cps2, ai1); /* sin*ar1 - cos*ai1 */
162 *fft1++ = t + MULSHIFT32(cms2, ar1); /* cos*ar1 + sin*ai1 */
163 cps2 = *csptr++;
164 sin2 = *csptr;
165 csptr += skipFactor;
166
167 ai2 = -ai2;
168 t = MULSHIFT32(sin2, ar2 + ai2);
169 *fft2-- = t - MULSHIFT32(cps2, ai2); /* sin*ar1 - cos*ai1 */
170 cms2 = cps2 - 2 * sin2;
171 *fft1++ = t + MULSHIFT32(cms2, ar2); /* cos*ar1 + sin*ai1 */
172 }
173}
174
175/**************************************************************************************
176 * Function: PreMultiplyRescale
177 *
178 * Description: pre-twiddle stage of DCT4, with rescaling for extra guard bits
179 *
180 * Inputs: table index (for transform size)
181 * buffer of nmdct samples
182 * number of guard bits to add to input before processing
183 *
184 * Outputs: processed samples in same buffer
185 *
186 * Return: none
187 *
188 * Notes: see notes on PreMultiply(), above
189 **************************************************************************************/
190static void PreMultiplyRescale(int tabidx, int *zbuf1, int es)
191{
192 int i, nmdct, ar1, ai1, ar2, ai2, z1, z2;
193 int t, cms2, cps2a, sin2a, cps2b, sin2b;
194 int *zbuf2;
195 const int *csptr;
196
197 nmdct = nmdctTab[tabidx];
198 zbuf2 = zbuf1 + nmdct - 1;
199 csptr = cos4sin4tab + cos4sin4tabOffset[tabidx];
200
201 /* whole thing should fit in registers - verify that compiler does this */
202 for (i = nmdct >> 2; i != 0; i--) {
203 /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */
204 cps2a = *csptr++;
205 sin2a = *csptr++;
206 cps2b = *csptr++;
207 sin2b = *csptr++;
208
209 ar1 = *(zbuf1 + 0) >> es;
210 ai1 = *(zbuf2 + 0) >> es;
211 ai2 = *(zbuf1 + 1) >> es;
212
213 t = MULSHIFT32(sin2a, ar1 + ai1);
214 z2 = MULSHIFT32(cps2a, ai1) - t;
215 cms2 = cps2a - 2 * sin2a;
216 z1 = MULSHIFT32(cms2, ar1) + t;
217 *zbuf1++ = z1;
218 *zbuf1++ = z2;
219
220 ar2 = *(zbuf2 - 1) >> es; /* do here to free up register used for es */
221
222 t = MULSHIFT32(sin2b, ar2 + ai2);
223 z2 = MULSHIFT32(cps2b, ai2) - t;
224 cms2 = cps2b - 2 * sin2b;
225 z1 = MULSHIFT32(cms2, ar2) + t;
226 *zbuf2-- = z2;
227 *zbuf2-- = z1;
228
229 }
230}
231
232/**************************************************************************************
233 * Function: PostMultiplyRescale
234 *
235 * Description: post-twiddle stage of DCT4, with rescaling for extra guard bits
236 *
237 * Inputs: table index (for transform size)
238 * buffer of nmdct samples
239 * number of guard bits to remove from output
240 *
241 * Outputs: processed samples in same buffer
242 *
243 * Return: none
244 *
245 * Notes: clips output to [-2^30, 2^30 - 1], guaranteeing at least 1 guard bit
246 * see notes on PostMultiply(), above
247 **************************************************************************************/
248static void PostMultiplyRescale(int tabidx, int *fft1, int es)
249{
250 int i, nmdct, ar1, ai1, ar2, ai2, skipFactor, z;
251 int t, cs2, sin2;
252 int *fft2;
253 const int *csptr;
254
255 nmdct = nmdctTab[tabidx];
256 csptr = cos1sin1tab;
257 skipFactor = postSkip[tabidx];
258 fft2 = fft1 + nmdct - 1;
259
260 /* load coeffs for first pass
261 * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin)
262 */
263 cs2 = *csptr++;
264 sin2 = *csptr;
265 csptr += skipFactor;
266
267 for (i = nmdct >> 2; i != 0; i--) {
268 ar1 = *(fft1 + 0);
269 ai1 = *(fft1 + 1);
270 ai2 = *(fft2 + 0);
271
272 t = MULSHIFT32(sin2, ar1 + ai1);
273 z = t - MULSHIFT32(cs2, ai1);
274 CLIP_2N_SHIFT(z, es);
275 *fft2-- = z;
276 cs2 -= 2 * sin2;
277 z = t + MULSHIFT32(cs2, ar1);
278 CLIP_2N_SHIFT(z, es);
279 *fft1++ = z;
280
281 cs2 = *csptr++;
282 sin2 = *csptr;
283 csptr += skipFactor;
284
285 ar2 = *fft2;
286 ai2 = -ai2;
287 t = MULSHIFT32(sin2, ar2 + ai2);
288 z = t - MULSHIFT32(cs2, ai2);
289 CLIP_2N_SHIFT(z, es);
290 *fft2-- = z;
291 cs2 -= 2 * sin2;
292 z = t + MULSHIFT32(cs2, ar2);
293 CLIP_2N_SHIFT(z, es);
294 *fft1++ = z;
295 cs2 += 2 * sin2;
296 }
297}
298
299/**************************************************************************************
300 * Function: DCT4
301 *
302 * Description: type-IV DCT
303 *
304 * Inputs: table index (for transform size)
305 * buffer of nmdct samples
306 * number of guard bits in the input buffer
307 *
308 * Outputs: processed samples in same buffer
309 *
310 * Return: none
311 *
312 * Notes: operates in-place
313 * if number of guard bits in input is < GBITS_IN_DCT4, the input is
314 * scaled (>>) before the DCT4 and rescaled (<<, with clipping) after
315 * the DCT4 (rare)
316 * the output has FBITS_LOST_DCT4 fewer fraction bits than the input
317 * the output will always have at least 1 guard bit (GBITS_IN_DCT4 >= 4)
318 * int bits gained per stage (PreMul + FFT + PostMul)
319 * short blocks = (-5 + 4 + 2) = 1 total
320 * long blocks = (-8 + 7 + 2) = 1 total
321 **************************************************************************************/
322void DCT4(int tabidx, int *coef, int gb)
323{
324 int es;
325
326 /* fast in-place DCT-IV - adds guard bits if necessary */
327 if (gb < GBITS_IN_DCT4) {
328 es = GBITS_IN_DCT4 - gb;
329 PreMultiplyRescale(tabidx, coef, es);
330 R4FFT(tabidx, coef);
331 PostMultiplyRescale(tabidx, coef, es);
332 } else {
333 PreMultiply(tabidx, coef);
334 R4FFT(tabidx, coef);
335 PostMultiply(tabidx, coef);
336 }
337}
338