blob: cb0a77bccb8449bbf81e9d066e391bcd4c786f1a
1 | /* ***** BEGIN LICENSE BLOCK ***** |
2 | * Source last modified: $Id: dct4.c,v 1.1 2005/02/26 01:47:34 jrecker Exp $ |
3 | * |
4 | * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved. |
5 | * |
6 | * The contents of this file, and the files included with this file, |
7 | * are subject to the current version of the RealNetworks Public |
8 | * Source License (the "RPSL") available at |
9 | * http://www.helixcommunity.org/content/rpsl unless you have licensed |
10 | * the file under the current version of the RealNetworks Community |
11 | * Source License (the "RCSL") available at |
12 | * http://www.helixcommunity.org/content/rcsl, in which case the RCSL |
13 | * will apply. You may also obtain the license terms directly from |
14 | * RealNetworks. You may not use this file except in compliance with |
15 | * the RPSL or, if you have a valid RCSL with RealNetworks applicable |
16 | * to this file, the RCSL. Please see the applicable RPSL or RCSL for |
17 | * the rights, obligations and limitations governing use of the |
18 | * contents of the file. |
19 | * |
20 | * This file is part of the Helix DNA Technology. RealNetworks is the |
21 | * developer of the Original Code and owns the copyrights in the |
22 | * portions it created. |
23 | * |
24 | * This file, and the files included with this file, is distributed |
25 | * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY |
26 | * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS |
27 | * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES |
28 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET |
29 | * ENJOYMENT OR NON-INFRINGEMENT. |
30 | * |
31 | * Technology Compatibility Kit Test Suite(s) Location: |
32 | * http://www.helixcommunity.org/content/tck |
33 | * |
34 | * Contributor(s): |
35 | * |
36 | * ***** END LICENSE BLOCK ***** */ |
37 | |
38 | /************************************************************************************** |
39 | * Fixed-point HE-AAC decoder |
40 | * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com) |
41 | * February 2005 |
42 | * |
43 | * dct4.c - optimized DCT-IV |
44 | **************************************************************************************/ |
45 | |
46 | #include "coder.h" |
47 | |
48 | #include "assembly.h" |
49 | |
50 | |
51 | static const int nmdctTab[NUM_IMDCT_SIZES] = {128, 1024}; |
52 | static const int postSkip[NUM_IMDCT_SIZES] = {15, 1}; |
53 | |
54 | /************************************************************************************** |
55 | * Function: PreMultiply |
56 | * |
57 | * Description: pre-twiddle stage of DCT4 |
58 | * |
59 | * Inputs: table index (for transform size) |
60 | * buffer of nmdct samples |
61 | * |
62 | * Outputs: processed samples in same buffer |
63 | * |
64 | * Return: none |
65 | * |
66 | * Notes: minimum 1 GB in, 2 GB out, gains 5 (short) or 8 (long) frac bits |
67 | * i.e. gains 2-7= -5 int bits (short) or 2-10 = -8 int bits (long) |
68 | * normalization by -1/N is rolled into tables here (see trigtabs.c) |
69 | * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add |
70 | **************************************************************************************/ |
71 | static void PreMultiply(int tabidx, int *zbuf1) |
72 | { |
73 | int i, nmdct, ar1, ai1, ar2, ai2, z1, z2; |
74 | int t, cms2, cps2a, sin2a, cps2b, sin2b; |
75 | int *zbuf2; |
76 | const int *csptr; |
77 | |
78 | nmdct = nmdctTab[tabidx]; |
79 | zbuf2 = zbuf1 + nmdct - 1; |
80 | csptr = cos4sin4tab + cos4sin4tabOffset[tabidx]; |
81 | |
82 | /* whole thing should fit in registers - verify that compiler does this */ |
83 | for (i = nmdct >> 2; i != 0; i--) { |
84 | /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */ |
85 | cps2a = *csptr++; |
86 | sin2a = *csptr++; |
87 | cps2b = *csptr++; |
88 | sin2b = *csptr++; |
89 | |
90 | ar1 = *(zbuf1 + 0); |
91 | ai2 = *(zbuf1 + 1); |
92 | ai1 = *(zbuf2 + 0); |
93 | ar2 = *(zbuf2 - 1); |
94 | |
95 | /* gain 2 ints bit from MULSHIFT32 by Q30, but drop 7 or 10 int bits from table scaling of 1/M |
96 | * max per-sample gain (ignoring implicit scaling) = MAX(sin(angle)+cos(angle)) = 1.414 |
97 | * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from |
98 | * extra sign bits, and eat one in adding |
99 | */ |
100 | t = MULSHIFT32(sin2a, ar1 + ai1); |
101 | z2 = MULSHIFT32(cps2a, ai1) - t; |
102 | cms2 = cps2a - 2 * sin2a; |
103 | z1 = MULSHIFT32(cms2, ar1) + t; |
104 | *zbuf1++ = z1; /* cos*ar1 + sin*ai1 */ |
105 | *zbuf1++ = z2; /* cos*ai1 - sin*ar1 */ |
106 | |
107 | t = MULSHIFT32(sin2b, ar2 + ai2); |
108 | z2 = MULSHIFT32(cps2b, ai2) - t; |
109 | cms2 = cps2b - 2 * sin2b; |
110 | z1 = MULSHIFT32(cms2, ar2) + t; |
111 | *zbuf2-- = z2; /* cos*ai2 - sin*ar2 */ |
112 | *zbuf2-- = z1; /* cos*ar2 + sin*ai2 */ |
113 | } |
114 | } |
115 | |
116 | /************************************************************************************** |
117 | * Function: PostMultiply |
118 | * |
119 | * Description: post-twiddle stage of DCT4 |
120 | * |
121 | * Inputs: table index (for transform size) |
122 | * buffer of nmdct samples |
123 | * |
124 | * Outputs: processed samples in same buffer |
125 | * |
126 | * Return: none |
127 | * |
128 | * Notes: minimum 1 GB in, 2 GB out - gains 2 int bits |
129 | * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add |
130 | **************************************************************************************/ |
131 | static void PostMultiply(int tabidx, int *fft1) |
132 | { |
133 | int i, nmdct, ar1, ai1, ar2, ai2, skipFactor; |
134 | int t, cms2, cps2, sin2; |
135 | int *fft2; |
136 | const int *csptr; |
137 | |
138 | nmdct = nmdctTab[tabidx]; |
139 | csptr = cos1sin1tab; |
140 | skipFactor = postSkip[tabidx]; |
141 | fft2 = fft1 + nmdct - 1; |
142 | |
143 | /* load coeffs for first pass |
144 | * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) |
145 | */ |
146 | cps2 = *csptr++; |
147 | sin2 = *csptr; |
148 | csptr += skipFactor; |
149 | cms2 = cps2 - 2 * sin2; |
150 | |
151 | for (i = nmdct >> 2; i != 0; i--) { |
152 | ar1 = *(fft1 + 0); |
153 | ai1 = *(fft1 + 1); |
154 | ar2 = *(fft2 - 1); |
155 | ai2 = *(fft2 + 0); |
156 | |
157 | /* gain 2 ints bit from MULSHIFT32 by Q30 |
158 | * max per-sample gain = MAX(sin(angle)+cos(angle)) = 1.414 |
159 | * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from |
160 | * extra sign bits, and eat one in adding |
161 | */ |
162 | t = MULSHIFT32(sin2, ar1 + ai1); |
163 | *fft2-- = t - MULSHIFT32(cps2, ai1); /* sin*ar1 - cos*ai1 */ |
164 | *fft1++ = t + MULSHIFT32(cms2, ar1); /* cos*ar1 + sin*ai1 */ |
165 | cps2 = *csptr++; |
166 | sin2 = *csptr; |
167 | csptr += skipFactor; |
168 | |
169 | ai2 = -ai2; |
170 | t = MULSHIFT32(sin2, ar2 + ai2); |
171 | *fft2-- = t - MULSHIFT32(cps2, ai2); /* sin*ar1 - cos*ai1 */ |
172 | cms2 = cps2 - 2 * sin2; |
173 | *fft1++ = t + MULSHIFT32(cms2, ar2); /* cos*ar1 + sin*ai1 */ |
174 | } |
175 | } |
176 | |
177 | /************************************************************************************** |
178 | * Function: PreMultiplyRescale |
179 | * |
180 | * Description: pre-twiddle stage of DCT4, with rescaling for extra guard bits |
181 | * |
182 | * Inputs: table index (for transform size) |
183 | * buffer of nmdct samples |
184 | * number of guard bits to add to input before processing |
185 | * |
186 | * Outputs: processed samples in same buffer |
187 | * |
188 | * Return: none |
189 | * |
190 | * Notes: see notes on PreMultiply(), above |
191 | **************************************************************************************/ |
192 | static void PreMultiplyRescale(int tabidx, int *zbuf1, int es) |
193 | { |
194 | int i, nmdct, ar1, ai1, ar2, ai2, z1, z2; |
195 | int t, cms2, cps2a, sin2a, cps2b, sin2b; |
196 | int *zbuf2; |
197 | const int *csptr; |
198 | |
199 | nmdct = nmdctTab[tabidx]; |
200 | zbuf2 = zbuf1 + nmdct - 1; |
201 | csptr = cos4sin4tab + cos4sin4tabOffset[tabidx]; |
202 | |
203 | /* whole thing should fit in registers - verify that compiler does this */ |
204 | for (i = nmdct >> 2; i != 0; i--) { |
205 | /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */ |
206 | cps2a = *csptr++; |
207 | sin2a = *csptr++; |
208 | cps2b = *csptr++; |
209 | sin2b = *csptr++; |
210 | |
211 | ar1 = *(zbuf1 + 0) >> es; |
212 | ai1 = *(zbuf2 + 0) >> es; |
213 | ai2 = *(zbuf1 + 1) >> es; |
214 | |
215 | t = MULSHIFT32(sin2a, ar1 + ai1); |
216 | z2 = MULSHIFT32(cps2a, ai1) - t; |
217 | cms2 = cps2a - 2 * sin2a; |
218 | z1 = MULSHIFT32(cms2, ar1) + t; |
219 | *zbuf1++ = z1; |
220 | *zbuf1++ = z2; |
221 | |
222 | ar2 = *(zbuf2 - 1) >> es; /* do here to free up register used for es */ |
223 | |
224 | t = MULSHIFT32(sin2b, ar2 + ai2); |
225 | z2 = MULSHIFT32(cps2b, ai2) - t; |
226 | cms2 = cps2b - 2 * sin2b; |
227 | z1 = MULSHIFT32(cms2, ar2) + t; |
228 | *zbuf2-- = z2; |
229 | *zbuf2-- = z1; |
230 | |
231 | } |
232 | } |
233 | |
234 | /************************************************************************************** |
235 | * Function: PostMultiplyRescale |
236 | * |
237 | * Description: post-twiddle stage of DCT4, with rescaling for extra guard bits |
238 | * |
239 | * Inputs: table index (for transform size) |
240 | * buffer of nmdct samples |
241 | * number of guard bits to remove from output |
242 | * |
243 | * Outputs: processed samples in same buffer |
244 | * |
245 | * Return: none |
246 | * |
247 | * Notes: clips output to [-2^30, 2^30 - 1], guaranteeing at least 1 guard bit |
248 | * see notes on PostMultiply(), above |
249 | **************************************************************************************/ |
250 | static void PostMultiplyRescale(int tabidx, int *fft1, int es) |
251 | { |
252 | int i, nmdct, ar1, ai1, ar2, ai2, skipFactor, z; |
253 | int t, cs2, sin2; |
254 | int *fft2; |
255 | const int *csptr; |
256 | |
257 | nmdct = nmdctTab[tabidx]; |
258 | csptr = cos1sin1tab; |
259 | skipFactor = postSkip[tabidx]; |
260 | fft2 = fft1 + nmdct - 1; |
261 | |
262 | /* load coeffs for first pass |
263 | * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) |
264 | */ |
265 | cs2 = *csptr++; |
266 | sin2 = *csptr; |
267 | csptr += skipFactor; |
268 | |
269 | for (i = nmdct >> 2; i != 0; i--) { |
270 | ar1 = *(fft1 + 0); |
271 | ai1 = *(fft1 + 1); |
272 | ai2 = *(fft2 + 0); |
273 | |
274 | t = MULSHIFT32(sin2, ar1 + ai1); |
275 | z = t - MULSHIFT32(cs2, ai1); |
276 | CLIP_2N_SHIFT(z, es); |
277 | *fft2-- = z; |
278 | cs2 -= 2 * sin2; |
279 | z = t + MULSHIFT32(cs2, ar1); |
280 | CLIP_2N_SHIFT(z, es); |
281 | *fft1++ = z; |
282 | |
283 | cs2 = *csptr++; |
284 | sin2 = *csptr; |
285 | csptr += skipFactor; |
286 | |
287 | ar2 = *fft2; |
288 | ai2 = -ai2; |
289 | t = MULSHIFT32(sin2, ar2 + ai2); |
290 | z = t - MULSHIFT32(cs2, ai2); |
291 | CLIP_2N_SHIFT(z, es); |
292 | *fft2-- = z; |
293 | cs2 -= 2 * sin2; |
294 | z = t + MULSHIFT32(cs2, ar2); |
295 | CLIP_2N_SHIFT(z, es); |
296 | *fft1++ = z; |
297 | cs2 += 2 * sin2; |
298 | } |
299 | } |
300 | |
301 | /************************************************************************************** |
302 | * Function: DCT4 |
303 | * |
304 | * Description: type-IV DCT |
305 | * |
306 | * Inputs: table index (for transform size) |
307 | * buffer of nmdct samples |
308 | * number of guard bits in the input buffer |
309 | * |
310 | * Outputs: processed samples in same buffer |
311 | * |
312 | * Return: none |
313 | * |
314 | * Notes: operates in-place |
315 | * if number of guard bits in input is < GBITS_IN_DCT4, the input is |
316 | * scaled (>>) before the DCT4 and rescaled (<<, with clipping) after |
317 | * the DCT4 (rare) |
318 | * the output has FBITS_LOST_DCT4 fewer fraction bits than the input |
319 | * the output will always have at least 1 guard bit (GBITS_IN_DCT4 >= 4) |
320 | * int bits gained per stage (PreMul + FFT + PostMul) |
321 | * short blocks = (-5 + 4 + 2) = 1 total |
322 | * long blocks = (-8 + 7 + 2) = 1 total |
323 | **************************************************************************************/ |
324 | void DCT4(int tabidx, int *coef, int gb) |
325 | { |
326 | int es; |
327 | |
328 | /* fast in-place DCT-IV - adds guard bits if necessary */ |
329 | if (gb < GBITS_IN_DCT4) { |
330 | es = GBITS_IN_DCT4 - gb; |
331 | PreMultiplyRescale(tabidx, coef, es); |
332 | R4FFT(tabidx, coef); |
333 | PostMultiplyRescale(tabidx, coef, es); |
334 | } else { |
335 | PreMultiply(tabidx, coef); |
336 | R4FFT(tabidx, coef); |
337 | PostMultiply(tabidx, coef); |
338 | } |
339 | } |
340 |