blob: 3182479f56e4e66fb726841a2e44899743ce30c6
1 | /* ***** BEGIN LICENSE BLOCK ***** |
2 | * Source last modified: $Id: dct4.c,v 1.1.2.1 2005/02/26 02:05:12 jrecker Exp $ |
3 | * |
4 | * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved. |
5 | * |
6 | * The contents of this file, and the files included with this file, |
7 | * are subject to the current version of the RealNetworks Public |
8 | * Source License (the "RPSL") available at |
9 | * http://www.helixcommunity.org/content/rpsl unless you have licensed |
10 | * the file under the current version of the RealNetworks Community |
11 | * Source License (the "RCSL") available at |
12 | * http://www.helixcommunity.org/content/rcsl, in which case the RCSL |
13 | * will apply. You may also obtain the license terms directly from |
14 | * RealNetworks. You may not use this file except in compliance with |
15 | * the RPSL or, if you have a valid RCSL with RealNetworks applicable |
16 | * to this file, the RCSL. Please see the applicable RPSL or RCSL for |
17 | * the rights, obligations and limitations governing use of the |
18 | * contents of the file. |
19 | * |
20 | * This file is part of the Helix DNA Technology. RealNetworks is the |
21 | * developer of the Original Code and owns the copyrights in the |
22 | * portions it created. |
23 | * |
24 | * This file, and the files included with this file, is distributed |
25 | * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY |
26 | * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS |
27 | * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES |
28 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET |
29 | * ENJOYMENT OR NON-INFRINGEMENT. |
30 | * |
31 | * Technology Compatibility Kit Test Suite(s) Location: |
32 | * http://www.helixcommunity.org/content/tck |
33 | * |
34 | * Contributor(s): |
35 | * |
36 | * ***** END LICENSE BLOCK ***** */ |
37 | |
38 | /************************************************************************************** |
39 | * Fixed-point HE-AAC decoder |
40 | * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com) |
41 | * February 2005 |
42 | * |
43 | * dct4.c - optimized DCT-IV |
44 | **************************************************************************************/ |
45 | |
46 | #include "coder.h" |
47 | #include "assembly.h" |
48 | |
49 | static const int nmdctTab[NUM_IMDCT_SIZES] = {128, 1024}; |
50 | static const int postSkip[NUM_IMDCT_SIZES] = {15, 1}; |
51 | |
52 | /************************************************************************************** |
53 | * Function: PreMultiply |
54 | * |
55 | * Description: pre-twiddle stage of DCT4 |
56 | * |
57 | * Inputs: table index (for transform size) |
58 | * buffer of nmdct samples |
59 | * |
60 | * Outputs: processed samples in same buffer |
61 | * |
62 | * Return: none |
63 | * |
64 | * Notes: minimum 1 GB in, 2 GB out, gains 5 (short) or 8 (long) frac bits |
65 | * i.e. gains 2-7= -5 int bits (short) or 2-10 = -8 int bits (long) |
66 | * normalization by -1/N is rolled into tables here (see trigtabs.c) |
67 | * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add |
68 | **************************************************************************************/ |
69 | static void PreMultiply(int tabidx, int *zbuf1) |
70 | { |
71 | int i, nmdct, ar1, ai1, ar2, ai2, z1, z2; |
72 | int t, cms2, cps2a, sin2a, cps2b, sin2b; |
73 | int *zbuf2; |
74 | const int *csptr; |
75 | |
76 | nmdct = nmdctTab[tabidx]; |
77 | zbuf2 = zbuf1 + nmdct - 1; |
78 | csptr = cos4sin4tab + cos4sin4tabOffset[tabidx]; |
79 | |
80 | /* whole thing should fit in registers - verify that compiler does this */ |
81 | for (i = nmdct >> 2; i != 0; i--) { |
82 | /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */ |
83 | cps2a = *csptr++; |
84 | sin2a = *csptr++; |
85 | cps2b = *csptr++; |
86 | sin2b = *csptr++; |
87 | |
88 | ar1 = *(zbuf1 + 0); |
89 | ai2 = *(zbuf1 + 1); |
90 | ai1 = *(zbuf2 + 0); |
91 | ar2 = *(zbuf2 - 1); |
92 | |
93 | /* gain 2 ints bit from MULSHIFT32 by Q30, but drop 7 or 10 int bits from table scaling of 1/M |
94 | * max per-sample gain (ignoring implicit scaling) = MAX(sin(angle)+cos(angle)) = 1.414 |
95 | * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from |
96 | * extra sign bits, and eat one in adding |
97 | */ |
98 | t = MULSHIFT32(sin2a, ar1 + ai1); |
99 | z2 = MULSHIFT32(cps2a, ai1) - t; |
100 | cms2 = cps2a - 2 * sin2a; |
101 | z1 = MULSHIFT32(cms2, ar1) + t; |
102 | *zbuf1++ = z1; /* cos*ar1 + sin*ai1 */ |
103 | *zbuf1++ = z2; /* cos*ai1 - sin*ar1 */ |
104 | |
105 | t = MULSHIFT32(sin2b, ar2 + ai2); |
106 | z2 = MULSHIFT32(cps2b, ai2) - t; |
107 | cms2 = cps2b - 2 * sin2b; |
108 | z1 = MULSHIFT32(cms2, ar2) + t; |
109 | *zbuf2-- = z2; /* cos*ai2 - sin*ar2 */ |
110 | *zbuf2-- = z1; /* cos*ar2 + sin*ai2 */ |
111 | } |
112 | } |
113 | |
114 | /************************************************************************************** |
115 | * Function: PostMultiply |
116 | * |
117 | * Description: post-twiddle stage of DCT4 |
118 | * |
119 | * Inputs: table index (for transform size) |
120 | * buffer of nmdct samples |
121 | * |
122 | * Outputs: processed samples in same buffer |
123 | * |
124 | * Return: none |
125 | * |
126 | * Notes: minimum 1 GB in, 2 GB out - gains 2 int bits |
127 | * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add |
128 | **************************************************************************************/ |
129 | static void PostMultiply(int tabidx, int *fft1) |
130 | { |
131 | int i, nmdct, ar1, ai1, ar2, ai2, skipFactor; |
132 | int t, cms2, cps2, sin2; |
133 | int *fft2; |
134 | const int *csptr; |
135 | |
136 | nmdct = nmdctTab[tabidx]; |
137 | csptr = cos1sin1tab; |
138 | skipFactor = postSkip[tabidx]; |
139 | fft2 = fft1 + nmdct - 1; |
140 | |
141 | /* load coeffs for first pass |
142 | * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) |
143 | */ |
144 | cps2 = *csptr++; |
145 | sin2 = *csptr; |
146 | csptr += skipFactor; |
147 | cms2 = cps2 - 2 * sin2; |
148 | |
149 | for (i = nmdct >> 2; i != 0; i--) { |
150 | ar1 = *(fft1 + 0); |
151 | ai1 = *(fft1 + 1); |
152 | ar2 = *(fft2 - 1); |
153 | ai2 = *(fft2 + 0); |
154 | |
155 | /* gain 2 ints bit from MULSHIFT32 by Q30 |
156 | * max per-sample gain = MAX(sin(angle)+cos(angle)) = 1.414 |
157 | * i.e. gain 1 GB since worst case is sin(angle) = cos(angle) = 0.707 (Q30), gain 2 from |
158 | * extra sign bits, and eat one in adding |
159 | */ |
160 | t = MULSHIFT32(sin2, ar1 + ai1); |
161 | *fft2-- = t - MULSHIFT32(cps2, ai1); /* sin*ar1 - cos*ai1 */ |
162 | *fft1++ = t + MULSHIFT32(cms2, ar1); /* cos*ar1 + sin*ai1 */ |
163 | cps2 = *csptr++; |
164 | sin2 = *csptr; |
165 | csptr += skipFactor; |
166 | |
167 | ai2 = -ai2; |
168 | t = MULSHIFT32(sin2, ar2 + ai2); |
169 | *fft2-- = t - MULSHIFT32(cps2, ai2); /* sin*ar1 - cos*ai1 */ |
170 | cms2 = cps2 - 2 * sin2; |
171 | *fft1++ = t + MULSHIFT32(cms2, ar2); /* cos*ar1 + sin*ai1 */ |
172 | } |
173 | } |
174 | |
175 | /************************************************************************************** |
176 | * Function: PreMultiplyRescale |
177 | * |
178 | * Description: pre-twiddle stage of DCT4, with rescaling for extra guard bits |
179 | * |
180 | * Inputs: table index (for transform size) |
181 | * buffer of nmdct samples |
182 | * number of guard bits to add to input before processing |
183 | * |
184 | * Outputs: processed samples in same buffer |
185 | * |
186 | * Return: none |
187 | * |
188 | * Notes: see notes on PreMultiply(), above |
189 | **************************************************************************************/ |
190 | static void PreMultiplyRescale(int tabidx, int *zbuf1, int es) |
191 | { |
192 | int i, nmdct, ar1, ai1, ar2, ai2, z1, z2; |
193 | int t, cms2, cps2a, sin2a, cps2b, sin2b; |
194 | int *zbuf2; |
195 | const int *csptr; |
196 | |
197 | nmdct = nmdctTab[tabidx]; |
198 | zbuf2 = zbuf1 + nmdct - 1; |
199 | csptr = cos4sin4tab + cos4sin4tabOffset[tabidx]; |
200 | |
201 | /* whole thing should fit in registers - verify that compiler does this */ |
202 | for (i = nmdct >> 2; i != 0; i--) { |
203 | /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */ |
204 | cps2a = *csptr++; |
205 | sin2a = *csptr++; |
206 | cps2b = *csptr++; |
207 | sin2b = *csptr++; |
208 | |
209 | ar1 = *(zbuf1 + 0) >> es; |
210 | ai1 = *(zbuf2 + 0) >> es; |
211 | ai2 = *(zbuf1 + 1) >> es; |
212 | |
213 | t = MULSHIFT32(sin2a, ar1 + ai1); |
214 | z2 = MULSHIFT32(cps2a, ai1) - t; |
215 | cms2 = cps2a - 2 * sin2a; |
216 | z1 = MULSHIFT32(cms2, ar1) + t; |
217 | *zbuf1++ = z1; |
218 | *zbuf1++ = z2; |
219 | |
220 | ar2 = *(zbuf2 - 1) >> es; /* do here to free up register used for es */ |
221 | |
222 | t = MULSHIFT32(sin2b, ar2 + ai2); |
223 | z2 = MULSHIFT32(cps2b, ai2) - t; |
224 | cms2 = cps2b - 2 * sin2b; |
225 | z1 = MULSHIFT32(cms2, ar2) + t; |
226 | *zbuf2-- = z2; |
227 | *zbuf2-- = z1; |
228 | |
229 | } |
230 | } |
231 | |
232 | /************************************************************************************** |
233 | * Function: PostMultiplyRescale |
234 | * |
235 | * Description: post-twiddle stage of DCT4, with rescaling for extra guard bits |
236 | * |
237 | * Inputs: table index (for transform size) |
238 | * buffer of nmdct samples |
239 | * number of guard bits to remove from output |
240 | * |
241 | * Outputs: processed samples in same buffer |
242 | * |
243 | * Return: none |
244 | * |
245 | * Notes: clips output to [-2^30, 2^30 - 1], guaranteeing at least 1 guard bit |
246 | * see notes on PostMultiply(), above |
247 | **************************************************************************************/ |
248 | static void PostMultiplyRescale(int tabidx, int *fft1, int es) |
249 | { |
250 | int i, nmdct, ar1, ai1, ar2, ai2, skipFactor, z; |
251 | int t, cs2, sin2; |
252 | int *fft2; |
253 | const int *csptr; |
254 | |
255 | nmdct = nmdctTab[tabidx]; |
256 | csptr = cos1sin1tab; |
257 | skipFactor = postSkip[tabidx]; |
258 | fft2 = fft1 + nmdct - 1; |
259 | |
260 | /* load coeffs for first pass |
261 | * cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) |
262 | */ |
263 | cs2 = *csptr++; |
264 | sin2 = *csptr; |
265 | csptr += skipFactor; |
266 | |
267 | for (i = nmdct >> 2; i != 0; i--) { |
268 | ar1 = *(fft1 + 0); |
269 | ai1 = *(fft1 + 1); |
270 | ai2 = *(fft2 + 0); |
271 | |
272 | t = MULSHIFT32(sin2, ar1 + ai1); |
273 | z = t - MULSHIFT32(cs2, ai1); |
274 | CLIP_2N_SHIFT(z, es); |
275 | *fft2-- = z; |
276 | cs2 -= 2 * sin2; |
277 | z = t + MULSHIFT32(cs2, ar1); |
278 | CLIP_2N_SHIFT(z, es); |
279 | *fft1++ = z; |
280 | |
281 | cs2 = *csptr++; |
282 | sin2 = *csptr; |
283 | csptr += skipFactor; |
284 | |
285 | ar2 = *fft2; |
286 | ai2 = -ai2; |
287 | t = MULSHIFT32(sin2, ar2 + ai2); |
288 | z = t - MULSHIFT32(cs2, ai2); |
289 | CLIP_2N_SHIFT(z, es); |
290 | *fft2-- = z; |
291 | cs2 -= 2 * sin2; |
292 | z = t + MULSHIFT32(cs2, ar2); |
293 | CLIP_2N_SHIFT(z, es); |
294 | *fft1++ = z; |
295 | cs2 += 2 * sin2; |
296 | } |
297 | } |
298 | |
299 | /************************************************************************************** |
300 | * Function: DCT4 |
301 | * |
302 | * Description: type-IV DCT |
303 | * |
304 | * Inputs: table index (for transform size) |
305 | * buffer of nmdct samples |
306 | * number of guard bits in the input buffer |
307 | * |
308 | * Outputs: processed samples in same buffer |
309 | * |
310 | * Return: none |
311 | * |
312 | * Notes: operates in-place |
313 | * if number of guard bits in input is < GBITS_IN_DCT4, the input is |
314 | * scaled (>>) before the DCT4 and rescaled (<<, with clipping) after |
315 | * the DCT4 (rare) |
316 | * the output has FBITS_LOST_DCT4 fewer fraction bits than the input |
317 | * the output will always have at least 1 guard bit (GBITS_IN_DCT4 >= 4) |
318 | * int bits gained per stage (PreMul + FFT + PostMul) |
319 | * short blocks = (-5 + 4 + 2) = 1 total |
320 | * long blocks = (-8 + 7 + 2) = 1 total |
321 | **************************************************************************************/ |
322 | void DCT4(int tabidx, int *coef, int gb) |
323 | { |
324 | int es; |
325 | |
326 | /* fast in-place DCT-IV - adds guard bits if necessary */ |
327 | if (gb < GBITS_IN_DCT4) { |
328 | es = GBITS_IN_DCT4 - gb; |
329 | PreMultiplyRescale(tabidx, coef, es); |
330 | R4FFT(tabidx, coef); |
331 | PostMultiplyRescale(tabidx, coef, es); |
332 | } else { |
333 | PreMultiply(tabidx, coef); |
334 | R4FFT(tabidx, coef); |
335 | PostMultiply(tabidx, coef); |
336 | } |
337 | } |
338 |