blob: 6947ca1c508c5a579daf2c62405b44eecde5c74b
1 | /* ***** BEGIN LICENSE BLOCK ***** |
2 | * Source last modified: $Id: mlt.c,v 1.5 2005/04/27 19:20:50 hubbe Exp $ |
3 | * |
4 | * REALNETWORKS CONFIDENTIAL--NOT FOR DISTRIBUTION IN SOURCE CODE FORM |
5 | * Portions Copyright (c) 1995-2002 RealNetworks, Inc. |
6 | * All Rights Reserved. |
7 | * |
8 | * The contents of this file, and the files included with this file, |
9 | * are subject to the current version of the Real Format Source Code |
10 | * Porting and Optimization License, available at |
11 | * https://helixcommunity.org/2005/license/realformatsource (unless |
12 | * RealNetworks otherwise expressly agrees in writing that you are |
13 | * subject to a different license). You may also obtain the license |
14 | * terms directly from RealNetworks. You may not use this file except |
15 | * in compliance with the Real Format Source Code Porting and |
16 | * Optimization License. There are no redistribution rights for the |
17 | * source code of this file. Please see the Real Format Source Code |
18 | * Porting and Optimization License for the rights, obligations and |
19 | * limitations governing use of the contents of the file. |
20 | * |
21 | * RealNetworks is the developer of the Original Code and owns the |
22 | * copyrights in the portions it created. |
23 | * |
24 | * This file, and the files included with this file, is distributed and |
25 | * made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, |
26 | * EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL |
27 | * SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF |
28 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT |
29 | * OR NON-INFRINGEMENT. |
30 | * |
31 | * Technology Compatibility Kit Test Suite(s) Location: |
32 | * https://rarvcode-tck.helixcommunity.org |
33 | * |
34 | * Contributor(s): |
35 | * |
36 | * ***** END LICENSE BLOCK ***** */ |
37 | |
38 | /************************************************************************************** |
39 | * Fixed-point RealAudio 8 decoder |
40 | * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com) |
41 | * October 2003 |
42 | * |
43 | * mlt.c - fast MLT/IMLT functions |
44 | **************************************************************************************/ |
45 | |
46 | #include "coder.h" |
47 | #include "assembly.h" |
48 | |
49 | static const int postSkip[NUM_MLT_SIZES] = {7, 3, 1}; |
50 | |
51 | /************************************************************************************** |
52 | * Function: PreMultiply |
53 | * |
54 | * Description: pre-twiddle stage of MDCT |
55 | * |
56 | * Inputs: table index (for transform size) |
57 | * buffer of nmlt samples |
58 | * |
59 | * Outputs: processed samples in same buffer |
60 | * |
61 | * Return: none |
62 | * |
63 | * Notes: minimum 1 GB in, 2 GB out - loses (1+tabidx) int bits |
64 | * normalization by 2/sqrt(N) is rolled into tables here |
65 | * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add |
66 | * should asm code (compiler not doing free pointer updates, etc.) |
67 | **************************************************************************************/ |
68 | static void PreMultiply(int tabidx, int *zbuf1) |
69 | { |
70 | int i, nmlt, ar1, ai1, ar2, ai2, z1, z2; |
71 | int t, cms2, cps2a, sin2a, cps2b, sin2b; |
72 | int *zbuf2; |
73 | const int *csptr; |
74 | |
75 | nmlt = nmltTab[tabidx]; |
76 | zbuf2 = zbuf1 + nmlt - 1; |
77 | csptr = cos4sin4tab + cos4sin4tabOffset[tabidx]; |
78 | |
79 | /* whole thing should fit in registers - verify that compiler does this */ |
80 | for (i = nmlt >> 2; i != 0; i--) { |
81 | /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */ |
82 | cps2a = *csptr++; |
83 | sin2a = *csptr++; |
84 | cps2b = *csptr++; |
85 | sin2b = *csptr++; |
86 | |
87 | ar1 = *(zbuf1 + 0); |
88 | ai2 = *(zbuf1 + 1); |
89 | ai1 = *(zbuf2 + 0); |
90 | ar2 = *(zbuf2 - 1); |
91 | |
92 | /* gain 1 int bit from MULSHIFT32, but drop 2, 3, or 4 int bits from table scaling */ |
93 | t = MULSHIFT32(sin2a, ar1 + ai1); |
94 | z2 = MULSHIFT32(cps2a, ai1) - t; |
95 | cms2 = cps2a - 2 * sin2a; |
96 | z1 = MULSHIFT32(cms2, ar1) + t; |
97 | *zbuf1++ = z1; |
98 | *zbuf1++ = z2; |
99 | |
100 | t = MULSHIFT32(sin2b, ar2 + ai2); |
101 | z2 = MULSHIFT32(cps2b, ai2) - t; |
102 | cms2 = cps2b - 2 * sin2b; |
103 | z1 = MULSHIFT32(cms2, ar2) + t; |
104 | *zbuf2-- = z2; |
105 | *zbuf2-- = z1; |
106 | |
107 | } |
108 | |
109 | /* Note on scaling... |
110 | * assumes 1 guard bit in, gains (1 + tabidx) fraction bits |
111 | * i.e. gain 1, 2, or 3 fraction bits, for nSamps = 256, 512, 1024 |
112 | * (left-shifting, since table scaled by 2 / sqrt(nSamps)) |
113 | * this offsets the fact that each extra pass in FFT gains one more int bit |
114 | */ |
115 | return; |
116 | } |
117 | |
118 | /************************************************************************************** |
119 | * Function: PostMultiply |
120 | * |
121 | * Description: post-twiddle stage of MDCT |
122 | * |
123 | * Inputs: table index (for transform size) |
124 | * buffer of nmlt samples |
125 | * |
126 | * Outputs: processed samples in same buffer |
127 | * |
128 | * Return: none |
129 | * |
130 | * Notes: minimum 1 GB in, 2 GB out - gains 1 int bit |
131 | * uses 3-mul, 3-add butterflies instead of 4-mul, 2-add |
132 | * should asm code (compiler not doing free pointer updates, etc.) |
133 | **************************************************************************************/ |
134 | static void PostMultiply(int tabidx, int *fft1) |
135 | { |
136 | int i, nmlt, ar1, ai1, ar2, ai2, skipFactor; |
137 | int t, cms2, cps2, sin2; |
138 | int *fft2; |
139 | const int *csptr; |
140 | |
141 | nmlt = nmltTab[tabidx]; |
142 | csptr = cos1sin1tab; |
143 | skipFactor = postSkip[tabidx]; |
144 | fft2 = fft1 + nmlt - 1; |
145 | |
146 | /* load coeffs for first pass |
147 | * cps2 = (cos+sin)/2, sin2 = sin/2, cms2 = (cos-sin)/2 |
148 | */ |
149 | cps2 = *csptr++; |
150 | sin2 = *csptr; |
151 | csptr += skipFactor; |
152 | cms2 = cps2 - 2 * sin2; |
153 | |
154 | for (i = nmlt >> 2; i != 0; i--) { |
155 | ar1 = *(fft1 + 0); |
156 | ai1 = *(fft1 + 1); |
157 | ar2 = *(fft2 - 1); |
158 | ai2 = *(fft2 + 0); |
159 | |
160 | /* gain 1 int bit from MULSHIFT32, and one since coeffs are stored as 0.5 * (cos+sin), 0.5*sin */ |
161 | t = MULSHIFT32(sin2, ar1 + ai1); |
162 | *fft2-- = t - MULSHIFT32(cps2, ai1); |
163 | *fft1++ = t + MULSHIFT32(cms2, ar1); |
164 | |
165 | cps2 = *csptr++; |
166 | sin2 = *csptr; |
167 | csptr += skipFactor; |
168 | |
169 | ai2 = -ai2; |
170 | t = MULSHIFT32(sin2, ar2 + ai2); |
171 | *fft2-- = t - MULSHIFT32(cps2, ai2); |
172 | cms2 = cps2 - 2 * sin2; |
173 | *fft1++ = t + MULSHIFT32(cms2, ar2); |
174 | |
175 | } |
176 | |
177 | /* Note on scaling... |
178 | * assumes 1 guard bit in, gains 2 int bits |
179 | * max gain of abs(cos) + abs(sin) = sqrt(2) = 1.414, so current format |
180 | * guarantees 1 guard bit in output |
181 | */ |
182 | return; |
183 | } |
184 | |
185 | /************************************************************************************** |
186 | * Function: PreMultiplyRescale |
187 | * |
188 | * Description: pre-twiddle stage of MDCT, with rescaling for extra guard bits |
189 | * |
190 | * Inputs: table index (for transform size) |
191 | * buffer of nmlt samples |
192 | * number of guard bits to add to input before processing |
193 | * |
194 | * Outputs: processed samples in same buffer |
195 | * |
196 | * Return: none |
197 | * |
198 | * Notes: see notes on PreMultiply(), above |
199 | **************************************************************************************/ |
200 | static void PreMultiplyRescale(int tabidx, int *zbuf1, int es) |
201 | { |
202 | int i, nmlt, ar1, ai1, ar2, ai2, z1, z2; |
203 | int t, cms2, cps2a, sin2a, cps2b, sin2b; |
204 | int *zbuf2; |
205 | const int *csptr; |
206 | |
207 | nmlt = nmltTab[tabidx]; |
208 | zbuf2 = zbuf1 + nmlt - 1; |
209 | csptr = cos4sin4tab + cos4sin4tabOffset[tabidx]; |
210 | |
211 | /* whole thing should fit in registers - verify that compiler does this */ |
212 | for (i = nmlt >> 2; i != 0; i--) { |
213 | /* cps2 = (cos+sin), sin2 = sin, cms2 = (cos-sin) */ |
214 | cps2a = *csptr++; |
215 | sin2a = *csptr++; |
216 | cps2b = *csptr++; |
217 | sin2b = *csptr++; |
218 | |
219 | ar1 = *(zbuf1 + 0) >> es; |
220 | ai1 = *(zbuf2 + 0) >> es; |
221 | ai2 = *(zbuf1 + 1) >> es; |
222 | |
223 | /* gain 1 int bit from MULSHIFT32, but drop 2, 3, or 4 int bits from table scaling */ |
224 | t = MULSHIFT32(sin2a, ar1 + ai1); |
225 | z2 = MULSHIFT32(cps2a, ai1) - t; |
226 | cms2 = cps2a - 2 * sin2a; |
227 | z1 = MULSHIFT32(cms2, ar1) + t; |
228 | *zbuf1++ = z1; |
229 | *zbuf1++ = z2; |
230 | |
231 | ar2 = *(zbuf2 - 1) >> es; /* do here to free up register used for es */ |
232 | |
233 | t = MULSHIFT32(sin2b, ar2 + ai2); |
234 | z2 = MULSHIFT32(cps2b, ai2) - t; |
235 | cms2 = cps2b - 2 * sin2b; |
236 | z1 = MULSHIFT32(cms2, ar2) + t; |
237 | *zbuf2-- = z2; |
238 | *zbuf2-- = z1; |
239 | |
240 | } |
241 | |
242 | /* see comments in PreMultiply() for notes on scaling */ |
243 | return; |
244 | } |
245 | |
246 | /************************************************************************************** |
247 | * Function: PostMultiplyRescale |
248 | * |
249 | * Description: post-twiddle stage of MDCT, with rescaling for extra guard bits |
250 | * |
251 | * Inputs: table index (for transform size) |
252 | * buffer of nmlt samples |
253 | * number of guard bits to remove from output |
254 | * |
255 | * Outputs: processed samples in same buffer |
256 | * |
257 | * Return: none |
258 | * |
259 | * Notes: clips output to [-2^30, 2^30 - 1], guaranteeing at least 1 guard bit |
260 | * see notes on PostMultiply(), above |
261 | **************************************************************************************/ |
262 | static void PostMultiplyRescale(int tabidx, int *fft1, int es) |
263 | { |
264 | int i, nmlt, ar1, ai1, ar2, ai2, skipFactor, z; |
265 | int t, cs2, sin2; |
266 | int *fft2; |
267 | const int *csptr; |
268 | |
269 | nmlt = nmltTab[tabidx]; |
270 | csptr = cos1sin1tab; |
271 | skipFactor = postSkip[tabidx]; |
272 | fft2 = fft1 + nmlt - 1; |
273 | |
274 | /* load coeffs for first pass |
275 | * cps2 = (cos+sin)/2, sin2 = sin/2, cms2 = (cos-sin)/2 |
276 | */ |
277 | cs2 = *csptr++; |
278 | sin2 = *csptr; |
279 | csptr += skipFactor; |
280 | |
281 | for (i = nmlt >> 2; i != 0; i--) { |
282 | ar1 = *(fft1 + 0); |
283 | ai1 = *(fft1 + 1); |
284 | ai2 = *(fft2 + 0); |
285 | |
286 | /* gain 1 int bit from MULSHIFT32, and one since coeffs are stored as 0.5 * (cos+sin), 0.5*sin */ |
287 | t = MULSHIFT32(sin2, ar1 + ai1); |
288 | z = t - MULSHIFT32(cs2, ai1); |
289 | CLIP_2N_SHIFT(z, es); |
290 | *fft2-- = z; |
291 | cs2 -= 2 * sin2; |
292 | z = t + MULSHIFT32(cs2, ar1); |
293 | CLIP_2N_SHIFT(z, es); |
294 | *fft1++ = z; |
295 | |
296 | cs2 = *csptr++; |
297 | sin2 = *csptr; |
298 | csptr += skipFactor; |
299 | |
300 | ar2 = *fft2; |
301 | ai2 = -ai2; |
302 | t = MULSHIFT32(sin2, ar2 + ai2); |
303 | z = t - MULSHIFT32(cs2, ai2); |
304 | CLIP_2N_SHIFT(z, es); |
305 | *fft2-- = z; |
306 | cs2 -= 2 * sin2; |
307 | z = t + MULSHIFT32(cs2, ar2); |
308 | CLIP_2N_SHIFT(z, es); |
309 | *fft1++ = z; |
310 | cs2 += 2 * sin2; |
311 | |
312 | } |
313 | |
314 | /* see comments in PostMultiply() for notes on scaling */ |
315 | return; |
316 | } |
317 | |
318 | /************************************************************************************** |
319 | * Function: IMLTNoWindow |
320 | * |
321 | * Description: inverse MLT without window or overlap-add |
322 | * |
323 | * Inputs: table index (for transform size) |
324 | * buffer of nmlt samples |
325 | * number of guard bits in the input buffer |
326 | * |
327 | * Outputs: processed samples in same buffer |
328 | * |
329 | * Return: none |
330 | * |
331 | * Notes: operates in-place, and generates nmlt output samples from nmlt input |
332 | * samples (doesn't do synthesis window which expands to 2*nmlt samples) |
333 | * if number of guard bits in input is < GBITS_IN_IMLT, the input is |
334 | * scaled (>>) before the IMLT and rescaled (<<, with clipping) after |
335 | * the IMLT (rare) |
336 | * the output has FBITS_LOST_IMLT fewer fraction bits than the input |
337 | * the output will always have at least 1 guard bit |
338 | **************************************************************************************/ |
339 | void IMLTNoWindow(int tabidx, int *mlt, int gb) |
340 | { |
341 | int es; |
342 | |
343 | /* fast in-place DCT-IV - adds guard bits if necessary */ |
344 | if (gb < GBITS_IN_IMLT) { |
345 | es = GBITS_IN_IMLT - gb; |
346 | PreMultiplyRescale(tabidx, mlt, es); |
347 | R4FFT(tabidx, mlt); |
348 | PostMultiplyRescale(tabidx, mlt, es); |
349 | } else { |
350 | PreMultiply(tabidx, mlt); |
351 | R4FFT(tabidx, mlt); |
352 | PostMultiply(tabidx, mlt); |
353 | } |
354 | |
355 | return; |
356 | } |
357 | |
358 |