blob: a0f8095d4e8325f231868bc251b971e2bb557728
1 | /* ***** BEGIN LICENSE BLOCK ***** |
2 | * Source last modified: $Id: assembly.h,v 1.9 2007/02/28 07:10:21 gahluwalia Exp $ |
3 | * |
4 | * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved. |
5 | * |
6 | * The contents of this file, and the files included with this file, |
7 | * are subject to the current version of the RealNetworks Public |
8 | * Source License (the "RPSL") available at |
9 | * http://www.helixcommunity.org/content/rpsl unless you have licensed |
10 | * the file under the current version of the RealNetworks Community |
11 | * Source License (the "RCSL") available at |
12 | * http://www.helixcommunity.org/content/rcsl, in which case the RCSL |
13 | * will apply. You may also obtain the license terms directly from |
14 | * RealNetworks. You may not use this file except in compliance with |
15 | * the RPSL or, if you have a valid RCSL with RealNetworks applicable |
16 | * to this file, the RCSL. Please see the applicable RPSL or RCSL for |
17 | * the rights, obligations and limitations governing use of the |
18 | * contents of the file. |
19 | * |
20 | * This file is part of the Helix DNA Technology. RealNetworks is the |
21 | * developer of the Original Code and owns the copyrights in the |
22 | * portions it created. |
23 | * |
24 | * This file, and the files included with this file, is distributed |
25 | * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY |
26 | * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS |
27 | * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES |
28 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET |
29 | * ENJOYMENT OR NON-INFRINGEMENT. |
30 | * |
31 | * Technology Compatibility Kit Test Suite(s) Location: |
32 | * http://www.helixcommunity.org/content/tck |
33 | * |
34 | * Contributor(s): |
35 | * |
36 | * ***** END LICENSE BLOCK ***** */ |
37 | |
38 | /************************************************************************************** |
39 | * Fixed-point HE-AAC decoder |
40 | * Jon Recker (jrecker@real.com) |
41 | * February 2005 |
42 | * |
43 | * assembly.h - inline assembly language functions and prototypes |
44 | * |
45 | * MULSHIFT32(x, y) signed multiply of two 32-bit integers (x and y), |
46 | * returns top 32-bits of 64-bit result |
47 | * CLIPTOSHORT(x) convert 32-bit integer to 16-bit short, |
48 | * clipping to [-32768, 32767] |
49 | * FASTABS(x) branchless absolute value of signed integer x |
50 | * CLZ(x) count leading zeros on signed integer x |
51 | * MADD64(sum64, x, y) 64-bit multiply accumulate: sum64 += (x*y) |
52 | **************************************************************************************/ |
53 | |
54 | #ifndef _ASSEMBLY_H |
55 | #define _ASSEMBLY_H |
56 | |
57 | /* toolchain: MSFT Visual C++ |
58 | * target architecture: x86 |
59 | */ |
60 | #if (defined (_WIN32) && !defined (_WIN32_WCE)) || (defined (__WINS__) && defined (_SYMBIAN)) || (defined (WINCE_EMULATOR)) || (defined (_OPENWAVE_SIMULATOR)) |
61 | |
62 | #pragma warning( disable : 4035 ) /* complains about inline asm not returning a value */ |
63 | |
/*
 * MULSHIFT32 (x86 inline-asm version): signed 32x32 multiply that yields the
 * upper 32 bits of the 64-bit product.
 *
 * x is loaded into eax and multiplied by y with the one-operand imul; edx
 * (the upper half of the product) is then copied into eax.  The function has
 * no C return statement: whatever the asm block leaves in eax is the result,
 * which is why warning 4035 is disabled earlier in this section.
 */
64 | static __inline int MULSHIFT32(int x, int y) |
65 | { |
66 | __asm { |
67 | mov eax, x |
68 | imul y |
69 | mov eax, edx |
70 | } |
71 | } |
72 | |
/*
 * CLIPTOSHORT - saturate a 32-bit integer to the 16-bit range [-32768, 32767].
 * Relies on >> of a negative int being an arithmetic shift, as the rest of
 * this header does.
 */
static __inline short CLIPTOSHORT(int x)
{
    /* all ones when x is negative, zero otherwise */
    const int signMask = x >> 31;

    /* x already fits when bits 15..31 are all copies of the sign bit */
    if ((x >> 15) == signMask) {
        return (short)x;
    }

    /* saturate: 0x7fff on positive overflow, -32768 on negative overflow */
    return (short)(signMask ^ 0x7fff);
}
85 | |
/*
 * FASTABS - branchless absolute value of a signed integer.
 *
 * The xor/subtract trick is carried out on unsigned values so that the
 * most negative input does not trigger signed-overflow undefined behaviour;
 * that input is returned unchanged (two's-complement wrap), every other
 * input yields |x|.
 */
static __inline int FASTABS(int x)
{
    unsigned int magnitude = (unsigned int)x;
    /* 0xffffffff when the sign bit is set, 0 otherwise */
    unsigned int mask = 0u - (magnitude >> (sizeof(int) * 8 - 1));

    /* (v ^ mask) - mask negates v when mask is all ones, else leaves it alone */
    return (int)((magnitude ^ mask) - mask);
}
96 | |
/*
 * CLZ - count the leading zero bits of x (treated as a 32-bit pattern).
 * Returns 32 for x == 0 and 0 when the top bit is set.
 *
 * The search runs on an unsigned copy: left-shifting a signed int so that a
 * one lands in the sign bit is undefined behaviour in C.
 */
static __inline int CLZ(int x)
{
    unsigned int bits = (unsigned int)x;
    int numZeros;

    if (!bits) {
        return 32;
    }

    /* binary search: halve the window each step, normalising bits upwards */
    numZeros = 1;
    if (!(bits >> 16)) {
        numZeros += 16;
        bits <<= 16;
    }
    if (!(bits >> 24)) {
        numZeros += 8;
        bits <<= 8;
    }
    if (!(bits >> 28)) {
        numZeros += 4;
        bits <<= 4;
    }
    if (!(bits >> 30)) {
        numZeros += 2;
        bits <<= 2;
    }

    /* numZeros started at 1; take it back if the top bit ended up set */
    numZeros -= (int)(bits >> 31);

    return numZeros;
}
128 | |
/* Word64: signed 64-bit integer type used for accumulators in this section
 * (long long when __CW32__ is defined, __int64 otherwise). */
129 | #ifdef __CW32__ |
130 | typedef long long Word64; |
131 | #else |
132 | typedef __int64 Word64; |
133 | #endif |
134 | |
/* U64: overlays a Word64 with its two 32-bit halves so either view can be
 * used.  lo32 is declared first because the low word comes first in memory
 * on this (little-endian) target; do not reorder the members. */
135 | typedef union _U64 { |
136 | Word64 w64; |
137 | struct { |
138 | /* x86 = little endian */ |
139 | unsigned int lo32; |
140 | signed int hi32; |
141 | } r; |
142 | } U64; |
143 | |
144 | /* returns 64-bit value in [edx:eax] */ |
/*
 * MADD64 - 64-bit multiply-accumulate: returns sum64 + (x * y), with the
 * product formed as a signed 64-bit value.
 *
 * Two implementations are selected by the preprocessor:
 *  - Symbian 6.1 / 7.0 emulator builds (__WINS__ without __CW32__) form the
 *    product with a one-operand imul and add its two halves into the low and
 *    high dwords of sum64 using add / adc, so no 64-bit multiply helper is
 *    needed.
 *  - every other configuration uses plain 64-bit C arithmetic.
 */
145 | static __inline Word64 MADD64(Word64 sum64, int x, int y) |
146 | { |
147 | #if (defined (_SYMBIAN_61_) || defined (_SYMBIAN_70_)) && defined (__WINS__) && !defined (__CW32__) |
148 | /* Workaround for the Symbian emulator because of non existing longlong.lib and |
149 | * hence __allmul not defined. */ |
150 | __asm { |
151 | mov eax, x |
152 | imul y |
153 | add dword ptr sum64, eax |
154 | adc dword ptr sum64 + 4, edx |
155 | } |
156 | #else |
157 | sum64 += (Word64)x * (Word64)y; |
158 | #endif |
159 | |
160 | return sum64; |
161 | } |
162 | |
163 | /* toolchain: MSFT Embedded Visual C++ |
164 | * target architecture: ARM v.4 and above (require 'M' type processor for 32x32->64 multiplier) |
165 | */ |
166 | #elif defined (_WIN32) && defined (_WIN32_WCE) && defined (ARM) |
167 | |
/*
 * CLIPTOSHORT - saturate a 32-bit integer to the 16-bit range [-32768, 32767].
 * Relies on >> of a negative int being an arithmetic shift, as the rest of
 * this header does.
 */
static __inline short CLIPTOSHORT(int x)
{
    /* all ones when x is negative, zero otherwise */
    const int signMask = x >> 31;

    /* x already fits when bits 15..31 are all copies of the sign bit */
    if ((x >> 15) == signMask) {
        return (short)x;
    }

    /* saturate: 0x7fff on positive overflow, -32768 on negative overflow */
    return (short)(signMask ^ 0x7fff);
}
180 | |
/*
 * FASTABS - branchless absolute value of a signed integer.
 *
 * The xor/subtract trick is carried out on unsigned values so that the
 * most negative input does not trigger signed-overflow undefined behaviour;
 * that input is returned unchanged (two's-complement wrap), every other
 * input yields |x|.
 */
static __inline int FASTABS(int x)
{
    unsigned int magnitude = (unsigned int)x;
    /* 0xffffffff when the sign bit is set, 0 otherwise */
    unsigned int mask = 0u - (magnitude >> (sizeof(int) * 8 - 1));

    /* (v ^ mask) - mask negates v when mask is all ones, else leaves it alone */
    return (int)((magnitude ^ mask) - mask);
}
191 | |
/*
 * CLZ - count the leading zero bits of x (treated as a 32-bit pattern).
 * Returns 32 for x == 0 and 0 when the top bit is set.
 *
 * The search runs on an unsigned copy: left-shifting a signed int so that a
 * one lands in the sign bit is undefined behaviour in C.
 */
static __inline int CLZ(int x)
{
    unsigned int bits = (unsigned int)x;
    int numZeros;

    if (!bits) {
        return 32;
    }

    /* count leading zeros with binary search (function should be 17 ARM instructions total) */
    numZeros = 1;
    if (!(bits >> 16)) {
        numZeros += 16;
        bits <<= 16;
    }
    if (!(bits >> 24)) {
        numZeros += 8;
        bits <<= 8;
    }
    if (!(bits >> 28)) {
        numZeros += 4;
        bits <<= 4;
    }
    if (!(bits >> 30)) {
        numZeros += 2;
        bits <<= 2;
    }

    /* numZeros started at 1; take it back if the top bit ended up set */
    numZeros -= (int)(bits >> 31);

    return numZeros;
}
223 | |
224 | /* implemented in asmfunc.s */ |
/* Declarations for the routines provided by the external assembly file.
 * They are wrapped in extern "C" so C++ translation units link against the
 * unmangled labels. */
225 | #ifdef __cplusplus |
226 | extern "C" { |
227 | #endif |
228 | |
/* Word64: signed 64-bit integer type used for accumulators in this section. */
229 | typedef __int64 Word64; |
230 | |
/* U64: overlays a Word64 with its two 32-bit halves; lo32 is declared first
 * to match the little-endian layout noted below.  Do not reorder members. */
231 | typedef union _U64 { |
232 | Word64 w64; |
233 | struct { |
234 | /* ARM WinCE = little endian */ |
235 | unsigned int lo32; |
236 | signed int hi32; |
237 | } r; |
238 | } U64; |
239 | |
240 | /* manual name mangling for just this platform (must match labels in .s file) */ |
241 | #define MULSHIFT32 raac_MULSHIFT32 |
242 | #define MADD64 raac_MADD64 |
243 | |
/* Because of the macros above, these prototypes actually declare
 * raac_MULSHIFT32 and raac_MADD64; callers keep using the generic names.
 * MULSHIFT32: top 32 bits of the signed 64-bit product x*y.
 * MADD64:     sum64 + x*y as a 64-bit multiply-accumulate. */
244 | int MULSHIFT32(int x, int y); |
245 | Word64 MADD64(Word64 sum64, int x, int y); |
246 | |
247 | #ifdef __cplusplus |
248 | } |
249 | #endif |
250 | |
251 | /* toolchain: ARM ADS or RealView |
252 | * target architecture: ARM v.4 and above (requires 'M' type processor for 32x32->64 multiplier) |
253 | */ |
254 | #elif (defined (__arm) && defined (__ARMCC_VERSION)) || (defined(HELIX_CONFIG_SYMBIAN_GENERATE_MMP) && !defined(__GCCE__)) |
255 | |
/*
 * MULSHIFT32 (ARM ADS / RealView inline-asm version): signed 32x32 multiply
 * that returns the upper 32 bits of the 64-bit product.
 *
 * smull writes the low word to zlow (discarded) and the high word back into
 * y, which is then returned.  The operand order is chosen to respect the
 * register rules listed inside the function.
 */
256 | static __inline int MULSHIFT32(int x, int y) |
257 | { |
258 | /* rules for smull RdLo, RdHi, Rm, Rs: |
259 | * RdHi != Rm |
260 | * RdLo != Rm |
261 | * RdHi != RdLo |
262 | */ |
263 | int zlow; |
264 | __asm { |
265 | smull zlow, y, x, y |
266 | } |
267 | |
268 | return y; |
269 | } |
270 | |
/*
 * CLIPTOSHORT - saturate a 32-bit integer to the 16-bit range [-32768, 32767].
 * Relies on >> of a negative int being an arithmetic shift, as the rest of
 * this header does.
 */
static __inline short CLIPTOSHORT(int x)
{
    /* all ones when x is negative, zero otherwise */
    const int signMask = x >> 31;

    /* x already fits when bits 15..31 are all copies of the sign bit */
    if ((x >> 15) == signMask) {
        return (short)x;
    }

    /* saturate: 0x7fff on positive overflow, -32768 on negative overflow */
    return (short)(signMask ^ 0x7fff);
}
283 | |
/*
 * FASTABS - branchless absolute value of a signed integer.
 *
 * The xor/subtract trick is carried out on unsigned values so that the
 * most negative input does not trigger signed-overflow undefined behaviour;
 * that input is returned unchanged (two's-complement wrap), every other
 * input yields |x|.
 */
static __inline int FASTABS(int x)
{
    unsigned int magnitude = (unsigned int)x;
    /* 0xffffffff when the sign bit is set, 0 otherwise */
    unsigned int mask = 0u - (magnitude >> (sizeof(int) * 8 - 1));

    /* (v ^ mask) - mask negates v when mask is all ones, else leaves it alone */
    return (int)((magnitude ^ mask) - mask);
}
294 | |
/*
 * CLZ - count the leading zero bits of x (treated as a 32-bit pattern).
 * Returns 32 for x == 0 and 0 when the top bit is set.
 *
 * The search runs on an unsigned copy: left-shifting a signed int so that a
 * one lands in the sign bit is undefined behaviour in C.
 */
static __inline int CLZ(int x)
{
    unsigned int bits = (unsigned int)x;
    int numZeros;

    if (!bits) {
        return 32;
    }

    /* count leading zeros with binary search (function should be 17 ARM instructions total) */
    numZeros = 1;
    if (!(bits >> 16)) {
        numZeros += 16;
        bits <<= 16;
    }
    if (!(bits >> 24)) {
        numZeros += 8;
        bits <<= 8;
    }
    if (!(bits >> 28)) {
        numZeros += 4;
        bits <<= 4;
    }
    if (!(bits >> 30)) {
        numZeros += 2;
        bits <<= 2;
    }

    /* numZeros started at 1; take it back if the top bit ended up set */
    numZeros -= (int)(bits >> 31);

    return numZeros;

    /* ARM code would look like this, but do NOT use inline asm in ADS for this,
       because you can't safely use the status register flags intermixed with C code

        __asm {
            mov     numZeros, #1
            tst     x, 0xffff0000
            addeq   numZeros, numZeros, #16
            moveq   x, x, lsl #16
            tst     x, 0xff000000
            addeq   numZeros, numZeros, #8
            moveq   x, x, lsl #8
            tst     x, 0xf0000000
            addeq   numZeros, numZeros, #4
            moveq   x, x, lsl #4
            tst     x, 0xc0000000
            addeq   numZeros, numZeros, #2
            moveq   x, x, lsl #2
            sub     numZeros, numZeros, x, lsr #31
        }
    */
    /* reference (for non-zero x):
        numZeros = 0;
        while (!(x & 0x80000000)) {
            numZeros++;
            x <<= 1;
        }
    */
}
354 | |
/* Word64: signed 64-bit integer type used for accumulators in this section. */
355 | typedef __int64 Word64; |
356 | |
/* U64: overlays a Word64 with its two 32-bit halves.  MADD64 in this section
 * loads the union through w64 and hands lo32 / hi32 to the multiply-accumulate
 * instruction, so the member order must match the target's word order. */
357 | typedef union _U64 { |
358 | Word64 w64; |
359 | struct { |
360 | /* ARM ADS = little endian */ |
361 | unsigned int lo32; |
362 | signed int hi32; |
363 | } r; |
364 | } U64; |
365 | |
/*
 * MADD64 (ARM ADS / RealView inline-asm version): 64-bit multiply-accumulate,
 * returns sum64 + (x * y).
 *
 * sum64 is copied into a U64 so its low and high words can be named as the
 * two accumulator operands of smlal; the updated union is returned as a
 * single Word64.
 */
366 | static __inline Word64 MADD64(Word64 sum64, int x, int y) |
367 | { |
368 | U64 u; |
369 | u.w64 = sum64; |
370 | |
371 | __asm { |
372 | smlal u.r.lo32, u.r.hi32, x, y |
373 | } |
374 | |
375 | return u.w64; |
376 | } |
377 | |
378 | /* toolchain: ARM gcc |
379 | * target architecture: ARM v.4 and above (requires 'M' type processor for 32x32->64 multiplier) |
380 | */ |
381 | #elif defined(__GNUC__) && defined(__arm__) |
382 | |
/*
 * MULSHIFT32 (ARM gcc inline-asm version): signed 32x32 multiply that returns
 * the upper 32 bits of the 64-bit product.
 *
 * Operand map for the smull template:
 *   %0 = zlow  low word of the product, early-clobber output, discarded
 *   %1 = y     high word of the product, returned
 *   %2 = x     first factor
 *   %3 = y     second factor, tied to operand 1 by the "1" constraint
 */
383 | static __inline__ int MULSHIFT32(int x, int y) |
384 | { |
385 | int zlow; |
386 | __asm__ volatile("smull %0,%1,%2,%3" : "=&r"(zlow), "=r"(y) : "r"(x), "1"(y) : "cc"); |
387 | return y; |
388 | } |
389 | |
/*
 * CLIPTOSHORT - saturate a 32-bit integer to the 16-bit range [-32768, 32767].
 * Relies on >> of a negative int being an arithmetic shift, as the rest of
 * this header does.
 */
static __inline short CLIPTOSHORT(int x)
{
    /* all ones when x is negative, zero otherwise */
    const int signMask = x >> 31;

    /* x already fits when bits 15..31 are all copies of the sign bit */
    if ((x >> 15) == signMask) {
        return (short)x;
    }

    /* saturate: 0x7fff on positive overflow, -32768 on negative overflow */
    return (short)(signMask ^ 0x7fff);
}
402 | |
/*
 * FASTABS - branchless absolute value of a signed integer.
 *
 * The xor/subtract trick is carried out on unsigned values so that the
 * most negative input does not trigger signed-overflow undefined behaviour;
 * that input is returned unchanged (two's-complement wrap), every other
 * input yields |x|.
 */
static __inline int FASTABS(int x)
{
    unsigned int magnitude = (unsigned int)x;
    /* 0xffffffff when the sign bit is set, 0 otherwise */
    unsigned int mask = 0u - (magnitude >> (sizeof(int) * 8 - 1));

    /* (v ^ mask) - mask negates v when mask is all ones, else leaves it alone */
    return (int)((magnitude ^ mask) - mask);
}
413 | |
/*
 * CLZ - count the leading zero bits of x (treated as a 32-bit pattern).
 * Returns 32 for x == 0 and 0 when the top bit is set.
 *
 * Uses the same five-step binary search as the other platform sections
 * instead of a bit-at-a-time loop, and runs it on an unsigned copy because
 * left-shifting a signed int into the sign bit is undefined behaviour in C.
 */
static __inline int CLZ(int x)
{
    unsigned int bits = (unsigned int)x;
    int numZeros;

    if (!bits) {
        return 32;
    }

    /* binary search: halve the window each step, normalising bits upwards */
    numZeros = 1;
    if (!(bits >> 16)) {
        numZeros += 16;
        bits <<= 16;
    }
    if (!(bits >> 24)) {
        numZeros += 8;
        bits <<= 8;
    }
    if (!(bits >> 28)) {
        numZeros += 4;
        bits <<= 4;
    }
    if (!(bits >> 30)) {
        numZeros += 2;
        bits <<= 2;
    }

    /* numZeros started at 1; take it back if the top bit ended up set */
    numZeros -= (int)(bits >> 31);

    return numZeros;
}
430 | |
/* Word64: signed 64-bit integer type used for accumulators in this section. */
431 | typedef long long Word64; |
432 | |
/* U64: overlays a Word64 with its two 32-bit halves.  MADD64 in this section
 * loads the union through w64 and passes lo32 / hi32 to the
 * multiply-accumulate instruction, so the member order must match the
 * target's word order. */
433 | typedef union _U64 { |
434 | Word64 w64; |
435 | struct { |
436 | /* ARM gcc: layout assumes a little-endian target (NOTE(review): big-endian ARM builds would need hi32 first) */ |
437 | unsigned int lo32; |
438 | signed int hi32; |
439 | } r; |
440 | } U64; |
441 | |
/*
 * MADD64 (ARM gcc inline-asm version): 64-bit multiply-accumulate, returns
 * sum64 + (x * y).
 *
 * sum64 is copied into a U64 so its halves can be bound to registers.
 * Operand map for the smlal template:
 *   %0 = u.r.lo32  low word of the accumulator, read and written ("+&r")
 *   %1 = u.r.hi32  high word of the accumulator, read and written ("+&r")
 *   %2 = x, %3 = y the two 32-bit factors
 */
442 | static __inline Word64 MADD64(Word64 sum64, int x, int y) |
443 | { |
444 | U64 u; |
445 | u.w64 = sum64; |
446 | |
447 | __asm__ volatile("smlal %0,%1,%2,%3" : "+&r"(u.r.lo32), "+&r"(u.r.hi32) : "r"(x), "r"(y) : "cc"); |
448 | |
449 | return u.w64; |
450 | } |
451 | |
452 | /* toolchain: x86 gcc |
453 | * target architecture: x86 |
454 | */ |
455 | #elif defined(__GNUC__) && (defined(__i386__) || defined(__amd64__)) || (defined (_SOLARIS) && !defined (__GNUC__) && defined(_SOLARISX86)) |
456 | |
/* Word64: signed 64-bit integer type used for products and accumulators. */
typedef long long Word64;

/*
 * MULSHIFT32 - signed multiply of two 32-bit integers, returning the top
 * 32 bits of the 64-bit product.
 */
static __inline__ int MULSHIFT32(int x, int y)
{
    /* widen before multiplying so the full 64-bit product is kept */
    const Word64 product = (Word64)x * (Word64)y;

    return (int)(product >> 32);
}
467 | |
/*
 * CLIPTOSHORT - saturate a 32-bit integer to the 16-bit range [-32768, 32767].
 * Relies on >> of a negative int being an arithmetic shift, as the rest of
 * this header does.
 */
static __inline short CLIPTOSHORT(int x)
{
    /* all ones when x is negative, zero otherwise */
    const int signMask = x >> 31;

    /* x already fits when bits 15..31 are all copies of the sign bit */
    if ((x >> 15) == signMask) {
        return (short)x;
    }

    /* saturate: 0x7fff on positive overflow, -32768 on negative overflow */
    return (short)(signMask ^ 0x7fff);
}
480 | |
/*
 * FASTABS - branchless absolute value of a signed integer.
 *
 * The xor/subtract trick is carried out on unsigned values so that the
 * most negative input does not trigger signed-overflow undefined behaviour;
 * that input is returned unchanged (two's-complement wrap), every other
 * input yields |x|.
 */
static __inline int FASTABS(int x)
{
    unsigned int magnitude = (unsigned int)x;
    /* 0xffffffff when the sign bit is set, 0 otherwise */
    unsigned int mask = 0u - (magnitude >> (sizeof(int) * 8 - 1));

    /* (v ^ mask) - mask negates v when mask is all ones, else leaves it alone */
    return (int)((magnitude ^ mask) - mask);
}
491 | |
/*
 * CLZ - count the leading zero bits of x (treated as a 32-bit pattern).
 * Returns 32 for x == 0 and 0 when the top bit is set.
 *
 * The search runs on an unsigned copy: left-shifting a signed int so that a
 * one lands in the sign bit is undefined behaviour in C.
 */
static __inline int CLZ(int x)
{
    unsigned int bits = (unsigned int)x;
    int numZeros;

    if (!bits) {
        return 32;
    }

    /* binary search: halve the window each step, normalising bits upwards */
    numZeros = 1;
    if (!(bits >> 16)) {
        numZeros += 16;
        bits <<= 16;
    }
    if (!(bits >> 24)) {
        numZeros += 8;
        bits <<= 8;
    }
    if (!(bits >> 28)) {
        numZeros += 4;
        bits <<= 4;
    }
    if (!(bits >> 30)) {
        numZeros += 2;
        bits <<= 2;
    }

    /* numZeros started at 1; take it back if the top bit ended up set */
    numZeros -= (int)(bits >> 31);

    return numZeros;
}
523 | |
/* U64: overlays a Word64 with its two 32-bit halves so either view can be
 * used.  lo32 is declared first because the low word comes first in memory
 * on this (little-endian) target; do not reorder the members. */
524 | typedef union _U64 { |
525 | Word64 w64; |
526 | struct { |
527 | /* x86 = little endian */ |
528 | unsigned int lo32; |
529 | signed int hi32; |
530 | } r; |
531 | } U64; |
532 | |
533 | static __inline Word64 MADD64(Word64 sum64, int x, int y) |
534 | { |
535 | sum64 += (Word64)x * (Word64)y; |
536 | |
537 | return sum64; |
538 | } |
539 | |
540 | #elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__)) || (defined (_SOLARIS) && !defined (__GNUC__) && !defined (_SOLARISX86)) |
541 | |
/* Word64: signed 64-bit integer type used for products and accumulators. */
typedef long long Word64;

/*
 * MULSHIFT32 - signed multiply of two 32-bit integers, returning the top
 * 32 bits of the 64-bit product.
 */
static __inline__ int MULSHIFT32(int x, int y)
{
    /* widen before multiplying so the full 64-bit product is kept */
    const Word64 product = (Word64)x * (Word64)y;

    return (int)(product >> 32);
}
552 | |
/*
 * CLIPTOSHORT - saturate a 32-bit integer to the 16-bit range [-32768, 32767].
 * Relies on >> of a negative int being an arithmetic shift, as the rest of
 * this header does.
 */
static __inline short CLIPTOSHORT(int x)
{
    /* all ones when x is negative, zero otherwise */
    const int signMask = x >> 31;

    /* x already fits when bits 15..31 are all copies of the sign bit */
    if ((x >> 15) == signMask) {
        return (short)x;
    }

    /* saturate: 0x7fff on positive overflow, -32768 on negative overflow */
    return (short)(signMask ^ 0x7fff);
}
565 | |
/*
 * FASTABS - branchless absolute value of a signed integer.
 *
 * The xor/subtract trick is carried out on unsigned values so that the
 * most negative input does not trigger signed-overflow undefined behaviour;
 * that input is returned unchanged (two's-complement wrap), every other
 * input yields |x|.
 */
static __inline int FASTABS(int x)
{
    unsigned int magnitude = (unsigned int)x;
    /* 0xffffffff when the sign bit is set, 0 otherwise */
    unsigned int mask = 0u - (magnitude >> (sizeof(int) * 8 - 1));

    /* (v ^ mask) - mask negates v when mask is all ones, else leaves it alone */
    return (int)((magnitude ^ mask) - mask);
}
576 | |
/*
 * CLZ - count the leading zero bits of x (treated as a 32-bit pattern).
 * Returns 32 for x == 0 and 0 when the top bit is set.
 *
 * The search runs on an unsigned copy: left-shifting a signed int so that a
 * one lands in the sign bit is undefined behaviour in C.
 */
static __inline int CLZ(int x)
{
    unsigned int bits = (unsigned int)x;
    int numZeros;

    if (!bits) {
        return 32;
    }

    /* binary search: halve the window each step, normalising bits upwards */
    numZeros = 1;
    if (!(bits >> 16)) {
        numZeros += 16;
        bits <<= 16;
    }
    if (!(bits >> 24)) {
        numZeros += 8;
        bits <<= 8;
    }
    if (!(bits >> 28)) {
        numZeros += 4;
        bits <<= 4;
    }
    if (!(bits >> 30)) {
        numZeros += 2;
        bits <<= 2;
    }

    /* numZeros started at 1; take it back if the top bit ended up set */
    numZeros -= (int)(bits >> 31);

    return numZeros;
}
608 | |
/* U64: overlays a Word64 with its two 32-bit halves so either view can be
 * used.  Unlike the other platform sections, hi32 is declared first here
 * because the high word comes first in memory on this (big-endian) target;
 * do not reorder the members. */
609 | typedef union _U64 { |
610 | Word64 w64; |
611 | struct { |
612 | /* PowerPC = big endian */ |
613 | signed int hi32; |
614 | unsigned int lo32; |
615 | } r; |
616 | } U64; |
617 | |
618 | static __inline Word64 MADD64(Word64 sum64, int x, int y) |
619 | { |
620 | sum64 += (Word64)x * (Word64)y; |
621 | |
622 | return sum64; |
623 | } |
624 | |
625 | #else |
626 | |
627 | #error Unsupported platform in assembly.h |
628 | |
629 | #endif /* platforms */ |
630 | |
631 | #endif /* _ASSEMBLY_H */ |
632 |