1 | /* ***** BEGIN LICENSE BLOCK ***** |
2 | * Source last modified: $Id: assembly.h,v 1.7 2005/11/10 00:04:40 margotm Exp $ |
3 | * |
4 | * Portions Copyright (c) 1995-2005 RealNetworks, Inc. All Rights Reserved. |
5 | * |
6 | * The contents of this file, and the files included with this file, |
7 | * are subject to the current version of the RealNetworks Public |
8 | * Source License (the "RPSL") available at |
9 | * http://www.helixcommunity.org/content/rpsl unless you have licensed |
10 | * the file under the current version of the RealNetworks Community |
11 | * Source License (the "RCSL") available at |
12 | * http://www.helixcommunity.org/content/rcsl, in which case the RCSL |
13 | * will apply. You may also obtain the license terms directly from |
14 | * RealNetworks. You may not use this file except in compliance with |
15 | * the RPSL or, if you have a valid RCSL with RealNetworks applicable |
16 | * to this file, the RCSL. Please see the applicable RPSL or RCSL for |
17 | * the rights, obligations and limitations governing use of the |
18 | * contents of the file. |
19 | * |
20 | * This file is part of the Helix DNA Technology. RealNetworks is the |
21 | * developer of the Original Code and owns the copyrights in the |
22 | * portions it created. |
23 | * |
24 | * This file, and the files included with this file, is distributed |
25 | * and made available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY |
26 | * KIND, EITHER EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS |
27 | * ALL SUCH WARRANTIES, INCLUDING WITHOUT LIMITATION, ANY WARRANTIES |
28 | * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, QUIET |
29 | * ENJOYMENT OR NON-INFRINGEMENT. |
30 | * |
31 | * Technology Compatibility Kit Test Suite(s) Location: |
32 | * http://www.helixcommunity.org/content/tck |
33 | * |
34 | * Contributor(s): |
35 | * |
36 | * ***** END LICENSE BLOCK ***** */ |
37 | |
38 | /************************************************************************************** |
39 | * Fixed-point HE-AAC decoder |
40 | * Jon Recker (jrecker@real.com) |
41 | * February 2005 |
42 | * |
43 | * assembly.h - inline assembly language functions and prototypes |
44 | * |
45 | * MULSHIFT32(x, y) signed multiply of two 32-bit integers (x and y), |
46 | * returns top 32-bits of 64-bit result |
47 | * CLIPTOSHORT(x) convert 32-bit integer to 16-bit short, |
48 | * clipping to [-32768, 32767] |
49 | * FASTABS(x) branchless absolute value of signed integer x |
50 | * CLZ(x) count leading zeros on signed integer x |
51 | * MADD64(sum64, x, y) 64-bit multiply accumulate: sum64 += (x*y) |
52 | **************************************************************************************/ |
53 | |
54 | #ifndef _ASSEMBLY_H |
55 | #define _ASSEMBLY_H |
56 | |
//#define _Inline inline
/* NOTE(review): _ARC32 is force-defined unconditionally (presumably by the
 * ARC port, whose toolchain predefines no identifying macro).  This makes the
 * ARC32 #elif branch below the fallback for ANY toolchain not matched by an
 * earlier #elif — confirm this is intended before building a new platform. */
#define _ARC32
59 | /* toolchain: MSFT Visual C++ |
60 | * target architecture: x86 |
61 | */ |
62 | #if (defined (_WIN32) && !defined (_WIN32_WCE)) || (defined (__WINS__) && defined (_SYMBIAN)) || (defined (WINCE_EMULATOR)) || (defined (_OPENWAVE_SIMULATOR)) |
63 | |
64 | #pragma warning( disable : 4035 ) /* complains about inline asm not returning a value */ |
65 | |
/* MULSHIFT32: return the high 32 bits of the signed 64-bit product x*y.
 * MSVC x86 inline asm: imul leaves the 64-bit product in edx:eax; the high
 * half is moved to eax, which is the x86 return register (warning 4035 is
 * disabled above because there is deliberately no C return statement). */
static __inline int MULSHIFT32(int x, int y)
{
	__asm {
		mov eax, x
		imul y
		mov eax, edx
	}
}
74 | |
/* Saturate a 32-bit value to the 16-bit range [-32768, 32767]. */
static __inline short CLIPTOSHORT(int x)
{
	if (x > 32767) {
		x = 32767;
	} else if (x < -32768) {
		x = -32768;
	}

	return (short)x;
}
87 | |
/* Branch-free absolute value of x (two's complement; like the hardware
 * idiom, FASTABS(INT_MIN) wraps back to INT_MIN). */
static __inline int FASTABS(int x)
{
	const int mask = x >> (sizeof(int) * 8 - 1);	/* all ones iff x < 0 */
	return (x ^ mask) - mask;
}
98 | |
/* Count leading zero bits of x; returns 32 when x == 0.
 * Binary search over the bit pattern: at each step, if the examined top
 * bits are clear, credit that many zeros and shift them out. */
static __inline int CLZ(int x)
{
	unsigned int v = (unsigned int)x;
	int zeros = 1;

	if (v == 0)
		return 32;

	if ((v & 0xffff0000u) == 0) { zeros += 16; v <<= 16; }
	if ((v & 0xff000000u) == 0) { zeros += 8;  v <<= 8;  }
	if ((v & 0xf0000000u) == 0) { zeros += 4;  v <<= 4;  }
	if ((v & 0xc0000000u) == 0) { zeros += 2;  v <<= 2;  }

	/* top bit now set unless exactly one more zero was over-counted */
	return zeros - (int)(v >> 31);
}
130 | |
/* 64-bit accumulator type: CodeWarrior (__CW32__) supports long long,
 * MSVC uses its native __int64. */
#ifdef __CW32__
typedef long long Word64;
#else
typedef __int64 Word64;
#endif

/* Union view of a 64-bit value as two 32-bit halves. */
typedef union _U64 {
	Word64 w64;
	struct {
		/* x86 = little endian: low word first */
		unsigned int lo32;
		signed int hi32;
	} r;
} U64;
145 | |
146 | /* returns 64-bit value in [edx:eax] */ |
147 | static __inline Word64 madd64(Word64 sum64, int x, int y) |
148 | { |
149 | #if (defined (_SYMBIAN_61_) || defined (_SYMBIAN_70_)) && defined (__WINS__) && !defined (__CW32__) |
150 | /* Workaround for the Symbian emulator because of non existing longlong.lib and |
151 | * hence __allmul not defined. */ |
152 | __asm { |
153 | mov eax, x |
154 | imul y |
155 | add dword ptr sum64, eax |
156 | adc dword ptr sum64 + 4, edx |
157 | } |
158 | #else |
159 | sum64 += (Word64)x * (Word64)y; |
160 | |
161 | /* equivalent to return (sum + ((__int64)x * y)); */ |
162 | #endif |
163 | } |
164 | |
/* SET_ZERO(x):          clear a 64-bit accumulator */
#define SET_ZERO(x) x=0
/* MADD64(sum64, x, y):  sum64 += (Word64)x * y (see madd64 above) */
#define MADD64(sum64, x, y) sum64=madd64(sum64, x, y)
/* ADD64(x64, y64):      64-bit add.  NOTE(review): the trailing ';' makes
 * this macro unsafe in an unbraced if/else — kept as-is for existing callers */
#define ADD64(x64, y64) x64 += y64;
168 | |
169 | /* toolchain: MSFT Embedded Visual C++ |
170 | * target architecture: ARM v.4 and above (require 'M' type processor for 32x32->64 multiplier) |
171 | */ |
172 | #elif defined (_WIN32) && defined (_WIN32_WCE) && defined (ARM) |
173 | |
/* Clip a 32-bit value into the signed 16-bit range [-32768, 32767]. */
static __inline short CLIPTOSHORT(int x)
{
	int ext = x >> 31;		/* 0 for x >= 0, -1 for x < 0 */

	/* if sign-extending bit 15 doesn't reproduce the top bits, x overflows */
	if ((x >> 15) != ext)
		x = ext ^ 0x7fff;	/* -> 32767 or -32768 */

	return (short)x;
}
186 | |
/* Branchless |x| via the add-then-xor identity (two's complement). */
static __inline int FASTABS(int x)
{
	const int m = x >> (sizeof(int) * 8 - 1);
	return (x + m) ^ m;
}
197 | |
/* Leading-zero count of x (CLZ(0) == 32); O(1) binary search, roughly
 * 17 ARM instructions when compiled. */
static __inline int CLZ(int x)
{
	unsigned int bits = (unsigned int)x;
	int n = 1;

	if (!bits)
		return 32;

	if (!(bits >> 16)) { n += 16; bits <<= 16; }
	if (!(bits >> 24)) { n += 8;  bits <<= 8;  }
	if (!(bits >> 28)) { n += 4;  bits <<= 4;  }
	if (!(bits >> 30)) { n += 2;  bits <<= 2;  }

	return n - (int)(bits >> 31);
}
229 | |
/* MULSHIFT32 and MADD64 are implemented in asmfunc.s for this platform */
#ifdef __cplusplus
extern "C" {
#endif

/* 64-bit accumulator (eVC provides the __int64 built-in) */
typedef __int64 Word64;

/* Union view of a 64-bit value as two 32-bit halves */
typedef union _U64 {
	Word64 w64;
	struct {
		/* ARM WinCE = little endian: low word first */
		unsigned int lo32;
		signed int hi32;
	} r;
} U64;

/* manual name mangling for just this platform (must match labels in .s file) */
#define MULSHIFT32 raac_MULSHIFT32
#define MADD64 raac_MADD64

/* MULSHIFT32: high 32 bits of the signed 64-bit product x*y */
int MULSHIFT32(int x, int y);
/* MADD64: returns sum64 + (Word64)x * y */
Word64 MADD64(Word64 sum64, int x, int y);

#ifdef __cplusplus
}
#endif
256 | |
257 | /* toolchain: ARM ADS or RealView |
258 | * target architecture: ARM v.4 and above (requires 'M' type processor for 32x32->64 multiplier) |
259 | */ |
260 | #elif (defined (__arm) && defined (__ARMCC_VERSION)) || (defined(HELIX_CONFIG_SYMBIAN_GENERATE_MMP) && !defined(__GCCE__)) |
261 | |
/* MULSHIFT32: high 32 bits of the signed 64-bit product x*y.
 * smull writes {zlow, y} = x*y, then only the high word (y) is returned.
 * Register rules for smull RdLo, RdHi, Rm, Rs:
 *   RdHi != Rm,  RdLo != Rm,  RdHi != RdLo
 * zlow is a distinct local so ADS/RealView can satisfy these. */
static __inline int MULSHIFT32(int x, int y)
{
	int zlow;
	__asm {
		smull zlow, y, x, y
	}

	return y;
}
276 | |
/* Saturate a 32-bit value to [-32768, 32767] before narrowing to short. */
static __inline short CLIPTOSHORT(int x)
{
	if (x > 32767) {
		x = 32767;
	} else if (x < -32768) {
		x = -32768;
	}

	return (short)x;
}
289 | |
/* Branch-free absolute value (FASTABS(INT_MIN) wraps, as in all ports). */
static __inline int FASTABS(int x)
{
	const int m = x >> (sizeof(int) * 8 - 1);
	return (x + m) ^ m;
}
300 | |
/* Leading-zero count of x; 32 when x == 0.
 * Pure C binary search (about 17 ARM instructions).  Do NOT rewrite this
 * as ADS inline asm: the status-register flags cannot safely be carried
 * across intermixed C code. */
static __inline int CLZ(int x)
{
	unsigned int v = (unsigned int)x;
	int zeros = 1;

	if (v == 0)
		return 32;

	if ((v & 0xffff0000u) == 0) { zeros += 16; v <<= 16; }
	if ((v & 0xff000000u) == 0) { zeros += 8;  v <<= 8;  }
	if ((v & 0xf0000000u) == 0) { zeros += 4;  v <<= 4;  }
	if ((v & 0xc0000000u) == 0) { zeros += 2;  v <<= 2;  }

	return zeros - (int)(v >> 31);
}
360 | |
/* ADS/RealView provides the __int64 built-in for the 64-bit accumulator */
typedef __int64 Word64;

/* Union view of the accumulator as two 32-bit halves, so MADD64 can feed
 * them straight into smlal */
typedef union _U64 {
	Word64 w64;
	struct {
		/* ARM ADS = little endian: low word first */
		unsigned int lo32;
		signed int hi32;
	} r;
} U64;
371 | |
/* MADD64: returns sum64 + (Word64)x * y, using the ARM smlal (signed
 * multiply-accumulate long) instruction on the two 32-bit halves. */
static __inline Word64 MADD64(Word64 sum64, int x, int y)
{
	U64 u;
	u.w64 = sum64;

	__asm {
		smlal u.r.lo32, u.r.hi32, x, y
	}

	return u.w64;
}
383 | |
384 | /* toolchain: ARM gcc |
385 | * target architecture: ARM v.4 and above (requires 'M' type processor for 32x32->64 multiplier) |
386 | */ |
387 | #elif defined(__GNUC__) && defined(__arm__) |
388 | |
/* MULSHIFT32: high 32 bits of the signed 64-bit product x*y.
 * smull writes the product to {zlow, y}; only the high word (y) is kept.
 * zlow is earlyclobbered ("=&r") so it cannot alias x or y.
 * NOTE(review): output %1 (y, the RdHi operand) is NOT earlyclobbered;
 * cores on which smull requires RdHi != Rm rely on the compiler not
 * allocating y and x to the same register — confirm for targeted cores. */
static __inline__ int MULSHIFT32(int x, int y)
{
	int zlow;
	__asm__ volatile("smull %0,%1,%2,%3" : "=&r"(zlow), "=r"(y) : "r"(x), "1"(y) : "cc");
	return y;
}
395 | |
/* Clip a 32-bit value into the signed 16-bit range before narrowing. */
static __inline short CLIPTOSHORT(int x)
{
	int ext = x >> 31;		/* 0 or -1: the sign extension of x */

	/* out of range iff the top 17 bits are not a pure sign extension */
	if ((x >> 15) != ext)
		x = ext ^ 0x7fff;	/* saturate to 32767 / -32768 */

	return (short)x;
}
408 | |
/* Branchless absolute value of a signed integer. */
static __inline int FASTABS(int x)
{
	const int mask = x >> (sizeof(int) * 8 - 1);	/* -1 iff x < 0 */
	return (x ^ mask) - mask;
}
419 | |
/* Count leading zeros of x; returns 32 (sizeof(int)*8) when x == 0.
 *
 * Replaced the bit-at-a-time while loop (up to 31 iterations, and a signed
 * left shift of a negative value — undefined behavior) with the same O(1)
 * binary search used by every other port in this file.  Results are
 * identical for all inputs. */
static __inline int CLZ(int x)
{
	unsigned int u = (unsigned int)x;
	int numZeros;

	if (!u) {
		return (sizeof(int) * 8);
	}

	numZeros = 1;
	if (!(u >> 16)) { numZeros += 16; u <<= 16; }
	if (!(u >> 24)) { numZeros += 8;  u <<= 8;  }
	if (!(u >> 28)) { numZeros += 4;  u <<= 4;  }
	if (!(u >> 30)) { numZeros += 2;  u <<= 2;  }
	numZeros -= (int)(u >> 31);

	return numZeros;
}
436 | |
/* gcc: 64-bit accumulator is plain long long */
typedef long long Word64;

/* Union view of the accumulator as two 32-bit halves for smlal */
typedef union _U64 {
	Word64 w64;
	struct {
		/* little endian: low word first (comment inherited from the ADS port) */
		unsigned int lo32;
		signed int hi32;
	} r;
} U64;
447 | |
/* MADD64: returns sum64 + (Word64)x * y via the ARM smlal instruction.
 * The "+&r" constraints make both accumulator halves read-write and
 * earlyclobbered, so they cannot be allocated to the same registers as
 * x or y (smlal requires RdLo/RdHi distinct from Rm). */
static __inline Word64 MADD64(Word64 sum64, int x, int y)
{
	U64 u;
	u.w64 = sum64;

	__asm__ volatile("smlal %0,%1,%2,%3" : "+&r"(u.r.lo32), "+&r"(u.r.hi32) : "r"(x), "r"(y) : "cc");

	return u.w64;
}
457 | |
458 | /* toolchain: x86 gcc |
459 | * target architecture: x86 |
460 | */ |
461 | #elif defined(__GNUC__) && (defined(__i386__) || defined(__amd64__)) || (defined (_SOLARIS) && !defined (__GNUC__) && defined(_SOLARISX86)) |
462 | |
/* 64-bit accumulator for the portable C implementations below */
typedef long long Word64;

/* High 32 bits of the full signed 64-bit product x*y. */
static __inline__ int MULSHIFT32(int x, int y)
{
	Word64 prod = (Word64)x * (Word64)y;
	return (int)(prod >> 32);
}
473 | |
/* Saturate a 32-bit value to the 16-bit range [-32768, 32767]. */
static __inline short CLIPTOSHORT(int x)
{
	if (x > 32767) {
		x = 32767;
	} else if (x < -32768) {
		x = -32768;
	}

	return (short)x;
}
486 | |
/* Branch-free |x| using the add-then-xor identity (two's complement). */
static __inline int FASTABS(int x)
{
	const int m = x >> (sizeof(int) * 8 - 1);
	return (x + m) ^ m;
}
497 | |
/* Leading-zero count of x; returns 32 for x == 0.
 * Binary search with bit masks: each test credits the zeros found and
 * shifts them out of the working value. */
static __inline int CLZ(int x)
{
	unsigned int v = (unsigned int)x;
	int zeros = 1;

	if (v == 0)
		return 32;

	if ((v & 0xffff0000u) == 0) { zeros += 16; v <<= 16; }
	if ((v & 0xff000000u) == 0) { zeros += 8;  v <<= 8;  }
	if ((v & 0xf0000000u) == 0) { zeros += 4;  v <<= 4;  }
	if ((v & 0xc0000000u) == 0) { zeros += 2;  v <<= 2;  }

	return zeros - (int)(v >> 31);
}
529 | |
/* Union view of a 64-bit value as two 32-bit halves (x86 is little
 * endian, so the low word comes first) */
typedef union _U64 {
	Word64 w64;
	struct {
		/* x86 = little endian */
		unsigned int lo32;
		signed int hi32;
	} r;
} U64;
538 | |
539 | static __inline Word64 MADD64(Word64 sum64, int x, int y) |
540 | { |
541 | sum64 += (Word64)x * (Word64)y; |
542 | |
543 | return sum64; |
544 | } |
545 | |
546 | #elif defined(__GNUC__) && (defined(__powerpc__) || defined(__POWERPC__)) || (defined (_SOLARIS) && !defined (__GNUC__) && !defined (_SOLARISX86)) |
547 | |
/* 64-bit accumulator for the portable C implementations below */
typedef long long Word64;

/* High 32 bits of the full signed 64-bit product x*y. */
static __inline__ int MULSHIFT32(int x, int y)
{
	Word64 prod = (Word64)x * (Word64)y;
	return (int)(prod >> 32);
}
558 | |
/* Clip a 32-bit value into the signed 16-bit range before narrowing. */
static __inline short CLIPTOSHORT(int x)
{
	int ext = x >> 31;		/* sign extension: 0 or -1 */

	if ((x >> 15) != ext)
		x = ext ^ 0x7fff;	/* saturate to 32767 / -32768 */

	return (short)x;
}
571 | |
/* Branchless absolute value of a signed integer. */
static __inline int FASTABS(int x)
{
	const int mask = x >> (sizeof(int) * 8 - 1);	/* all ones iff x < 0 */
	return (x ^ mask) - mask;
}
582 | |
/* Leading-zero count of x (32 when x == 0), via O(1) binary search. */
static __inline int CLZ(int x)
{
	unsigned int bits = (unsigned int)x;
	int n = 1;

	if (!bits)
		return 32;

	if (!(bits >> 16)) { n += 16; bits <<= 16; }
	if (!(bits >> 24)) { n += 8;  bits <<= 8;  }
	if (!(bits >> 28)) { n += 4;  bits <<= 4;  }
	if (!(bits >> 30)) { n += 2;  bits <<= 2;  }

	return n - (int)(bits >> 31);
}
614 | |
/* Union view of a 64-bit value as two 32-bit halves (PowerPC is big
 * endian, so the high word comes first) */
typedef union _U64 {
	Word64 w64;
	struct {
		/* PowerPC = big endian */
		signed int hi32;
		unsigned int lo32;
	} r;
} U64;
623 | |
624 | static __inline Word64 MADD64(Word64 sum64, int x, int y) |
625 | { |
626 | sum64 += (Word64)x * (Word64)y; |
627 | |
628 | return sum64; |
629 | } |
630 | |
631 | #elif defined(_ARC32) |
632 | |
/* MULSHIFT32: high 32 bits of the signed 64-bit product x*y.
 * MetaWare High C _Asm pseudo-function for ARC: mullw/machlw form the
 * 64-bit product; the result is returned in r0.
 * NOTE(review): cannot be verified without the ARC toolchain. */
_Asm _Inline int MULSHIFT32(int x, int y)
{
	% reg x, y
	mullw 0, x, y
	machlw % r0, x, y
	% error
}
640 | |
/* CLIPTOSHORT: saturate x to [-32768, 32767] using the ARC min/max
 * instructions (result in r0). */
_Asm _Inline short CLIPTOSHORT(int x)
{
	% reg x
	min % r0, x, 0x7fff
	max % r0, % r0, -0x8000
	% error
}
648 | |
/* FASTABS: absolute value via the ARC abs instruction (result in r0). */
_Asm _Inline int FASTABS(int x)
{
	% reg x
	abs % r0, x
	% error
}
655 | |
/* CLZ: leading-zero count via the ARC norm (normalize) instruction;
 * norm yields (CLZ - 1), so 1 is added back. */
_Asm _Inline int CLZ(int x)
{
	/* assume x>0, if x<0 should return 0 */
	% reg x;
	norm % r0, x
	add % r0, % r0, 1
	% error
}
664 | #endif |
665 | typedef struct { |
666 | unsigned int lo32; |
667 | signed int hi32; |
668 | } Word64; |
669 | |
670 | typedef union _U64 { |
671 | Word64 w64; |
672 | struct { |
673 | unsigned int lo32; |
674 | signed int hi32; |
675 | } r; |
676 | } U64; |
677 | |
678 | _Asm _Inline unsigned add64_lo(unsigned int xlo, unsigned int ylo) |
679 | { |
680 | % reg xlo, ylo; |
681 | add.f % r0, xlo, ylo |
682 | % error |
683 | } |
684 | |
685 | _Asm _Inline int add64_hi(unsigned int xhi, unsigned int yhi) |
686 | { |
687 | % reg xhi, yhi; |
688 | adc % r0, xhi, yhi |
689 | % error |
690 | } |
691 | |
692 | _Asm _Inline unsigned madd64_lo(unsigned lo, int a, int b) |
693 | { |
694 | % reg lo, a, b; |
695 | mpy % r0, a, b |
696 | add.f % r0, lo, % r0 |
697 | % error |
698 | } |
699 | |
700 | |
701 | _Asm _Inline int madd64_hi(int hi, int a, int b) |
702 | { |
703 | % reg hi, a, b; |
704 | mpyh % r0, a, b |
705 | adc % r0, hi, % r0 |
706 | % error |
707 | } |
708 | |
709 | |
710 | |
711 | _Asm _Inline void madd64(int a, int b) |
712 | { |
713 | % reg a, b; |
714 | mulhlw 0, a, b |
715 | maclw 0, a, b |
716 | % error |
717 | } |
718 | |
719 | _Asm _Inline int madd64hi(int hi) |
720 | { |
721 | % reg hi |
722 | //mov %r0, %acc2 |
723 | adc % r0, hi, % acc1 |
724 | % error |
725 | } |
726 | |
727 | _Asm _Inline int madd64lo(int lo) |
728 | { |
729 | % reg lo |
730 | //mov %r0, %acc1 |
731 | add.f % r0, lo, % acc2 |
732 | % error |
733 | } |
734 | |
735 | |
736 | |
737 | #define SET_ZERO(x) x.lo32 = x.hi32 = 0 |
738 | |
739 | #define MADD64(w64, a, b) madd64(a, b); w64.lo32 = madd64lo(w64.lo32); w64.hi32 = madd64hi(w64.hi32); |
740 | |
741 | #define ADD64(x64, y64) x64.lo32 = add64_lo(x64.lo32,y64.lo32); x64.hi32 = add64_hi(x64.hi32,y64.hi32); |
742 | |
743 | #endif /* _ASSEMBLY_H */ |
744 |