/* Source: root/audio_codec/libmad/imdct_l_arm.S (plain)
 * blob:   badec5bcbd17f4a77acc29951cc1b75344da05d3 */
/*****************************************************************************
* Copyright (C) 2000-2001 Andre McCurdy <armccurdy@yahoo.co.uk>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*****************************************************************************
*
* Notes:
*
*
*****************************************************************************
*
* $Id: imdct_l_arm.S,v 1.7 2001/03/25 20:03:34 rob Rel $
*
* 2001/03/24: Andre McCurdy <armccurdy@yahoo.co.uk>
*   - Corrected PIC unsafe loading of address of 'imdct36_long_karray'
*
* 2000/09/20: Robert Leslie <rob@mars.org>
*   - Added a global symbol with leading underscore per suggestion of
*     Simon Burge to support linking with the a.out format.
*
* 2000/09/15: Robert Leslie <rob@mars.org>
*   - Fixed a small bug where flags were changed before a conditional branch.
*
* 2000/09/15: Andre McCurdy <armccurdy@yahoo.co.uk>
*   - Applied Nicolas Pitre's rounding optimisation in all remaining places.
*
* 2000/09/09: Nicolas Pitre <nico@cam.org>
*   - Optimized rounding + scaling operations.
*
* 2000/08/09: Andre McCurdy <armccurdy@yahoo.co.uk>
*   - Original created.
*
****************************************************************************/


/*
   On entry:

      r0 = pointer to 18 element input array
      r1 = pointer to 36 element output array
      r2 = windowing block type


   Stack frame created during execution of the function:

   Initial   Holds:
   Stack
   pointer
   minus:

       0
       4     lr
       8     r11
      12     r10
      16     r9
      20     r8
      24     r7
      28     r6
      32     r5
      36     r4

      40     r2 : windowing block type

      44     ct00 high
      48     ct00 low
      52     ct01 high
      56     ct01 low
      60     ct04 high
      64     ct04 low
      68     ct06 high
      72     ct06 low
      76     ct05 high
      80     ct05 low
      84     ct03 high
      88     ct03 low
      92     -ct05 high
      96     -ct05 low
     100     -ct07 high
     104     -ct07 low
     108     ct07 high
     112     ct07 low
     116     ct02 high
     120     ct02 low
*/

/* Windowing block type, as passed in r2 and tested in the windowing stage. */
#define BLOCK_MODE_NORMAL   0
#define BLOCK_MODE_START    1
#define BLOCK_MODE_STOP     3


/* Byte offsets of the 18 input words; used as [r0, #Xn]. */
#define X0      0x00
#define X1      0x04
#define X2      0x08
#define X3      0x0C
#define X4      0x10
#define X5      0x14
#define X6      0x18
#define X7      0x1c
#define X8      0x20
#define X9      0x24
#define X10     0x28
#define X11     0x2c
#define X12     0x30
#define X13     0x34
#define X14     0x38
#define X15     0x3c
#define X16     0x40
#define X17     0x44

/* Byte offsets of the 36 output words; used as [r1, #xn]. */
#define x0      0x00
#define x1      0x04
#define x2      0x08
#define x3      0x0C
#define x4      0x10
#define x5      0x14
#define x6      0x18
#define x7      0x1c
#define x8      0x20
#define x9      0x24
#define x10     0x28
#define x11     0x2c
#define x12     0x30
#define x13     0x34
#define x14     0x38
#define x15     0x3c
#define x16     0x40
#define x17     0x44
#define x18     0x48
#define x19     0x4c
#define x20     0x50
#define x21     0x54
#define x22     0x58
#define x23     0x5c
#define x24     0x60
#define x25     0x64
#define x26     0x68
#define x27     0x6c
#define x28     0x70
#define x29     0x74
#define x30     0x78
#define x31     0x7c
#define x32     0x80
#define x33     0x84
#define x34     0x88
#define x35     0x8c

/*
 * IMDCT multiplier constants.  Products with these are accumulated in 64 bits
 * and the result is taken from bits[59..28], i.e. the constants carry 28
 * fractional bits.
 */
#define K00     0x0ffc19fd
#define K01     0x00b2aa3e
#define K02     0x0fdcf549
#define K03     0x0216a2a2
#define K04     0x0f9ee890
#define K05     0x03768962
#define K06     0x0f426cb5
#define K07     0x04cfb0e2
#define K08     0x0ec835e8
#define K09     0x061f78aa
#define K10     0x0e313245
#define K11     0x07635284
#define K12     0x0d7e8807
#define K13     0x0898c779
#define K14     0x0cb19346
#define K15     0x09bd7ca0
#define K16     0x0bcbe352
#define K17     0x0acf37ad

/* Two's complement of K02 (0x100000000 - K02), loaded directly as -K02. */
#define minus_K02 0xf0230ab7

/*
 * Long block window coefficients, WLn == window_l[n].  The windowing code also
 * uses WL1, WL4, WL7, WL10, WL13 and WL16 as window_s[0..5].
 */
#define WL0     0x00b2aa3e
#define WL1     0x0216a2a2
#define WL2     0x03768962
#define WL3     0x04cfb0e2
#define WL4     0x061f78aa
#define WL5     0x07635284
#define WL6     0x0898c779
#define WL7     0x09bd7ca0
#define WL8     0x0acf37ad
#define WL9     0x0bcbe352
#define WL10    0x0cb19346
#define WL11    0x0d7e8807
#define WL12    0x0e313245
#define WL13    0x0ec835e8
#define WL14    0x0f426cb5
#define WL15    0x0f9ee890
#define WL16    0x0fdcf549
#define WL17    0x0ffc19fd


@*****************************************************************************
@ III_imdct_l / _III_imdct_l
@
@ In:       r0 = pointer to 18 element input array   (read at offsets X0..X17)
@           r1 = pointer to 36 element output array  (written at offsets x0..x35)
@           r2 = windowing block type (BLOCK_MODE_*)
@ Saves:    r4 - r11 and lr are stacked on entry and restored on return
@ Scratch:  r0 - r3, r12
@
@ Products are accumulated as 64-bit values with smull/smlal.  A result word
@ is bits[59..28] of the accumulator: "movs lo, lo, lsr #28" leaves bit 27 in
@ the carry flag and the following "adc" adds it back in as rounding.
@
@ This first part computes ct00, ct01, ct04, ct06, ct03, ct05, ct07 and ct02
@ (kept on the stack as 64-bit values, low word at the lower address) and
@ stores x22, x4, x7, x1, x25 and x19.  The remaining outputs are produced by
@ the table driven loop and the windowing stage that follow.
@*****************************************************************************


    .text
    .align

    .global III_imdct_l
    .global _III_imdct_l                    @ leading underscore alias (a.out linking)

III_imdct_l:
_III_imdct_l:

    stmdb   sp!, { r2, r4 - r11, lr }       @ all callee saved regs, plus arg3

    ldr     r4, =K08                        @ r4 =  K08
    ldr     r5, =K09                        @ r5 =  K09
    ldr     r8, [r0, #X4]                   @ r8 =  X4
    ldr     r9, [r0, #X13]                  @ r9 =  X13
    rsb     r6, r4, #0                      @ r6 = -K08
    rsb     r7, r5, #0                      @ r7 = -K09

    smull   r2, r3, r4, r8                  @ r2..r3 = (X4 * K08)
    smlal   r2, r3, r5, r9                  @ r2..r3 = (X4 * K08) + (X13 *  K09) = ct01

    smull   r10, lr, r8, r5                 @ r10..lr = (X4 * K09)
    smlal   r10, lr, r9, r6                 @ r10..lr = (X4 * K09) + (X13 * -K08) = ct00

    ldr     r8, [r0, #X7]                   @ r8 = X7
    ldr     r9, [r0, #X16]                  @ r9 = X16

    stmdb   sp!, { r2, r3, r10, lr }        @ stack ct00_h, ct00_l, ct01_h, ct01_l

    add     r8, r8, r9                      @ r8 = (X7 + X16)
    ldr     r9, [r0, #X1]                   @ r9 = X1

    smlal   r2, r3, r6, r8                  @ r2..r3  = ct01 + ((X7 + X16) * -K08)
    smlal   r2, r3, r7, r9                  @ r2..r3 += (X1 * -K09)

    ldr     r7, [r0, #X10]                  @ r7 = X10

    rsbs    r10, r10, #0
    rsc     lr, lr, #0                      @ r10..lr = -ct00

    smlal   r2, r3, r5, r7                  @ r2..r3 += (X10 * K09) = ct06

    smlal   r10, lr, r9, r6                 @ r10..lr  = -ct00 + ( X1 * -K08)
    smlal   r10, lr, r8, r5                 @ r10..lr += ((X7 + X16) * K09)
    smlal   r10, lr, r7, r4                 @ r10..lr += ( X10 * K08) = ct04

    stmdb   sp!, { r2, r3, r10, lr }        @ stack ct04_h, ct04_l, ct06_h, ct06_l

    @----

    ldr     r7, [r0, #X0]
    ldr     r8, [r0, #X11]
    ldr     r9, [r0, #X12]
    sub     r7, r7, r8
    sub     r7, r7, r9                      @ r7 = (X0 - X11 - X12) = ct14

    ldr     r9, [r0, #X3]
    ldr     r8, [r0, #X8]
    ldr     r11, [r0, #X15]
    sub     r8, r8, r9
    add     r8, r8, r11                     @ r8 = (X8 - X3 + X15) = ct16

    add     r11, r7, r8                     @ r11 = ct14 + ct16 = ct18

    smlal   r2, r3, r6, r11                 @ r2..r3 = ct06 + ((X0 - X11 - X3 + X15 + X8 - X12) * -K08)

    ldr     r6, [r0, #X2]
    ldr     r9, [r0, #X9]
    ldr     r12, [r0, #X14]
    sub     r6, r6, r9
    sub     r6, r6, r12                     @ r6 = (X2 - X9 - X14) = ct15

    ldr     r9, [r0, #X5]
    ldr     r12, [r0, #X6]
    sub     r9, r9, r12
    ldr     r12, [r0, #X17]
    sub     r9, r9, r12                     @ r9 = (X5 - X6 - X17) = ct17

    add     r12, r9, r6                     @ r12 = ct15 + ct17 = ct19

    smlal   r2, r3, r5, r12                 @ r2..r3 += ((X2 - X9 + X5 - X6 - X17 - X14) * K09)

    smlal   r10, lr, r11, r5                @ r10..lr = ct04 + (ct18 * K09)
    smlal   r10, lr, r12, r4                @ r10..lr = ct04 + (ct18 * K09) + (ct19 * K08)

    movs    r2, r2, lsr #28
    adc     r2, r2, r3, lsl #4              @ r2 = bits[59..28] of r2..r3
    str     r2, [r1, #x22]                  @ store result x22

    movs    r10, r10, lsr #28
    adc     r10, r10, lr, lsl #4            @ r10 = bits[59..28] of r10..lr
    str     r10, [r1, #x4]                  @ store result x4

    @----

    ldmia   sp, { r2, r3, r4, r5 }          @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)

    @ r2..r3 = ct06
    @ r4..r5 = ct04
    @ r6     = ct15
    @ r7     = ct14
    @ r8     = ct16
    @ r9     = ct17
    @ r10    = .
    @ r11    = .
    @ r12    = .
    @ lr     = .

    ldr     r10, =K03                       @ r10 = K03
    ldr     lr, =K15                        @ lr  = K15

    smlal   r2, r3, r10, r7                 @ r2..r3 = ct06 + (ct14 * K03)
    smlal   r4, r5, lr, r7                  @ r4..r5 = ct04 + (ct14 * K15)

    ldr     r12, =K14                       @ r12 =  K14
    rsb     r10, r10, #0                    @ r10 = -K03

    smlal   r2, r3, lr, r6                  @ r2..r3 += (ct15 *  K15)
    smlal   r4, r5, r10, r6                 @ r4..r5 += (ct15 * -K03)
    smlal   r2, r3, r12, r8                 @ r2..r3 += (ct16 *  K14)

    ldr     r11, =minus_K02                 @ r11 = -K02
    rsb     r12, r12, #0                    @ r12 = -K14

    smlal   r4, r5, r12, r9                 @ r4..r5 += (ct17 * -K14)
    smlal   r2, r3, r11, r9                 @ r2..r3 += (ct17 * -K02)
    smlal   r4, r5, r11, r8                 @ r4..r5 += (ct16 * -K02)

    movs    r2, r2, lsr #28
    adc     r2, r2, r3, lsl #4              @ r2 = bits[59..28] of r2..r3
    str     r2, [r1, #x7]                   @ store result x7

    movs    r4, r4, lsr #28
    adc     r4, r4, r5, lsl #4              @ r4 = bits[59..28] of r4..r5
    str     r4, [r1, #x1]                   @ store result x1

    @----

    ldmia   sp, { r2, r3, r4, r5 }          @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)

    @ r2..r3 = ct06
    @ r4..r5 = ct04
    @ r6     = ct15
    @ r7     = ct14
    @ r8     = ct16
    @ r9     = ct17
    @ r10    = -K03
    @ r11    = -K02
    @ r12    = -K14
    @ lr     =  K15

    rsbs    r2, r2, #0
    rsc     r3, r3, #0                      @ r2..r3 = -ct06

    smlal   r2, r3, r12, r7                 @ r2..r3  = -ct06 + (ct14 * -K14)
    smlal   r2, r3, r10, r8                 @ r2..r3 += (ct16 * -K03)

    smlal   r4, r5, r12, r6                 @ r4..r5  =  ct04 + (ct15 * -K14)
    smlal   r4, r5, r10, r9                 @ r4..r5 += (ct17 * -K03)
    smlal   r4, r5, lr, r8                  @ r4..r5 += (ct16 *  K15)
    smlal   r4, r5, r11, r7                 @ r4..r5 += (ct14 * -K02)

    rsb     lr, lr, #0                      @ lr  = -K15
    rsb     r11, r11, #0                    @ r11 =  K02

    smlal   r2, r3, lr, r9                  @ r2..r3 += (ct17 * -K15)
    smlal   r2, r3, r11, r6                 @ r2..r3 += (ct15 *  K02)

    movs    r4, r4, lsr #28
    adc     r4, r4, r5, lsl #4              @ r4 = bits[59..28] of r4..r5
    str     r4, [r1, #x25]                  @ store result x25

    movs    r2, r2, lsr #28
    adc     r2, r2, r3, lsl #4              @ r2 = bits[59..28] of r2..r3
    str     r2, [r1, #x19]                  @ store result x19

    @----

    ldr     r2, [sp, #16]                   @ r2 = ct01_l
    ldr     r3, [sp, #20]                   @ r3 = ct01_h

    ldr     r6, [r0, #X1]
    ldr     r8, [r0, #X7]
    ldr     r9, [r0, #X10]
    ldr     r7, [r0, #X16]

    rsbs    r2, r2, #0
    rsc     r3, r3, #0                      @ r2..r3 = -ct01

    mov     r4, r2
    mov     r5, r3                          @ r4..r5 = -ct01

    @ r2..r3 = -ct01
    @ r4..r5 = -ct01
    @ r6     =  X1
    @ r7     =  X16
    @ r8     =  X7
    @ r9     =  X10
    @ r10    = -K03
    @ r11    =  K02
    @ r12    = -K14
    @ lr     = -K15

    smlal   r4, r5, r12, r7                 @ r4..r5  = -ct01 + (X16 * -K14)
    smlal   r2, r3, lr, r9                  @ r2..r3  = -ct01 + (X10 * -K15)

    smlal   r4, r5, r10, r8                 @ r4..r5 += (X7  * -K03)
    smlal   r2, r3, r10, r7                 @ r2..r3 += (X16 * -K03)

    smlal   r4, r5, r11, r9                 @ r4..r5 += (X10 *  K02)
    smlal   r2, r3, r12, r8                 @ r2..r3 += (X7  * -K14)

    rsb     lr, lr, #0                      @ lr  =  K15
    rsb     r11, r11, #0                    @ r11 = -K02

    smlal   r4, r5, lr, r6                  @ r4..r5 += (X1 *  K15) = ct05
    smlal   r2, r3, r11, r6                 @ r2..r3 += (X1 * -K02) = ct03

    stmdb   sp!, { r2, r3, r4, r5 }         @ stack ct05_h, ct05_l, ct03_h, ct03_l

    rsbs    r4, r4, #0
    rsc     r5, r5, #0                      @ r4..r5 = -ct05

    stmdb   sp!, { r4, r5 }                 @ stack -ct05_h, -ct05_l

    ldr     r2, [sp, #48]                   @ r2 = ct00_l
    ldr     r3, [sp, #52]                   @ r3 = ct00_h

    rsb     r10, r10, #0                    @ r10 = K03

    rsbs    r4, r2, #0
    rsc     r5, r3, #0                      @ r4..r5 = -ct00

    @ r2..r3 =  ct00
    @ r4..r5 = -ct00
    @ r6     =  X1
    @ r7     =  X16
    @ r8     =  X7
    @ r9     =  X10
    @ r10    =  K03
    @ r11    = -K02
    @ r12    = -K14
    @ lr     =  K15

    smlal   r4, r5, r10, r6                 @ r4..r5  = -ct00 + (X1  * K03)
    smlal   r2, r3, r10, r9                 @ r2..r3  =  ct00 + (X10 * K03)

    smlal   r4, r5, r12, r9                 @ r4..r5 += (X10 * -K14)
    smlal   r2, r3, r12, r6                 @ r2..r3 += (X1  * -K14)

    smlal   r4, r5, r11, r7                 @ r4..r5 += (X16 * -K02)
    smlal   r4, r5, lr, r8                  @ r4..r5 += (X7  *  K15) = ct07

    rsb     lr, lr, #0                      @ lr  = -K15
    rsb     r11, r11, #0                    @ r11 =  K02

    smlal   r2, r3, r11, r8                 @ r2..r3 += (X7  *  K02)
    smlal   r2, r3, lr, r7                  @ r2..r3 += (X16 * -K15) = ct02

    rsbs    r6, r4, #0
    rsc     r7, r5, #0                      @ r6..r7 = -ct07

    stmdb   sp!, { r2 - r7 }                @ stack -ct07_h, -ct07_l, ct07_h, ct07_l, ct02_h, ct02_l


    @----

    @ PC-relative (position independent) address of the coefficient table.
    @ adr lets the assembler work out the pipeline offset instead of the
    @ hand-coded "add r2, pc, #(label-.-8)".

    adr     r2, imdct36_long_karray         @ r2 = base address of Knn array


474loop:
475 ldr r12, [r0, #X0]
476
477 ldmia r2!, { r5 - r11 } @ first 7 words from Karray element
478
479 smull r3, r4, r5, r12 @ sum = (Kxx * X0)
480 ldr r12, [r0, #X2]
481 ldr r5, [r0, #X3]
482 smlal r3, r4, r6, r12 @ sum += (Kxx * X2)
483 ldr r12, [r0, #X5]
484 ldr r6, [r0, #X6]
485 smlal r3, r4, r7, r5 @ sum += (Kxx * X3)
486 smlal r3, r4, r8, r12 @ sum += (Kxx * X5)
487 ldr r12, [r0, #X8]
488 ldr r5, [r0, #X9]
489 smlal r3, r4, r9, r6 @ sum += (Kxx * X6)
490 smlal r3, r4, r10, r12 @ sum += (Kxx * X8)
491 smlal r3, r4, r11, r5 @ sum += (Kxx * X9)
492
493 ldmia r2!, { r5 - r10 } @ final 6 words from Karray element
494
495 ldr r11, [r0, #X11]
496 ldr r12, [r0, #X12]
497 smlal r3, r4, r5, r11 @ sum += (Kxx * X11)
498 ldr r11, [r0, #X14]
499 ldr r5, [r0, #X15]
500 smlal r3, r4, r6, r12 @ sum += (Kxx * X12)
501 smlal r3, r4, r7, r11 @ sum += (Kxx * X14)
502 ldr r11, [r0, #X17]
503 smlal r3, r4, r8, r5 @ sum += (Kxx * X15)
504 smlal r3, r4, r9, r11 @ sum += (Kxx * X17)
505
506 add r5, sp, r10, lsr #16 @ create index back into stack for required ctxx
507
508 ldmia r5, { r6, r7 } @ r6..r7 = ctxx
509
510 mov r8, r10, lsl #16 @ push ctxx index off the top end
511
512 adds r3, r3, r6 @ add low words
513 adc r4, r4, r7 @ add high words, with carry
514 movs r3, r3, lsr #28
515 adc r3, r3, r4, lsl #4 @ r3 = bits[59..28] of r3..r4
516
517 str r3, [r1, r8, lsr #24] @ push completion flag off the bottom end
518
519 movs r8, r8, lsl #8 @ push result location index off the top end
520 beq loop @ loop back if completion flag not set
521 b imdct_l_windowing @ branch to windowing stage if looping finished
522
523imdct36_long_karray:
524
525 .word K17, -K13, K10, -K06, -K05, K01, -K00, K04, -K07, K11, K12, -K16, 0x00000000
526 .word K13, K07, K16, K01, K10, -K05, K04, -K11, K00, -K17, K06, -K12, 0x00200800
527 .word K11, K17, K05, K12, -K01, K06, -K07, K00, -K13, K04, -K16, K10, 0x00200c00
528 .word K07, K00, -K12, K05, -K16, -K10, K11, -K17, K04, K13, K01, K06, 0x00001400
529 .word K05, K10, -K00, -K17, K07, -K13, K12, K06, -K16, K01, -K11, -K04, 0x00181800
530 .word K01, K05, -K07, -K11, K13, K17, -K16, -K12, K10, K06, -K04, -K00, 0x00102000
531 .word -K16, K12, -K11, K07, K04, -K00, -K01, K05, -K06, K10, K13, -K17, 0x00284800
532 .word -K12, K06, K17, -K00, -K11, K04, K05, -K10, K01, K16, -K07, -K13, 0x00085000
533 .word -K10, K16, K04, -K13, -K00, K07, K06, -K01, -K12, -K05, K17, K11, 0x00105400
534 .word -K06, -K01, K13, K04, K17, -K11, -K10, -K16, -K05, K12, K00, K07, 0x00185c00
535 .word -K04, -K11, -K01, K16, K06, K12, K13, -K07, -K17, -K00, -K10, -K05, 0x00006000
536 .word -K00, -K04, -K06, -K10, -K12, -K16, -K17, -K13, -K11, -K07, -K05, -K01, 0x00206801
537
538
539 @----
540 @-------------------------------------------------------------------------
541 @----
542
543imdct_l_windowing:
544
545 ldr r11, [sp, #80] @ fetch function parameter 3 from out of the stack
546 ldmia r1!, { r0, r2 - r9 } @ load 9 words from x0, update pointer
547
548 @ r0 = x0
549 @ r1 = &x[9]
550 @ r2 = x1
551 @ r3 = x2
552 @ r4 = x3
553 @ r5 = x4
554 @ r6 = x5
555 @ r7 = x6
556 @ r8 = x7
557 @ r9 = x8
558 @ r10 = .
559 @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
560 @ r12 = .
561 @ lr = .
562
563 cmp r11, #BLOCK_MODE_STOP @ setup flags
564 rsb r10, r0, #0 @ r10 = -x0 (DONT change flags !!)
565 beq stop_block_x0_to_x17
566
567
568 @ start and normal blocks are treated the same for x[0]..x[17]
569
570normal_block_x0_to_x17:
571
572 ldr r12, =WL9 @ r12 = window_l[9]
573
574 rsb r0, r9, #0 @ r0 = -x8
575 rsb r9, r2, #0 @ r9 = -x1
576 rsb r2, r8, #0 @ r2 = -x7
577 rsb r8, r3, #0 @ r8 = -x2
578 rsb r3, r7, #0 @ r3 = -x6
579 rsb r7, r4, #0 @ r7 = -x3
580 rsb r4, r6, #0 @ r4 = -x5
581 rsb r6, r5, #0 @ r6 = -x4
582
583 @ r0 = -x8
584 @ r1 = &x[9]
585 @ r2 = -x7
586 @ r3 = -x6
587 @ r4 = -x5
588 @ r5 = .
589 @ r6 = -x4
590 @ r7 = -x3
591 @ r8 = -x2
592 @ r9 = -x1
593 @ r10 = -x0
594 @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
595 @ r12 = window_l[9]
596 @ lr = .
597
598 smull r5, lr, r12, r0 @ r5..lr = (window_l[9] * (x[9] == -x[8]))
599 ldr r12, =WL10 @ r12 = window_l[10]
600 movs r5, r5, lsr #28
601 adc r0, r5, lr, lsl #4 @ r0 = bits[59..28] of windowed x9
602
603 smull r5, lr, r12, r2 @ r5..lr = (window_l[10] * (x[10] == -x[7]))
604 ldr r12, =WL11 @ r12 = window_l[11]
605 movs r5, r5, lsr #28
606 adc r2, r5, lr, lsl #4 @ r2 = bits[59..28] of windowed x10
607
608 smull r5, lr, r12, r3 @ r5..lr = (window_l[11] * (x[11] == -x[6]))
609 ldr r12, =WL12 @ r12 = window_l[12]
610 movs r5, r5, lsr #28
611 adc r3, r5, lr, lsl #4 @ r3 = bits[59..28] of windowed x11
612
613 smull r5, lr, r12, r4 @ r5..lr = (window_l[12] * (x[12] == -x[5]))
614 ldr r12, =WL13 @ r12 = window_l[13]
615 movs r5, r5, lsr #28
616 adc r4, r5, lr, lsl #4 @ r4 = bits[59..28] of windowed x12
617
618 smull r5, lr, r12, r6 @ r5..lr = (window_l[13] * (x[13] == -x[4]))
619 ldr r12, =WL14 @ r12 = window_l[14]
620 movs r5, r5, lsr #28
621 adc r6, r5, lr, lsl #4 @ r6 = bits[59..28] of windowed x13
622
623 smull r5, lr, r12, r7 @ r5..lr = (window_l[14] * (x[14] == -x[3]))
624 ldr r12, =WL15 @ r12 = window_l[15]
625 movs r5, r5, lsr #28
626 adc r7, r5, lr, lsl #4 @ r7 = bits[59..28] of windowed x14
627
628 smull r5, lr, r12, r8 @ r5..lr = (window_l[15] * (x[15] == -x[2]))
629 ldr r12, =WL16 @ r12 = window_l[16]
630 movs r5, r5, lsr #28
631 adc r8, r5, lr, lsl #4 @ r8 = bits[59..28] of windowed x15
632
633 smull r5, lr, r12, r9 @ r5..lr = (window_l[16] * (x[16] == -x[1]))
634 ldr r12, =WL17 @ r12 = window_l[17]
635 movs r5, r5, lsr #28
636 adc r9, r5, lr, lsl #4 @ r9 = bits[59..28] of windowed x16
637
638 smull r5, lr, r12, r10 @ r5..lr = (window_l[17] * (x[17] == -x[0]))
639 ldr r12, =WL0 @ r12 = window_l[0]
640 movs r5, r5, lsr #28
641 adc r10, r5, lr, lsl #4 @ r10 = bits[59..28] of windowed x17
642
643
644 stmia r1, { r0, r2 - r4, r6 - r10 } @ store windowed x[9] .. x[17]
645 ldmdb r1!, { r0, r2 - r9 } @ load 9 words downto (and including) x0
646
647
648 smull r10, lr, r12, r0 @ r10..lr = (window_l[0] * x[0])
649 ldr r12, =WL1 @ r12 = window_l[1]
650 movs r10, r10, lsr #28
651 adc r0, r10, lr, lsl #4 @ r0 = bits[59..28] of windowed x0
652
653 smull r10, lr, r12, r2 @ r10..lr = (window_l[1] * x[1])
654 ldr r12, =WL2 @ r12 = window_l[2]
655 movs r10, r10, lsr #28
656 adc r2, r10, lr, lsl #4 @ r2 = bits[59..28] of windowed x1
657
658 smull r10, lr, r12, r3 @ r10..lr = (window_l[2] * x[2])
659 ldr r12, =WL3 @ r12 = window_l[3]
660 movs r10, r10, lsr #28
661 adc r3, r10, lr, lsl #4 @ r3 = bits[59..28] of windowed x2
662
663 smull r10, lr, r12, r4 @ r10..lr = (window_l[3] * x[3])
664 ldr r12, =WL4 @ r12 = window_l[4]
665 movs r10, r10, lsr #28
666 adc r4, r10, lr, lsl #4 @ r4 = bits[59..28] of windowed x3
667
668 smull r10, lr, r12, r5 @ r10..lr = (window_l[4] * x[4])
669 ldr r12, =WL5 @ r12 = window_l[5]
670 movs r10, r10, lsr #28
671 adc r5, r10, lr, lsl #4 @ r5 = bits[59..28] of windowed x4
672
673 smull r10, lr, r12, r6 @ r10..lr = (window_l[5] * x[5])
674 ldr r12, =WL6 @ r12 = window_l[6]
675 movs r10, r10, lsr #28
676 adc r6, r10, lr, lsl #4 @ r6 = bits[59..28] of windowed x5
677
678 smull r10, lr, r12, r7 @ r10..lr = (window_l[6] * x[6])
679 ldr r12, =WL7 @ r12 = window_l[7]
680 movs r10, r10, lsr #28
681 adc r7, r10, lr, lsl #4 @ r7 = bits[59..28] of windowed x6
682
683 smull r10, lr, r12, r8 @ r10..lr = (window_l[7] * x[7])
684 ldr r12, =WL8 @ r12 = window_l[8]
685 movs r10, r10, lsr #28
686 adc r8, r10, lr, lsl #4 @ r8 = bits[59..28] of windowed x7
687
688 smull r10, lr, r12, r9 @ r10..lr = (window_l[8] * x[8])
689 movs r10, r10, lsr #28
690 adc r9, r10, lr, lsl #4 @ r9 = bits[59..28] of windowed x8
691
692 stmia r1, { r0, r2 - r9 } @ store windowed x[0] .. x[8]
693
694 cmp r11, #BLOCK_MODE_START
695 beq start_block_x18_to_x35
696
697
698 @----
699
700
701normal_block_x18_to_x35:
702
703 ldr r11, =WL3 @ r11 = window_l[3]
704 ldr r12, =WL4 @ r12 = window_l[4]
705
706 add r1, r1, #(18*4) @ r1 = &x[18]
707
708 ldmia r1!, { r0, r2 - r4, r6 - r10 } @ load 9 words from x18, update pointer
709
710 @ r0 = x18
711 @ r1 = &x[27]
712 @ r2 = x19
713 @ r3 = x20
714 @ r4 = x21
715 @ r5 = .
716 @ r6 = x22
717 @ r7 = x23
718 @ r8 = x24
719 @ r9 = x25
720 @ r10 = x26
721 @ r11 = window_l[3]
722 @ r12 = window_l[4]
723 @ lr = .
724
725 smull r5, lr, r12, r6 @ r5..lr = (window_l[4] * (x[22] == x[31]))
726 movs r5, r5, lsr #28
727 adc r5, r5, lr, lsl #4 @ r5 = bits[59..28] of windowed x31
728
729 smull r6, lr, r11, r4 @ r5..lr = (window_l[3] * (x[21] == x[32]))
730 ldr r12, =WL5 @ r12 = window_l[5]
731 movs r6, r6, lsr #28
732 adc r6, r6, lr, lsl #4 @ r6 = bits[59..28] of windowed x32
733
734 smull r4, lr, r12, r7 @ r4..lr = (window_l[5] * (x[23] == x[30]))
735 ldr r11, =WL1 @ r11 = window_l[1]
736 ldr r12, =WL2 @ r12 = window_l[2]
737 movs r4, r4, lsr #28
738 adc r4, r4, lr, lsl #4 @ r4 = bits[59..28] of windowed x30
739
740 smull r7, lr, r12, r3 @ r7..lr = (window_l[2] * (x[20] == x[33]))
741 ldr r12, =WL6 @ r12 = window_l[6]
742 movs r7, r7, lsr #28
743 adc r7, r7, lr, lsl #4 @ r7 = bits[59..28] of windowed x33
744
745 smull r3, lr, r12, r8 @ r3..lr = (window_l[6] * (x[24] == x[29]))
746 movs r3, r3, lsr #28
747 adc r3, r3, lr, lsl #4 @ r3 = bits[59..28] of windowed x29
748
749 smull r8, lr, r11, r2 @ r7..lr = (window_l[1] * (x[19] == x[34]))
750 ldr r12, =WL7 @ r12 = window_l[7]
751 ldr r11, =WL8 @ r11 = window_l[8]
752 movs r8, r8, lsr #28
753 adc r8, r8, lr, lsl #4 @ r8 = bits[59..28] of windowed x34
754
755 smull r2, lr, r12, r9 @ r7..lr = (window_l[7] * (x[25] == x[28]))
756 ldr r12, =WL0 @ r12 = window_l[0]
757 movs r2, r2, lsr #28
758 adc r2, r2, lr, lsl #4 @ r2 = bits[59..28] of windowed x28
759
760 smull r9, lr, r12, r0 @ r3..lr = (window_l[0] * (x[18] == x[35]))
761 movs r9, r9, lsr #28
762 adc r9, r9, lr, lsl #4 @ r9 = bits[59..28] of windowed x35
763
764 smull r0, lr, r11, r10 @ r7..lr = (window_l[8] * (x[26] == x[27]))
765 ldr r11, =WL16 @ r11 = window_l[16]
766 ldr r12, =WL17 @ r12 = window_l[17]
767 movs r0, r0, lsr #28
768 adc r0, r0, lr, lsl #4 @ r0 = bits[59..28] of windowed x27
769
770
771 stmia r1, { r0, r2 - r9 } @ store windowed x[27] .. x[35]
772 ldmdb r1!, { r0, r2 - r9 } @ load 9 words downto (and including) x18
773
774
775 smull r10, lr, r12, r0 @ r10..lr = (window_l[17] * x[18])
776 movs r10, r10, lsr #28
777 adc r0, r10, lr, lsl #4 @ r0 = bits[59..28] of windowed x0
778
779 smull r10, lr, r11, r2 @ r10..lr = (window_l[16] * x[19])
780 ldr r11, =WL14 @ r11 = window_l[14]
781 ldr r12, =WL15 @ r12 = window_l[15]
782 movs r10, r10, lsr #28
783 adc r2, r10, lr, lsl #4 @ r2 = bits[59..28] of windowed x1
784
785 smull r10, lr, r12, r3 @ r10..lr = (window_l[15] * x[20])
786 movs r10, r10, lsr #28
787 adc r3, r10, lr, lsl #4 @ r3 = bits[59..28] of windowed x2
788
789 smull r10, lr, r11, r4 @ r10..lr = (window_l[14] * x[21])
790 ldr r11, =WL12 @ r11 = window_l[12]
791 ldr r12, =WL13 @ r12 = window_l[13]
792 movs r10, r10, lsr #28
793 adc r4, r10, lr, lsl #4 @ r4 = bits[59..28] of windowed x3
794
795 smull r10, lr, r12, r5 @ r10..lr = (window_l[13] * x[22])
796 movs r10, r10, lsr #28
797 adc r5, r10, lr, lsl #4 @ r5 = bits[59..28] of windowed x4
798
799 smull r10, lr, r11, r6 @ r10..lr = (window_l[12] * x[23])
800 ldr r11, =WL10 @ r12 = window_l[10]
801 ldr r12, =WL11 @ r12 = window_l[11]
802 movs r10, r10, lsr #28
803 adc r6, r10, lr, lsl #4 @ r6 = bits[59..28] of windowed x5
804
805 smull r10, lr, r12, r7 @ r10..lr = (window_l[11] * x[24])
806 movs r10, r10, lsr #28
807 adc r7, r10, lr, lsl #4 @ r7 = bits[59..28] of windowed x6
808
809 smull r10, lr, r11, r8 @ r10..lr = (window_l[10] * x[25])
810 ldr r12, =WL9 @ r12 = window_l[9]
811 movs r10, r10, lsr #28
812 adc r8, r10, lr, lsl #4 @ r8 = bits[59..28] of windowed x7
813
814 smull r10, lr, r12, r9 @ r10..lr = (window_l[9] * x[26])
815
816 movs r10, r10, lsr #28
817 adc r9, r10, lr, lsl #4 @ r9 = bits[59..28] of windowed x8
818
819 stmia r1, { r0, r2 - r9 } @ store windowed x[18] .. x[26]
820
821 @----
822 @ NB there are 2 possible exits from this function - this is only one of them
823 @----
824
825 add sp, sp, #(21*4) @ return stack frame
826 ldmia sp!, { r4 - r11, pc } @ restore callee saved regs, and return
827
828 @----
829
830
831stop_block_x0_to_x17:
832
833 @ r0 = x0
834 @ r1 = &x[9]
835 @ r2 = x1
836 @ r3 = x2
837 @ r4 = x3
838 @ r5 = x4
839 @ r6 = x5
840 @ r7 = x6
841 @ r8 = x7
842 @ r9 = x8
843 @ r10 = -x0
844 @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
845 @ r12 = .
846 @ lr = .
847
848 rsb r0, r6, #0 @ r0 = -x5
849 rsb r6, r2, #0 @ r6 = -x1
850 rsb r2, r5, #0 @ r2 = -x4
851 rsb r5, r3, #0 @ r5 = -x2
852 rsb r3, r4, #0 @ r3 = -x3
853
854 add r1, r1, #(3*4) @ r1 = &x[12]
855 stmia r1, { r0, r2, r3, r5, r6, r10 } @ store unchanged x[12] .. x[17]
856
857 ldr r0, =WL1 @ r0 = window_l[1] == window_s[0]
858
859 rsb r10, r9, #0 @ r10 = -x8
860 rsb r12, r8, #0 @ r12 = -x7
861 rsb lr, r7, #0 @ lr = -x6
862
863 @ r0 = WL1
864 @ r1 = &x[12]
865 @ r2 = .
866 @ r3 = .
867 @ r4 = .
868 @ r5 = .
869 @ r6 = .
870 @ r7 = x6
871 @ r8 = x7
872 @ r9 = x8
873 @ r10 = -x8
874 @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
875 @ r12 = -x7
876 @ lr = -x6
877
878 smull r5, r6, r0, r7 @ r5..r6 = (window_l[1] * x[6])
879 ldr r2, =WL4 @ r2 = window_l[4] == window_s[1]
880 movs r5, r5, lsr #28
881 adc r7, r5, r6, lsl #4 @ r7 = bits[59..28] of windowed x6
882
883 smull r5, r6, r2, r8 @ r5..r6 = (window_l[4] * x[7])
884 ldr r3, =WL7 @ r3 = window_l[7] == window_s[2]
885 movs r5, r5, lsr #28
886 adc r8, r5, r6, lsl #4 @ r8 = bits[59..28] of windowed x7
887
888 smull r5, r6, r3, r9 @ r5..r6 = (window_l[7] * x[8])
889 ldr r4, =WL10 @ r4 = window_l[10] == window_s[3]
890 movs r5, r5, lsr #28
891 adc r9, r5, r6, lsl #4 @ r9 = bits[59..28] of windowed x8
892
893 smull r5, r6, r4, r10 @ r5..r6 = (window_l[10] * (x[9] == -x[8]))
894 ldr r0, =WL13 @ r0 = window_l[13] == window_s[4]
895 movs r5, r5, lsr #28
896 adc r10, r5, r6, lsl #4 @ r10 = bits[59..28] of windowed x9
897
898 smull r5, r6, r0, r12 @ r5..r6 = (window_l[13] * (x[10] == -x[7]))
899 ldr r2, =WL16 @ r2 = window_l[16] == window_s[5]
900 movs r5, r5, lsr #28
901 adc r12, r5, r6, lsl #4 @ r10 = bits[59..28] of windowed x9
902
903 smull r5, r6, r2, lr @ r5..r6 = (window_l[16] * (x[11] == -x[6]))
904
905 ldr r0, =0x00
906
907 movs r5, r5, lsr #28
908 adc lr, r5, r6, lsl #4 @ r10 = bits[59..28] of windowed x9
909
910 stmdb r1!, { r7 - r10, r12, lr } @ store windowed x[6] .. x[11]
911
912 ldr r5, =0x00
913 ldr r6, =0x00
914 ldr r2, =0x00
915 ldr r3, =0x00
916 ldr r4, =0x00
917
918 stmdb r1!, { r0, r2 - r6 } @ store windowed x[0] .. x[5]
919
920 b normal_block_x18_to_x35
921
922
923 @----
924
925
926start_block_x18_to_x35:
927
928 ldr r4, =WL1 @ r0 = window_l[1] == window_s[0]
929
930 add r1, r1, #(24*4) @ r1 = &x[24]
931
932 ldmia r1, { r0, r2, r3 } @ load 3 words from x24, dont update pointer
933
934 @ r0 = x24
935 @ r1 = &x[24]
936 @ r2 = x25
937 @ r3 = x26
938 @ r4 = WL1
939 @ r5 = WL4
940 @ r6 = WL7
941 @ r7 = WL10
942 @ r8 = WL13
943 @ r9 = WL16
944 @ r10 = .
945 @ r11 = .
946 @ r12 = .
947 @ lr = .
948
949 ldr r5, =WL4 @ r5 = window_l[4] == window_s[1]
950
951 smull r10, r11, r4, r0 @ r10..r11 = (window_l[1] * (x[24] == x[29]))
952 ldr r6, =WL7 @ r6 = window_l[7] == window_s[2]
953 movs r10, r10, lsr #28
954 adc lr, r10, r11, lsl #4 @ lr = bits[59..28] of windowed x29
955
956 smull r10, r11, r5, r2 @ r10..r11 = (window_l[4] * (x[25] == x[28]))
957 ldr r7, =WL10 @ r7 = window_l[10] == window_s[3]
958 movs r10, r10, lsr #28
959 adc r12, r10, r11, lsl #4 @ r12 = bits[59..28] of windowed x28
960
961 smull r10, r11, r6, r3 @ r10..r11 = (window_l[7] * (x[26] == x[27]))
962 ldr r8, =WL13 @ r8 = window_l[13] == window_s[4]
963 movs r10, r10, lsr #28
964 adc r4, r10, r11, lsl #4 @ r4 = bits[59..28] of windowed x27
965
966 smull r10, r11, r7, r3 @ r10..r11 = (window_l[10] * x[26])
967 ldr r9, =WL16 @ r9 = window_l[16] == window_s[5]
968 movs r10, r10, lsr #28
969 adc r3, r10, r11, lsl #4 @ r3 = bits[59..28] of windowed x26
970
971 smull r10, r11, r8, r2 @ r10..r11 = (window_l[13] * x[25])
972 ldr r5, =0x00
973 movs r10, r10, lsr #28
974 adc r2, r10, r11, lsl #4 @ r2 = bits[59..28] of windowed x25
975
976 smull r10, r11, r9, r0 @ r10..r11 = (window_l[16] * x[24])
977 ldr r6, =0x00
978 movs r10, r10, lsr #28
979 adc r0, r10, r11, lsl #4 @ r0 = bits[59..28] of windowed x24
980
981 stmia r1!, { r0, r2, r3, r4, r12, lr } @ store windowed x[24] .. x[29]
982
983 ldr r7, =0x00
984 ldr r8, =0x00
985 ldr r9, =0x00
986 ldr r10, =0x00
987
988 stmia r1!, { r5 - r10 } @ store windowed x[30] .. x[35]
989
990 @----
991 @ NB there are 2 possible exits from this function - this is only one of them
992 @----
993
994 add sp, sp, #(21*4) @ return stack frame
995 ldmia sp!, { r4 - r11, pc } @ restore callee saved regs, and return
996
997 @----
998 @END
999 @----
1000
1001