/*
 * "memcpy" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2002  STMicroelectronics Ltd
 *   Modified from memcpy.S and micro-optimised for SH4
 *   Stuart Menefy (stuart.menefy@st.com)
 *
 */
#include <linux/linkage.h>

/*
 * void *memcpy(void *dst, const void *src, size_t n);
 *
 * It is assumed that there is no overlap between src and dst.
 * If there is an overlap, then the results are undefined.
 *
 * Register usage, as set up by the entry code at ENTRY(memcpy):
 *
 *   r4  dst on entry.  Never written; used as the stop address.
 *   r5  src on entry.  Turned into (src - dst) plus a small per-path
 *       bias, so that @(r0,r5) addresses the source element that
 *       belongs to the destination position in r0.
 *   r6  n on entry.
 *   r0  running destination pointer.  Starts at dst + n and is
 *       pre-decremented by the "mov.x Rn,@-r0" stores, so the copy runs
 *       from the end of the buffers down towards the start and every
 *       exit path leaves r0 == r4 (dst).
 *
 * The cache-line loops temporarily move the destination pointer to r1
 * and the absolute source pointer to r5 (r0 is needed for movca.l),
 * and use r8-r11 (plus r12 in .Lcase2b); those registers are pushed on
 * the stack before the loop and popped before returning.
 *
 * bt/s, bf/s, bra and rts are delayed branches: the instruction that
 * follows them executes before the branch takes effect.  Such delay
 * slot instructions are marked with one extra leading space.
 *
 * The trailing "! nn XX" annotations are kept from the original code.
 * NOTE(review): they look like the SH-4 manual instruction number and
 * issue group (MT/EX/LS/BR), with blank lines separating instruction
 * pairs -- confirm against the SH-4 manual before relying on them.
 */

	!
	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase1:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-1,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll16	r3		! 103 EX

	mov	r1,r6		!   5 MT (latency=0)
	shll8	r3		! 102 EX		! Oxxx

	shlr8	r6		! 106 EX		! xNML
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! ONML
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
	mov	r7,r3		!   5 MT (latency=0)	! OPQR

	cmp/hi	r2,r0		!  57 MT
	shlr16	r3		! 107 EX

	shlr8	r3		! 106 EX		! xxxO
	mov	r1,r6		!   5 MT (latency=0)

	shll8	r6		! 102 EX		! LMNx
	mov	r1,r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! LMNO
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#endif
	! Finally, copy a byte at once, if necessary

	add	#4,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop


	!
	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase3:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-3,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll8	r3		! 102 EX		! QPOx

	mov	r1,r6		!   5 MT (latency=0)
	shlr16	r6		! 107 EX

	shlr8	r6		! 106 EX		! xxxN
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! QPON
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov	r7,r3		! OPQR
	shlr8	r3		! xOPQ
	mov.l	@(r0,r5),r7	! KLMN
	mov	r7,r6
	shll16	r6
	shll8	r6		! Nxxx
	or	r6,r3		! NOPQ
	cmp/hi	r2,r0
	bt/s	3b
	 mov.l	r3,@-r0
#endif

	! Finally, copy a byte at once, if necessary

	add	#6,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop

ENTRY(memcpy)

	! Calculate the invariants which will be used in the remainder
	! of the code:
	!
	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
	!	         [ ...  ]                 [ ...  ]
	!	           :                        :
	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
	!
	!

	! Short circuit the common case of src, dst and len being 32 bit aligned
	! and test for zero length move

	mov	r6, r0		!   5 MT (0 cycle latency)
	or	r4, r0		!  82 EX

	or	r5, r0		!  82 EX
	tst	r6, r6		!  86 MT

	bt/s	99f		! 111 BR		(zero len)
	 tst	#3, r0		!  87 MT

	mov	r4, r0		!   5 MT (0 cycle latency)
	add	r6, r0		!  49 EX

	mov	#16, r1		!   6 EX
	bt/s	.Lcase00	! 111 BR		(aligned)

	 sub	r4, r5		!  75 EX

	! Arguments are not nicely long word aligned or zero len.
	! Check for small copies, and if so do a simple byte at a time copy.
	!
	! Deciding on an exact value of 'small' is not easy, as the point at which
	! using the optimised routines become worthwhile varies (these are the
	! cycle counts for different sizes using byte-at-a-time vs. optimised):
	!	size	byte-at-time	long	word	byte
	!	16	42		39-40	46-50	50-55
	!	24	58		43-44	54-58	62-67
	!	36	82		49-50	66-70	80-85
	! However the penalty for getting it 'wrong' is much higher for long word
	! aligned data (and this is more common), so use a value of 16.

	cmp/gt	r6,r1		!  56 MT

	add	#-1,r5		!  50 EX
	bf/s	6f		! 108 BR		(not small)

	 mov	r5, r3		!   5 MT (latency=0)
	shlr	r6		! 104 EX

	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
	bf/s	4f		! 111 BR

	 add	#-1,r3		!  50 EX
	tst	r6, r6		!  86 MT

	bt/s	98f		! 110 BR
	 mov.b	r1,@-r0		!  29 LS

	! 4 cycles, 2 bytes per iteration
3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
	dt	r6		!  67 EX

	mov.b	r1,@-r0		!  29 LS
	bf/s	3b		! 111 BR

	 mov.b	r2,@-r0		!  29 LS
98:
	rts
	 nop

	! Zero length: nothing to copy, return dst in r0
99:	rts
	 mov	r4, r0

	! Size is not small, so it's worthwhile looking for optimisations.
	! First align destination to a long word boundary.
	!
	! r5 = normal value -1

6:	tst	#3, r0		!  87 MT
	mov	#3, r3		!   6 EX

	bt/s	2f		! 111 BR
	 and	r0,r3		!  78 EX

	! 3 cycles, 1 byte per iteration
1:	dt	r3		!  67 EX
	mov.b	@(r0,r5),r1	!  19 LS (latency=2)

	add	#-1, r6		!  79 EX
	bf/s	1b		! 109 BR

	 mov.b	r1,@-r0		!  28 LS

2:	add	#1, r5		!  79 EX

	! Now select the appropriate bulk transfer code based on relative
	! alignment of src and dst.

	mov	r0, r3		!   5 MT (latency=0)

	mov	r5, r0		!   5 MT (latency=0)
	tst	#1, r0		!  87 MT

	bf/s	1f		! 111 BR
	 mov	#64, r7		!   6 EX

	! bit 0 clear

	cmp/ge	r7, r6		!  55 MT

	bt/s	2f		! 111 BR
	 tst	#2, r0		!  87 MT

	! small
	bt/s	.Lcase0
	 mov	r3, r0

	bra	.Lcase2
	 nop

	! big
2:	bt/s	.Lcase0b
	 mov	r3, r0

	bra	.Lcase2b
	 nop

	! bit 0 set
1:	tst	#2, r0		!  87 MT

	bt/s	.Lcase1
	 mov	r3, r0

	bra	.Lcase3
	 nop


	!
	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
	!

	! src, dst and size are all long word aligned
	! size is non-zero

	.balign	32
.Lcase00:
	mov	#64, r1		!   6 EX
	mov	r5, r3		!   5 MT (latency=0)

	cmp/gt	r6, r1		!  56 MT
	add	#-4, r5		!  50 EX

	bf	.Lcase00b	! 108 BR		(big loop)
	shlr2	r6		! 105 EX

	shlr	r6		! 104 EX
	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

	bf/s	4f		! 111 BR
	 add	#-8, r3		!  50 EX

	tst	r6, r6		!  86 MT
	bt/s	5f		! 110 BR

	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	dt	r6		!  67 EX

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

5:	rts
	 nop


	! Size is 16 or greater and less than 64, but may have trailing bytes

	.balign	32
.Lcase0:
	add	#-4, r5		!  50 EX
	mov	r4, r7		!   5 MT (latency=0)

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	mov	#4, r2		!   6 EX

	add	#11, r7		!  50 EX
	tst	r2, r6		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX
	mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r1, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

	add	#3,r5		!  50 EX

	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	.balign	32
.Lcase0b:
	add	#-4, r5		!  50 EX

.Lcase00b:
	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX

	! copy initial words until cache line aligned

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	tst	#4, r0		!  87 MT

	mov	r5, r6		!   5 MT (latency=0)
	add	#-4, r6		!  50 EX

	bt/s	4f		! 111 BR
	 add	#8, r3		!  50 EX

	tst	#0x18, r0	!  87 MT

	bt/s	1f		! 109 BR
	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
	cmp/eq	r3, r0		!  54 MT

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r7, @-r0	!  30 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1c, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	! 16 cycles, 32 bytes per iteration
2:	mov.l	@(0x00,r5),r0	!  18 LS (latency=2)
	add	#-0x20, r1	!  50 EX
	mov.l	@(0x04,r5),r3	!  18 LS (latency=2)
	mov.l	@(0x08,r5),r6	!  18 LS (latency=2)
	mov.l	@(0x0c,r5),r7	!  18 LS (latency=2)
	mov.l	@(0x10,r5),r8	!  18 LS (latency=2)
	mov.l	@(0x14,r5),r9	!  18 LS (latency=2)
	mov.l	@(0x18,r5),r10	!  18 LS (latency=2)
	mov.l	@(0x1c,r5),r11	!  18 LS (latency=2)
	movca.l	r0,@r1		!  40 LS (latency=3-7)
	mov.l	r3,@(0x04,r1)	!  33 LS
	mov.l	r6,@(0x08,r1)	!  33 LS
	mov.l	r7,@(0x0c,r1)	!  33 LS

	mov.l	r8,@(0x10,r1)	!  33 LS
	add	#-0x20, r5	!  50 EX

	mov.l	r9,@(0x14,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10,@(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11,@(0x1c,r1)	!  33 LS

	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	 mov.l	@r15+, r9	!  15 LS

	! The pop of r8 below is both the delay slot of this rts and the
	! first instruction of the "trailing bytes" path at label 1.
	rts
1:	 mov.l	@r15+, r8	!  15 LS
	sub	r4, r1		!  75 EX		(len remaining)

	! number of trailing bytes is non-zero
	!
	! invariants restored (r5 already decremented by 4)
	! also r1=num bytes remaining

	mov	#4, r2		!   6 EX
	mov	r4, r7		!   5 MT (latency=0)

	add	#0x1c, r5	!  50 EX		(back to -4)
	cmp/hs	r2, r1		!  58 MT

	bf/s	5f		! 108 BR
	 add	#11, r7		!  50 EX

	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
	tst	r2, r1		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX
	cmp/hs	r2, r1		!  58 MT

	bt/s	5f		! 111 BR
	 mov.l	r6,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r6, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

5:	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR
	add	#3,r5		!  50 EX

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

	!
	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
	!

	.balign	32
.Lcase2:
	! Size is 16 or greater and less than 64, but may have trailing bytes

2:	mov	r5, r6		!   5 MT (latency=0)
	add	#-2,r5		!  50 EX

	mov	r4,r2		!   5 MT (latency=0)
	add	#-4,r6		!  50 EX

	add	#7,r2		!  50 EX
3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)

	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
	cmp/hi	r2,r0		!  57 MT

	mov.w	r1,@-r0		!  29 LS
	bt/s	3b		! 111 BR

	 mov.w	r3,@-r0		!  29 LS

	bra	10f
	 nop


	.balign	32
.Lcase2b:
	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT
	add	#0x1f, r2	!  50 EX

	add	#-2, r5		!  50 EX
	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX

	! Copy a short word one at a time until we are cache line aligned
	!   Normal values: r0, r2, r3, r4
	!   Unused: r1, r6, r7
	!   Mod: r5 (=r5-2)
	!
	add	#2, r3		!  50 EX

2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
	cmp/eq	r3,r0		!  54 MT

	bf/s	2b		! 111 BR

	 mov.w	r1,@-r0		!  29 LS

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5 (=r5-2)
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r11, @-r15	!  30 LS

	mov.l	r12, @-r15	!  30 LS

	! 17 cycles, 32 bytes per iteration
#ifdef CONFIG_CPU_LITTLE_ENDIAN
2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
	add	#-0x20, r1	!  50 EX

	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK

	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
	shll16	r0		! 103 EX			JI..

	mov.l	@r5+, r7	!  15 LS (latency=2)
	xtrct	r3, r0		!  48 EX			LKJI

	mov.l	@r5+, r8	!  15 LS (latency=2)
	xtrct	r6, r3		!  48 EX			PONM

	mov.l	@r5+, r9	!  15 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@r5+, r10	!  15 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@r5+, r11	!  15 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.w	@r5+, r12	!  15 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	xtrct	r11, r10	!  48 EX

	mov.l	r3, @(0x04,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r6, @(0x08,r1)	!  33 LS

	mov.l	r7, @(0x0c,r1)	!  33 LS

	mov.l	r8, @(0x10,r1)	!  33 LS
	add	#-0x40, r5	!  50 EX

	mov.l	r9, @(0x14,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x1c,r1)	!  33 LS
#else
2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
	add	#-2, r5		!  50 EX

	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
	add	#-4, r1		!  50 EX

	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
	shll16	r0		! 103 EX

	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
	xtrct	r3, r0		!  48 EX

	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
	xtrct	r6, r3		!  48 EX

	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.l	@(0x00,r5), r12	!  18 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	add	#-0x1c, r1	!  50 EX

	mov.l	r3, @(0x18,r1)	!  33 LS
	xtrct	r11, r10	!  48 EX

	mov.l	r6, @(0x14,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r7, @(0x10,r1)	!  33 LS

	mov.l	r8, @(0x0c,r1)	!  33 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r9, @(0x08,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x04,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x00,r1)	!  33 LS
#endif

	mov.l	@r15+, r12
	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT

	bf/s	1f		! 109 BR
	 mov.l	@r15+, r9	!  15 LS

	! The pop of r8 below is both the delay slot of this rts and the
	! first instruction of the "trailing data" path at label 1.
	rts
1:	 mov.l	@r15+, r8	!  15 LS

	add	#0x1e, r5	!  50 EX

	! Finish off a short word at a time
	! r5 must be invariant - 2
10:	mov	r4,r2		!   5 MT (latency=0)
	add	#1,r2		!  50 EX

	cmp/hi	r2, r0		!  57 MT
	bf/s	1f		! 109 BR

	 add	#2, r2		!  50 EX

3:	mov.w	@(r0,r5),r1	!  20 LS
	cmp/hi	r2,r0		!  57 MT

	bt/s	3b		! 109 BR

	 mov.w	r1,@-r0		!  29 LS
1:

	!
	! Finally, copy the last byte if necessary
	cmp/eq	r4,r0		!  54 MT
	bt/s	9b
	 add	#1,r5
	mov.b	@(r0,r5),r1
	rts
	 mov.b	r1,@-r0