/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

/* Size of the stack frame we create when we need non-volatile GPRs. */
#define STACKFRAMESIZE 256
/* Save slot for non-volatile register rN (N >= 14) inside our frame:
 * r14 lands at offset 112 and each following register 8 bytes higher. */
#define STK_REG(i) (112 + ((i)-14)*8)

/*
 * memcpy optimised for POWER7.
 *
 * In:	r3 = destination (also the return value, per memcpy semantics)
 *	r4 = source
 *	r5 = length in bytes
 *
 * Copies < 16B go straight to the byte/halfword/word tail
 * (.Lshort_copy).  Mid-sized copies take the integer path
 * (.Lnonvmx_copy).  With CONFIG_ALTIVEC, copies > 4096B try the VMX
 * path, falling back to the integer path if enter_vmx_copy() (defined
 * elsewhere) returns 0.
 */
_GLOBAL(memcpy_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)		/* stash dest in caller's frame; reloaded for return */

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,48(r1)		/* stash dest in caller's frame; reloaded for return */

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6			/* low 4 bits of r6 -> CR7, one bit per 1/2/4/8B step */
	clrldi	r6,r6,(64-3)		/* r6 = number of bytes to reach 8B alignment */

	bf	cr7*4+3,1f		/* need a 1-byte copy? */
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f		/* need a 2-byte copy? */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f		/* need a 4-byte copy? */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6		/* account for the alignment bytes */
	cmpldi	r5,128
	blt	5f			/* less than one cacheline left: skip main loop */

	/* Main loop uses non-volatile r14-r21, so build a frame and
	 * save r14-r22 plus the (unmodified) link register. */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)
	std	r17,STK_REG(r17)(r1)
	std	r18,STK_REG(r18)(r1)
	std	r19,STK_REG(r19)(r1)
	std	r20,STK_REG(r20)(r1)
	std	r21,STK_REG(r21)(r1)
	std	r22,STK_REG(r22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)	/* LR save slot in caller's frame */

	srdi	r6,r5,7			/* r6 = number of full 128B cachelines */
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)		/* r5 = remaining bytes (mod 128) */

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r22,STK_REG(r22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6			/* CR7 bits select 64/32/16B chunks below */

6:	bf	cr7*4+1,7f		/* 64B chunk? */
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f		/* 32B chunk? */
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f		/* 16B chunk? */
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)		/* r5 = remaining bytes (mod 16) */

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5			/* CR7 bits select 8/4/2/1B tail copies */
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,48(r1)		/* return the original destination */
	blr

/* Entered from .Lvmx_copy when enter_vmx_copy() returned 0: drop the
 * frame we built there and fall back to the integer copy. */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r4,56(r1)		/* src/len saved across enter_vmx_copy() */
	std	r5,64(r1)
	std	r0,16(r1)		/* LR save slot */
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_copy
	cmpwi	r3,0			/* cr0 tested by the beq further down */
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7			/* stream 0 start = src rounded to cacheline */
	clrrdi	r9,r3,7			/* stream 1 start = dest rounded to cacheline */
	ori	r9,r9,1			/* stream=1 */

	srdi	r7,r5,7			/* length in cachelines, capped at 0x3FF */
	cmpldi	cr1,r7,0x3FF
	ble	cr1,1f
	li	r7,0x3FF
1:	lis	r0,0x0E00		/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0		/* r7 = stream description word (units + depth) */
	ori	r10,r7,1		/* stream=1 */

	lis	r8,0x8000		/* GO=1 */
	clrldi	r8,r8,32

	/* .machine "power4" lets the assembler accept the dcbt/dcbtst
	 * TH-field operand used for enhanced data stream touches. */
.machine push
.machine "power4"
	dcbt	r0,r6,0b01000		/* describe load stream start */
	dcbt	r0,r7,0b01010		/* configure load stream */
	dcbtst	r0,r9,0b01000		/* describe store stream start */
	dcbtst	r0,r10,0b01010		/* configure store stream */
	eieio
	dcbt	r0,r8,0b01010		/* GO */
.machine pop

	beq	.Lunwind_stack_nonvmx_copy	/* enter_vmx_copy() failed */

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)		/* low 4 bits differ => not relatively 16B aligned */
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)		/* r6 = bytes to reach 16B alignment */

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7			/* CR7 selects 16/32/64B steps below */
	clrldi	r6,r6,(64-7)		/* r6 = bytes to reach 128B alignment */

	li	r9,16			/* index registers for lvx/stvx (r0 as RA means 0) */
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* r6 = full 128B cachelines remaining */

	std	r14,STK_REG(r14)(r1)	/* need three more index registers */
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,r0,r4
	lvx	vr6,r4,r9
	lvx	vr5,r4,r10
	lvx	vr4,r4,r11
	lvx	vr3,r4,r12
	lvx	vr2,r4,r14
	lvx	vr1,r4,r15
	lvx	vr0,r4,r16
	addi	r4,r4,128
	stvx	vr7,r0,r3
	stvx	vr6,r3,r9
	stvx	vr5,r3,r10
	stvx	vr4,r3,r11
	stvx	vr3,r3,r12
	stvx	vr2,r3,r14
	stvx	vr1,r3,r15
	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f		/* 64B chunk? */
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f		/* 32B chunk? */
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f		/* 16B chunk? */
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,48(r1)		/* return dest; exit_vmx_copy preserves r3 - TODO confirm */
	b	.exit_vmx_copy		/* tail call optimise */

/* Source and destination differ in their low 4 address bits: merge
 * pairs of 16B loads with vperm (control vector from lvsl) so stores
 * are still aligned.  vr0 always carries the previous load across
 * iterations. */
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	lvsl	vr16,0,r4	/* Setup permute control vector */
	lvx	vr0,0,r4	/* prime the pipeline with the first 16B */
	addi	r4,r4,16	/* bias unwound at 11: below */

	bf	cr7*4+3,5f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1	/* carry last load into next stage */

5:	bf	cr7*4+2,6f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	stvx	vr12,r3,r12
	stvx	vr13,r3,r14
	stvx	vr14,r3,r15
	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,48(r1)		/* return dest; exit_vmx_copy preserves r3 - TODO confirm */
	b	.exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */