/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

/*
 * memcpy_power7 - copy r5 bytes from the buffer at r4 to the buffer at r3.
 *
 * In:	r3 = destination address
 *	r4 = source address
 *	r5 = number of bytes to copy
 * Out:	r3 = the destination address that was passed in.  It is parked in
 *	     the STK_REG(R31) slot on entry and reloaded on every exit path.
 *
 * Strategy:
 *   - fewer than 16 bytes: .Lshort_copy moves 8/4/2/1 byte pieces selected
 *     by the low four bits of the length.
 *   - more than 4096 bytes (CONFIG_ALTIVEC only): .Lvmx_copy calls
 *     enter_vmx_copy.  A non-zero return selects the VMX loops, a zero
 *     return falls back to the integer path.
 *   - everything else: .Lnonvmx_copy aligns the source to 8 bytes and then
 *     copies 128 bytes per loop iteration with integer loads and stores.
 *
 * Registers written directly by this code: r0, r6-r12, ctr, cr0, cr1, cr7
 * and, on the VMX paths, v0-v16.  r14-r21 (integer loop) and r14-r16 (VMX
 * loops) are saved in the local stack frame and restored before returning.
 *
 * Bit-test convention used throughout: "mtocrf 0x01,rX" copies the low
 * four bits of rX into cr7, so cr7*4+3 tests the bit worth 1, cr7*4+2 the
 * bit worth 2, cr7*4+1 the bit worth 4 and cr7*4+0 the bit worth 8.
 *
 * The numeric local labels (1:, 2:, ...) are reused in each section; a
 * "Nf"/"Nb" reference always binds to the nearest definition.
 */
_GLOBAL(memcpy_power7)

/*
 * Endian-dependent helpers for the unaligned VMX path: little-endian
 * builds take the permute control from lvsr instead of lvsl and swap the
 * two vperm source operands.
 */
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	/*
	 * Park the return value below the current stack pointer; this is
	 * the STK_REG(R31) slot of the frame that "stdu r1,-STACKFRAMESIZE(r1)"
	 * creates later on.
	 */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	/* Park the return value; see the CONFIG_ALTIVEC branch above. */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)		/* r6 = bytes needed to align (0-7) */

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f			/* no full 128B block: skip the loop */

	/*
	 * The loop below needs r14-r21 as extra data registers, so create
	 * a frame and save exactly those.  LR is also recorded at offset 16
	 * of the caller's frame (the LR save doubleword of the 64-bit ELF
	 * ABI).  Nothing on this path modifies LR, so it is never reloaded.
	 * NOTE(review): presumably kept for the benefit of stack walkers.
	 */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7			/* number of 128B blocks, >= 1 here */
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)		/* r5 = bytes left (0-127) */

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4			/* cr7 <- which of 64/32/16B remain */
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/* Return the original destination pointer saved on entry. */
15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

	/*
	 * Reached from .Lvmx_copy when enter_vmx_copy returned 0: drop the
	 * frame created there and do the copy with integer registers.
	 */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	/*
	 * Preserve source, length and LR across the call.  The two stores
	 * below r1 land in the STK_REG(R30)/STK_REG(R29) slots of the frame
	 * that the stdu creates.
	 */
	mflr	r0
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_copy
	cmpwi	cr1,r3,0		/* cr1.eq set => use the integer path */
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 *
	 * r6/r9  = cacheline-aligned source/destination start addresses,
	 *          issued with TH=0b01000.
	 * r7/r10 = stream control words (length and depth), issued with
	 *          TH=0b01010.
	 * r8     = control word with only the GO bit set.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	dcbt	0,r6,0b01000
	dcbt	0,r7,0b01010
	dcbtst	0,r9,0b01000
	dcbtst	0,r10,0b01010
	eieio
	dcbt	0,r8,0b01010	/* GO */

	/* cr1 still holds the enter_vmx_copy result tested above. */
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)		/* low 4 address bits differ? */
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)		/* r6 = bytes needed to align (0-15) */

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7			/* cr7 <- which of 16/32/64B to copy */
	clrldi	r6,r6,(64-7)		/* r6 = bytes needed to align (0-112) */

	/* Index registers for the lvx/stvx offsets used below. */
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* number of 128B blocks */

	/* r14-r16 become index registers; save them in the frame first. */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	lvx	v6,r4,r9
	lvx	v5,r4,r10
	lvx	v4,r4,r11
	lvx	v3,r4,r12
	lvx	v2,r4,r14
	lvx	v1,r4,r15
	lvx	v0,r4,r16
	addi	r4,r4,128
	stvx	v7,0,r3
	stvx	v6,r3,r9
	stvx	v5,r3,r10
	stvx	v4,r3,r11
	stvx	v3,r3,r12
	stvx	v2,r3,r14
	stvx	v1,r3,r15
	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/*
	 * Pop the frame, reload the original destination into r3 and leave
	 * through exit_vmx_copy.
	 * NOTE(review): this relies on exit_vmx_copy returning r3 unchanged;
	 * confirm against its definition.
	 */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_copy		/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)		/* r6 = bytes needed to align (0-15) */

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7			/* cr7 <- which of 16/32/64B to copy */
	clrldi	r6,r6,(64-7)		/* r6 = bytes needed to align (0-112) */

	/* Index registers for the lvx/stvx offsets used below. */
	li	r9,16
	li	r10,32
	li	r11,48

	/*
	 * Software pipeline: v0 always holds the most recently loaded source
	 * quadword, and r4 runs 16 bytes ahead of the data that has actually
	 * been stored.  Each output quadword is permuted from the previous
	 * and the current load.
	 */
	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1		/* v0 = v1: carry last load forward */

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* number of 128B blocks */

	/* r14-r16 become index registers; save them in the frame first. */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	stvx	v12,r3,r12
	stvx	v13,r3,r14
	stvx	v14,r3,r15
	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/*
	 * Pop the frame, reload the original destination into r3 and leave
	 * through exit_vmx_copy.
	 * NOTE(review): this relies on exit_vmx_copy returning r3 unchanged;
	 * confirm against its definition.
	 */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */