/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

_GLOBAL(memcpy_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,48(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores.
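	 * Each iteration moves one full cacheline: sixteen 8B loads into
	 * r0, r6-r12 and r14-r21, followed by the matching sixteen 8B stores.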
	 */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,48(r1)
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r4,56(r1)
	std	r5,64(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_copy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

.machine push
.machine "power4"
	dcbt	r0,r6,0b01000
	dcbt	r0,r7,0b01010
	dcbtst	r0,r9,0b01000
	dcbtst	r0,r10,0b01010
	eieio
	dcbt	r0,r8,0b01010	/* GO */
.machine pop

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
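	 *
	 * "Relatively aligned" here means the low 4 bits of the source and
	 * destination addresses match, so 16B vector loads and stores line
	 * up without a vperm fixup; the xor/rldicl. test below checks
	 * exactly that.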
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,r0,r4
	lvx	vr6,r4,r9
	lvx	vr5,r4,r10
	lvx	vr4,r4,r11
	lvx	vr3,r4,r12
	lvx	vr2,r4,r14
	lvx	vr1,r4,r15
	lvx	vr0,r4,r16
	addi	r4,r4,128
	stvx	vr7,r0,r3
	stvx	vr6,r3,r9
	stvx	vr5,r3,r10
	stvx	vr4,r3,r11
	stvx	vr3,r3,r12
	stvx	vr2,r3,r14
	stvx	vr1,r3,r15
	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,48(r1)
	b	.exit_vmx_copy		/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	lvsl	vr16,0,r4	/* Setup permute control vector */
	lvx	vr0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

5:	bf	cr7*4+2,6f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	stvx	vr12,r3,r12
	stvx	vr13,r3,r14
	stvx	vr14,r3,r15
	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,48(r1)
	b	.exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */