/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>

	.align	7
_GLOBAL(__copy_tofrom_user)
	/* first check for a whole page copy on a page boundary */
	cmpldi	cr1,r5,16
	cmpdi	cr6,r5,4096
	or	r0,r3,r4
	neg	r6,r3		/* LS 3 bits = # bytes to 8-byte dest bdry */
	andi.	r0,r0,4095
	std	r3,-24(r1)
	crand	cr0*4+2,cr0*4+2,cr6*4+2
	std	r4,-16(r1)
	std	r5,-8(r1)
	dcbt	0,r4
	beq	.Lcopy_page_4K
	andi.	r6,r6,7
	PPC_MTOCRF	0x01,r5
	blt	cr1,.Lshort_copy
/* Below we want to nop out the bne if we're on a CPU that has the
 * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
 * cleared.
 * At the time of writing the only CPU that has this combination of bits
 * set is Power6.
 */
BEGIN_FTR_SECTION
	nop
FTR_SECTION_ELSE
	bne	.Ldst_unaligned
ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
		    CPU_FTR_UNALIGNED_LD_STD)
.Ldst_aligned:
	addi	r3,r3,-16
BEGIN_FTR_SECTION
	andi.	r0,r4,7
	bne	.Lsrc_unaligned
END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
	blt	cr1,.Ldo_tail		/* if < 16 bytes to copy */
	srdi	r0,r5,5
	cmpdi	cr1,r0,0
20:	ld	r7,0(r4)
220:	ld	r6,8(r4)
	addi	r4,r4,16
	mtctr	r0
	andi.	r0,r5,0x10
	beq	22f
	addi	r3,r3,16
	addi	r4,r4,-16
	mr	r9,r7
	mr	r8,r6
	beq	cr1,72f
21:	ld	r7,16(r4)
221:	ld	r6,24(r4)
	addi	r4,r4,32
70:	std	r9,0(r3)
270:	std	r8,8(r3)
22:	ld	r9,0(r4)
222:	ld	r8,8(r4)
71:	std	r7,16(r3)
271:	std	r6,24(r3)
	addi	r3,r3,32
	bdnz	21b
72:	std	r9,0(r3)
272:	std	r8,8(r3)
	andi.	r5,r5,0xf
	beq+	3f
	addi	r4,r4,16
.Ldo_tail:
	addi	r3,r3,16
	bf	cr7*4+0,246f
244:	ld	r9,0(r4)
	addi	r4,r4,8
245:	std	r9,0(r3)
	addi	r3,r3,8
246:	bf	cr7*4+1,1f
23:	lwz	r9,0(r4)
	addi	r4,r4,4
73:	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
44:	lhz	r9,0(r4)
	addi	r4,r4,2
74:	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
45:	lbz	r9,0(r4)
75:	stb	r9,0(r3)
3:	li	r3,0
	blr
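
/*
 * Unaligned source: each destination doubleword below is assembled from
 * two adjacent aligned source doublewords with a shift-and-merge.  As a
 * rough C sketch (big-endian; r10 holds the misalignment in bits from
 * the sldi above, r11 holds 64 - r10 from the subfic below):
 *
 *	dst[i] = (src[i] << r10) | (src[i + 1] >> r11);
 *
 * which is what the sld/srd/or triples in the loops implement.
 */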
.Lsrc_unaligned:
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpldi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0
	bt	cr7*4+0,28f

24:	ld	r9,0(r4)	/* 3+2n loads, 2+2n stores */
25:	ld	r0,8(r4)
	sld	r6,r9,r10
26:	ldu	r9,16(r4)
	srd	r7,r0,r11
	sld	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,79f
27:	ld	r0,8(r4)
	b	2f

28:	ld	r0,0(r4)	/* 4+2n loads, 3+2n stores */
29:	ldu	r9,8(r4)
	sld	r8,r0,r10
	addi	r3,r3,-8
	blt	cr6,5f
30:	ld	r0,8(r4)
	srd	r12,r9,r11
	sld	r6,r9,r10
31:	ldu	r9,16(r4)
	or	r12,r8,r12
	srd	r7,r0,r11
	sld	r8,r0,r10
	addi	r3,r3,16
	beq	cr6,78f

1:	or	r7,r7,r6
32:	ld	r0,8(r4)
76:	std	r12,8(r3)
2:	srd	r12,r9,r11
	sld	r6,r9,r10
33:	ldu	r9,16(r4)
	or	r12,r8,r12
77:	stdu	r7,16(r3)
	srd	r7,r0,r11
	sld	r8,r0,r10
	bdnz	1b

78:	std	r12,8(r3)
	or	r7,r7,r6
79:	std	r7,16(r3)
5:	srd	r12,r9,r11
	or	r12,r8,r12
80:	std	r12,24(r3)
	bne	6f
	li	r3,0
	blr
6:	cmpwi	cr1,r5,8
	addi	r3,r3,32
	sld	r9,r9,r10
	ble	cr1,7f
34:	ld	r0,8(r4)
	srd	r7,r0,r11
	or	r9,r7,r9
7:
	bf	cr7*4+1,1f
	rotldi	r9,r9,32
94:	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	rotldi	r9,r9,16
95:	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	rotldi	r9,r9,8
96:	stb	r9,0(r3)
3:	li	r3,0
	blr

.Ldst_unaligned:
	PPC_MTOCRF	0x01,r6		/* put #bytes to 8B bdry into cr7 */
	subf	r5,r6,r5
	li	r7,0
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
35:	lbz	r0,0(r4)
81:	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
36:	lhzx	r0,r7,r4
82:	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
37:	lwzx	r0,r7,r4
83:	stwx	r0,r7,r3
3:	PPC_MTOCRF	0x01,r5
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

.Lshort_copy:
	bf	cr7*4+0,1f
38:	lwz	r0,0(r4)
39:	lwz	r9,4(r4)
	addi	r4,r4,8
84:	stw	r0,0(r3)
85:	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
40:	lwz	r0,0(r4)
	addi	r4,r4,4
86:	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
41:	lhz	r0,0(r4)
	addi	r4,r4,2
87:	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
42:	lbz	r0,0(r4)
88:	stb	r0,0(r3)
4:	li	r3,0
	blr
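
/*
 * Note on the numbering convention used by the fixup code below: every
 * faultable load or store above carries a numeric label, and its fixup
 * handler is that label plus 100 (load 20: -> handler 120:, store
 * 70: -> handler 170:, second-doubleword label 220: -> 320:, and so
 * on), which is how the pairs in the __ex_table further down line up.
 */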
/*
 * exception handlers follow
 * we have to return the number of bytes not copied
 * for an exception on a load, we set the rest of the destination to 0
 */

136:
137:
	add	r3,r3,r7
	b	1f
130:
131:
	addi	r3,r3,8
120:
320:
122:
322:
124:
125:
126:
127:
128:
129:
133:
	addi	r3,r3,8
132:
	addi	r3,r3,8
121:
321:
344:
134:
135:
138:
139:
140:
141:
142:
123:
144:
145:

/*
 * here we have had a fault on a load and r3 points to the first
 * unmodified byte of the destination
 */
1:	ld	r6,-24(r1)
	ld	r4,-16(r1)
	ld	r5,-8(r1)
	subf	r6,r6,r3
	add	r4,r4,r6
	subf	r5,r6,r5	/* #bytes left to go */

/*
 * first see if we can copy any more bytes before hitting another exception
 */
	mtctr	r5
43:	lbz	r0,0(r4)
	addi	r4,r4,1
89:	stb	r0,0(r3)
	addi	r3,r3,1
	bdnz	43b
	li	r3,0		/* huh? all copied successfully this time? */
	blr

/*
 * here we have trapped again, need to clear ctr bytes starting at r3
 */
143:	mfctr	r5
	li	r0,0
	mr	r4,r3
	mr	r3,r5		/* return the number of bytes not copied */
1:	andi.	r9,r4,7
	beq	3f
90:	stb	r0,0(r4)
	addic.	r5,r5,-1
	addi	r4,r4,1
	bne	1b
	blr
3:	cmpldi	cr1,r5,8
	srdi	r9,r5,3
	andi.	r5,r5,7
	blt	cr1,93f
	mtctr	r9
91:	std	r0,0(r4)
	addi	r4,r4,8
	bdnz	91b
93:	beqlr
	mtctr	r5
92:	stb	r0,0(r4)
	addi	r4,r4,1
	bdnz	92b
	blr

/*
 * exception handlers for stores: we just need to work
 * out how many bytes weren't copied
 */
182:
183:
	add	r3,r3,r7
	b	1f
371:
180:
	addi	r3,r3,8
171:
177:
	addi	r3,r3,8
370:
372:
176:
178:
	addi	r3,r3,4
185:
	addi	r3,r3,4
170:
172:
345:
173:
174:
175:
179:
181:
184:
186:
187:
188:
189:
194:
195:
196:
1:
	ld	r6,-24(r1)
	ld	r5,-8(r1)
	add	r6,r6,r5
	subf	r3,r3,r6	/* #bytes not copied */
190:
191:
192:
	blr			/* #bytes not copied in r3 */

	.section __ex_table,"a"
	.align	3
	.llong	20b,120b
	.llong	220b,320b
	.llong	21b,121b
	.llong	221b,321b
	.llong	70b,170b
	.llong	270b,370b
	.llong	22b,122b
	.llong	222b,322b
	.llong	71b,171b
	.llong	271b,371b
	.llong	72b,172b
	.llong	272b,372b
	.llong	244b,344b
	.llong	245b,345b
	.llong	23b,123b
	.llong	73b,173b
	.llong	44b,144b
	.llong	74b,174b
	.llong	45b,145b
	.llong	75b,175b
	.llong	24b,124b
	.llong	25b,125b
	.llong	26b,126b
	.llong	27b,127b
	.llong	28b,128b
	.llong	29b,129b
	.llong	30b,130b
	.llong	31b,131b
	.llong	32b,132b
	.llong	76b,176b
	.llong	33b,133b
	.llong	77b,177b
	.llong	78b,178b
	.llong	79b,179b
	.llong	80b,180b
	.llong	34b,134b
	.llong	94b,194b
	.llong	95b,195b
	.llong	96b,196b
	.llong	35b,135b
	.llong	81b,181b
	.llong	36b,136b
	.llong	82b,182b
	.llong	37b,137b
	.llong	83b,183b
	.llong	38b,138b
	.llong	39b,139b
	.llong	84b,184b
	.llong	85b,185b
	.llong	40b,140b
	.llong	86b,186b
	.llong	41b,141b
	.llong	87b,187b
	.llong	42b,142b
	.llong	88b,188b
	.llong	43b,143b
	.llong	89b,189b
	.llong	90b,190b
	.llong	91b,191b
	.llong	92b,192b

	.text

/*
 * Routine to copy a whole page of data, optimized for POWER4.
 * On POWER4 it is more than 50% faster than the simple loop
 * above (following the .Ldst_aligned label) but it runs slightly
 * slower on POWER3.
 */
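/*
 * The main loop below interleaves six copy streams spaced 128 bytes
 * apart (source offsets 0, 128, 256, 384, 512 and 640), so that
 * several cache-line fetches are in flight at once; the non-volatile
 * registers saved on entry hold the in-flight data between the load
 * and store groups.
 */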
.Lcopy_page_4K:
	std	r31,-32(1)
	std	r30,-40(1)
	std	r29,-48(1)
	std	r28,-56(1)
	std	r27,-64(1)
	std	r26,-72(1)
	std	r25,-80(1)
	std	r24,-88(1)
	std	r23,-96(1)
	std	r22,-104(1)
	std	r21,-112(1)
	std	r20,-120(1)
	li	r5,4096/32 - 1
	addi	r3,r3,-8
	li	r0,5
0:	addi	r5,r5,-24
	mtctr	r0
20:	ld	r22,640(4)
21:	ld	r21,512(4)
22:	ld	r20,384(4)
23:	ld	r11,256(4)
24:	ld	r9,128(4)
25:	ld	r7,0(4)
26:	ld	r25,648(4)
27:	ld	r24,520(4)
28:	ld	r23,392(4)
29:	ld	r10,264(4)
30:	ld	r8,136(4)
31:	ldu	r6,8(4)
	cmpwi	r5,24
1:
32:	std	r22,648(3)
33:	std	r21,520(3)
34:	std	r20,392(3)
35:	std	r11,264(3)
36:	std	r9,136(3)
37:	std	r7,8(3)
38:	ld	r28,648(4)
39:	ld	r27,520(4)
40:	ld	r26,392(4)
41:	ld	r31,264(4)
42:	ld	r30,136(4)
43:	ld	r29,8(4)
44:	std	r25,656(3)
45:	std	r24,528(3)
46:	std	r23,400(3)
47:	std	r10,272(3)
48:	std	r8,144(3)
49:	std	r6,16(3)
50:	ld	r22,656(4)
51:	ld	r21,528(4)
52:	ld	r20,400(4)
53:	ld	r11,272(4)
54:	ld	r9,144(4)
55:	ld	r7,16(4)
56:	std	r28,664(3)
57:	std	r27,536(3)
58:	std	r26,408(3)
59:	std	r31,280(3)
60:	std	r30,152(3)
61:	stdu	r29,24(3)
62:	ld	r25,664(4)
63:	ld	r24,536(4)
64:	ld	r23,408(4)
65:	ld	r10,280(4)
66:	ld	r8,152(4)
67:	ldu	r6,24(4)
	bdnz	1b
68:	std	r22,648(3)
69:	std	r21,520(3)
70:	std	r20,392(3)
71:	std	r11,264(3)
72:	std	r9,136(3)
73:	std	r7,8(3)
74:	addi	r4,r4,640
75:	addi	r3,r3,648
	bge	0b
	mtctr	r5
76:	ld	r7,0(4)
77:	ld	r8,8(4)
78:	ldu	r9,16(4)
3:
79:	ld	r10,8(4)
80:	std	r7,8(3)
81:	ld	r7,16(4)
82:	std	r8,16(3)
83:	ld	r8,24(4)
84:	std	r9,24(3)
85:	ldu	r9,32(4)
86:	stdu	r10,32(3)
	bdnz	3b
4:
87:	ld	r10,8(4)
88:	std	r7,8(3)
89:	std	r8,16(3)
90:	std	r9,24(3)
91:	std	r10,32(3)
9:	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	li	r3,0
	blr

/*
 * on an exception, reset to the beginning and jump back into the
 * standard __copy_tofrom_user
 */
100:	ld	r20,-120(1)
	ld	r21,-112(1)
	ld	r22,-104(1)
	ld	r23,-96(1)
	ld	r24,-88(1)
	ld	r25,-80(1)
	ld	r26,-72(1)
	ld	r27,-64(1)
	ld	r28,-56(1)
	ld	r29,-48(1)
	ld	r30,-40(1)
	ld	r31,-32(1)
	ld	r3,-24(r1)
	ld	r4,-16(r1)
	li	r5,4096
	b	.Ldst_aligned
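
/*
 * Each __ex_table entry below is a pair of addresses: a faultable
 * instruction above, followed by its fixup handler; on a user-access
 * fault the kernel looks the faulting address up in this table and
 * resumes at the paired handler.  For the 4K page copy every access
 * shares the single handler at 100: above.
 */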
	.section __ex_table,"a"
	.align	3
	.llong	20b,100b
	.llong	21b,100b
	.llong	22b,100b
	.llong	23b,100b
	.llong	24b,100b
	.llong	25b,100b
	.llong	26b,100b
	.llong	27b,100b
	.llong	28b,100b
	.llong	29b,100b
	.llong	30b,100b
	.llong	31b,100b
	.llong	32b,100b
	.llong	33b,100b
	.llong	34b,100b
	.llong	35b,100b
	.llong	36b,100b
	.llong	37b,100b
	.llong	38b,100b
	.llong	39b,100b
	.llong	40b,100b
	.llong	41b,100b
	.llong	42b,100b
	.llong	43b,100b
	.llong	44b,100b
	.llong	45b,100b
	.llong	46b,100b
	.llong	47b,100b
	.llong	48b,100b
	.llong	49b,100b
	.llong	50b,100b
	.llong	51b,100b
	.llong	52b,100b
	.llong	53b,100b
	.llong	54b,100b
	.llong	55b,100b
	.llong	56b,100b
	.llong	57b,100b
	.llong	58b,100b
	.llong	59b,100b
	.llong	60b,100b
	.llong	61b,100b
	.llong	62b,100b
	.llong	63b,100b
	.llong	64b,100b
	.llong	65b,100b
	.llong	66b,100b
	.llong	67b,100b
	.llong	68b,100b
	.llong	69b,100b
	.llong	70b,100b
	.llong	71b,100b
	.llong	72b,100b
	.llong	73b,100b
	.llong	74b,100b
	.llong	75b,100b
	.llong	76b,100b
	.llong	77b,100b
	.llong	78b,100b
	.llong	79b,100b
	.llong	80b,100b
	.llong	81b,100b
	.llong	82b,100b
	.llong	83b,100b
	.llong	84b,100b
	.llong	85b,100b
	.llong	86b,100b
	.llong	87b,100b
	.llong	88b,100b
	.llong	89b,100b
	.llong	90b,100b
	.llong	91b,100b