/*
 *
 * Optimized version of the copy_user() routine.
 * It is used to copy data across the kernel/user boundary.
 *
 * The source and destination are always on opposite sides of
 * the boundary. When reading from user space we must catch
 * faults on loads. When writing to user space we must catch
 * errors on stores. Note that because of the nature of the copy
 * we don't need to worry about overlapping regions.
 *
 *
 * Inputs:
 *	in0	address of destination buffer (aliased as "dst" below)
 *	in1	address of source buffer (aliased as "src" below)
 *	in2	number of bytes to copy (aliased as "len" below)
 *
 * Outputs:
 *	ret0	0 in case of success. The number of bytes NOT copied in
 *		case of error.
 *
 * Overview of the paths taken:
 *	- len == 0: return immediately with ret0 = 0.
 *	- len <= COPY_BREAK: software-pipelined byte-by-byte loop.
 *	- len > COPY_BREAK and ((src ^ dst) & 7) == 0: .long_copy_user
 *	  (ld1/ld2/ld4/ld8 head, 16 bytes per iteration body, .dotail).
 *	- len > COPY_BREAK and different alignment: .diff_align_copy_user
 *	  (byte-by-byte head, shrp-based 8-byte body, byte-by-byte tail).
 *	- a faulting load ends up in one of the .failure_in* handlers, which
 *	  store zeros to the rest of the destination; a faulting store ends
 *	  up in .failure_out.
 *
 * NOTE(review): EX(label, insn) is defined outside this file (see
 * <asm/asmmacro.h>); presumably it makes "label" the fixup target when
 * "insn" faults -- confirm against the macro definition.
 *
 * Copyright (C) 2000-2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *
 * Fixme:
 *	- handle the case where we have more than 16 bytes and the alignment
 *	  are different.
 *	- more benchmarking
 *	- fix extraneous stop bit introduced by the EX() macro.
 */

#include <asm/asmmacro.h>
#include <asm/export.h>

//
// Tuneable parameters
//
#define COPY_BREAK	16	// we do byte copy below (must be >=16)
#define PIPE_DEPTH	21	// pipe depth

#define EPI	p[PIPE_DEPTH-1]	// last stage of the rotating predicate pipeline

//
// arguments
//
#define dst		in0
#define src		in1
#define len		in2

//
// local registers
//
#define t1		r2	// rshift in bytes
#define t2		r3	// lshift in bytes
#define rshift		r14	// right shift in bits
#define lshift		r15	// left shift in bits
				// NOTE(review): lshift is written below but
				// never read in this file.
#define word1		r16
#define word2		r17
#define cnt		r18
#define len2		r19
#define saved_lc	r20
#define saved_pr	r21
#define tmp		r22
#define val		r23
#define src1		r24
#define dst1		r25
#define src2		r26
#define dst2		r27
#define len1		r28
#define enddst		r29
#define endsrc		r30
#define saved_pfs	r31

GLOBAL_ENTRY(__copy_user)
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)

	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
	.rotp p[PIPE_DEPTH]

	adds len2=-1,len	// br.ctop is repeat/until
	mov ret0=r0

	;;			// RAW of cfm when len=0
	cmp.eq p8,p0=r0,len	// check for zero length
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc	// preserve ar.lc (slow)
(p8)	br.ret.spnt.many rp	// empty memcpy()
	;;
	add enddst=dst,len	// first byte after end of destination
	add endsrc=src,len	// first byte after end of source
	.save pr, saved_pr
	mov saved_pr=pr		// preserve predicates

	.body

	mov dst1=dst		// copy because of rotation
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false

	mov src1=src		// copy because of rotation
	mov ar.lc=len2		// initialize lc for small count
	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy

	xor tmp=src,dst		// same alignment test prepare
(p10)	br.cond.dptk .long_copy_user
	;;			// RAW pr.rot/p16 ?
	//
	// Now we do the byte by byte loop with software pipeline
	//
	// p7 is necessarily false by now
	// NOTE(review): the cmp.lt above writes p10 and p7 as a pair, and this
	// is the fall-through path where the (p10) branch was not taken, so p7
	// looks like it is true here rather than false -- confirm.
1:
	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 1b
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.pfs=saved_pfs		// restore ar.ec
	br.ret.sptk.many rp		// end of short memcpy

	//
	// Not 8-byte aligned
	//
.diff_align_copy_user:
	// At this point we know we have more than 16 bytes to copy
	// and also that src and dest do _not_ have the same alignment.
	and src2=0x7,src1				// src offset
	and dst2=0x7,dst1				// dst offset
	;;
	// The basic idea is that we copy byte-by-byte at the head so
	// that we can reach 8-byte alignment for both src1 and dst1.
	// Then copy the body using software pipelined 8-byte copy,
	// shifting the two back-to-back words right and left, then copy
	// the tail by copying byte-by-byte.
	//
	// Fault handling. If the byte-by-byte at the head fails on the
	// load, then restart and finish the pipeline by copying zeros
	// to the dst1. Then copy zeros for the rest of dst1.
	// If 8-byte software pipeline fails on the load, do the same as
	// failure_in3 does. If the byte-by-byte at the tail fails, it is
	// handled simply by failure_in_pipe1.
	//
	// The case p14 represents the source has more bytes in the
	// first word (by the shifted part), whereas the p15 needs to
	// copy some bytes from the 2nd word of the source that has the
	// tail of the 1st of the destination.
	//

	//
	// Optimization. If dst1 is 8-byte aligned (quite common), we don't need
	// to copy the head to dst1, to start 8-byte copy software pipeline.
	// We know src1 is not 8-byte aligned in this case.
	//
	cmp.eq p14,p15=r0,dst2
(p15)	br.cond.spnt 1f
	;;
	sub t1=8,src2
	mov t2=src2
	;;
	shl rshift=t2,3
	sub len1=len,t1					// set len1
	;;
	sub lshift=64,rshift
	;;
	br.cond.spnt .word_copy_user
	;;
1:
	cmp.leu	p14,p15=src2,dst2
	sub t1=dst2,src2
	;;
	.pred.rel "mutex", p14, p15
(p14)	sub word1=8,src2				// (8 - src offset)
(p15)	sub t1=r0,t1					// absolute value
(p15)	sub word1=8,dst2				// (8 - dst offset)
	;;
	// For the case p14, we don't need to copy the shifted part to
	// the 1st word of destination.
	sub t2=8,t1
(p14)	sub word1=word1,t1
	;;
	sub len1=len,word1				// resulting len
(p15)	shl rshift=t1,3					// in bits
(p14)	shl rshift=t2,3
	;;
(p14)	sub len1=len1,t1
	adds cnt=-1,word1
	;;
	sub lshift=64,rshift
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false
	mov ar.lc=cnt
	;;
2:
	EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 2b
	;;
	clrrrb
	;;
.word_copy_user:
	cmp.gtu p9,p0=16,len1
(p9)	br.cond.spnt 4f			// if (16 > len1) skip 8-byte copy
	;;
	shr.u cnt=len1,3		// number of 64-bit words
	;;
	adds cnt=-1,cnt
	;;
	.pred.rel "mutex", p14, p15
(p14)	sub src1=src1,t2
(p15)	sub src1=src1,t1
	//
	// Now both src1 and dst1 point to an 8-byte aligned address. And
	// we have more than 8 bytes to copy.
	//
	mov ar.lc=cnt
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false
	;;
3:
	//
	// The pipeline consists of 3 stages:
	// 1 (p16):	Load a word from src1
	// 2 (EPI_1):	Shift right pair, saving to tmp
	// 3 (EPI):	Store tmp to dst1
	//
	// To make it simple, use at least 2 (p16) loops to set up val1[n]
	// because we need 2 back-to-back val1[] to get tmp.
	// Note that this implies EPI_2 must be p18 or greater.
	// NOTE(review): EPI_2 is not defined in the visible code; EPI_1
	// (defined just below) is presumably what is meant -- confirm.
	//

#define EPI_1		p[PIPE_DEPTH-2]
#define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
#define CASE(pred, shift)	\
	(pred)	br.cond.spnt .copy_user_bit##shift
#define BODY(rshift)						\
.copy_user_bit##rshift:						\
1:								\
	EX(.failure_out,(EPI) st8 [dst1]=tmp,8);		\
(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
	EX(3f,(p16) ld8 val1[1]=[src1],8);			\
(p16)	mov val1[0]=r0;						\
	br.ctop.dptk 1b;					\
	;;							\
	br.cond.sptk.many .diff_align_do_tail;			\
2:								\
(EPI)	st8 [dst1]=tmp,8;					\
(EPI_1)	shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
3:								\
(p16)	mov val1[1]=r0;						\
(p16)	mov val1[0]=r0;						\
	br.ctop.dptk 2b;					\
	;;							\
	br.cond.sptk.many .failure_in2

	//
	// Since the instruction 'shrp' requires a fixed 128-bit value
	// specifying the bits to shift, we need to provide 7 cases
	// below.
	//
	// SWITCH compares rshift against each candidate shift count, CASE
	// branches to the matching .copy_user_bit<N> loop, and BODY expands
	// one such loop (plus its zero-filling variant used after a load
	// fault) per shift count.
	//
	SWITCH(p6, 8)
	SWITCH(p7, 16)
	SWITCH(p8, 24)
	SWITCH(p9, 32)
	SWITCH(p10, 40)
	SWITCH(p11, 48)
	SWITCH(p12, 56)
	;;
	CASE(p6, 8)
	CASE(p7, 16)
	CASE(p8, 24)
	CASE(p9, 32)
	CASE(p10, 40)
	CASE(p11, 48)
	CASE(p12, 56)
	;;
	BODY(8)
	BODY(16)
	BODY(24)
	BODY(32)
	BODY(40)
	BODY(48)
	BODY(56)
	;;
.diff_align_do_tail:
	.pred.rel "mutex", p14, p15
(p14)	sub src1=src1,t1
(p14)	adds dst1=-8,dst1
(p15)	sub dst1=dst1,t1
	;;
4:
	// Tail correction.
	//
	// The problem with this pipelined loop is that the last word is not
	// loaded and thus part of the last word written is not correct.
	// To fix that, we simply copy the tail byte by byte.

	sub len1=endsrc,src1,1
	clrrrb
	;;
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false
	mov ar.lc=len1
	;;
5:
	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 5b
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// Beginning of long memcpy (i.e. > 16 bytes)
	//
.long_copy_user:
	tbit.nz p6,p7=src1,0	// odd alignment
	and tmp=7,tmp
	;;
	cmp.eq p10,p8=r0,tmp
	mov len1=len		// copy because of rotation
(p8)	br.cond.dpnt .diff_align_copy_user
	;;
	// At this point we know we have more than 16 bytes to copy
	// and also that both src and dest have the same alignment
	// which may not be the one we want. So for now we must move
	// forward slowly until we reach 16byte alignment: no need to
	// worry about reaching the end of buffer.
	//
	EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
(p6)	adds len1=-1,len1;;
	tbit.nz p7,p0=src1,1
	;;
	EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
(p7)	adds len1=-2,len1;;
	tbit.nz p8,p0=src1,2
	;;
	//
	// Stop bit not required after ld4 because if we fail on ld4
	// we have never executed the ld1, therefore st1 is not executed.
	//
	EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
	;;
	EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
	tbit.nz p9,p0=src1,3
	;;
	//
	// Stop bit not required after ld8 because if we fail on ld8
	// we have never executed the ld2, therefore st2 is not executed.
	//
	EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
	EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
(p8)	adds len1=-4,len1
	;;
	EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
(p9)	adds len1=-8,len1;;
	shr.u cnt=len1,4		// number of 128-bit (2x64bit) words
	;;
	EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
	tbit.nz p6,p0=len1,3
	cmp.eq p7,p0=r0,cnt
	adds tmp=-1,cnt			// br.ctop is repeat/until
(p7)	br.cond.dpnt .dotail		// we have less than 16 bytes left
	;;
	adds src2=8,src1
	adds dst2=8,dst1
	mov ar.lc=tmp
	;;
	//
	// 16bytes/iteration
	//
2:
	EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
(p16)	ld8 val2[0]=[src2],16

	EX(.failure_out, (EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16)
(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
	br.ctop.dptk 2b
	;;			// RAW on src1 when fall through from loop
	//
	// Tail correction based on len only
	//
	// No matter where we come from (loop or test) the src1 pointer
	// is 16 byte aligned AND we have less than 16 bytes to copy.
	//
.dotail:
	EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
	tbit.nz p7,p0=len1,2
	;;
	EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
	tbit.nz p8,p0=len1,1
	;;
	EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
	tbit.nz p9,p0=len1,0
	;;
	EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
	;;
	EX(.failure_in1,(p9) ld1 val2[1]=[src1])	// only 1 byte left
	mov ar.lc=saved_lc
	;;
	EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
	mov pr=saved_pr,0xffffffffffff0000
	;;
	EX(.failure_out, (p8)	st2 [dst1]=val2[0],2)
	mov ar.pfs=saved_pfs
	;;
	EX(.failure_out, (p9)	st1 [dst1]=val2[1])
	br.ret.sptk.many rp


	//
	// Here we handle the case where the byte by byte copy fails
	// on the load.
	// Several factors make the zeroing of the rest of the buffer kind of
	// tricky:
	//	- the pipeline: loads/stores are not in sync (pipeline)
	//
	//	  In the same loop iteration, the dst1 pointer does not directly
	//	  reflect where the faulty load was.
	//
	//	- pipeline effect
	//	  When you get a fault on load, you may have valid data from
	//	  previous loads not yet stored, still in transit. Such data
	//	  must be stored normally before moving onto zeroing the rest.
	//
	//	- single/multi dispersal independence.
	//
	// solution:
	//	- we don't disrupt the pipeline, i.e. data in transit in
	//	  the software pipeline will eventually be moved to memory.
	//	  We simply replace the load with a simple mov and keep the
	//	  pipeline going. We can't really do this inline because
	//	  p16 is always reset to 1 when lc > 0.
	//
.failure_in_pipe1:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
1:
(p16)	mov val1[0]=r0
(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
	br.ctop.dptk 1b
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// This is the case where the byte by byte copy fails on the load
	// when we copy the head. We need to finish the pipeline and copy
	// zeros for the rest of the destination. Since this happens
	// at the top we still need to fill the body and tail.
.failure_in_pipe2:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
2:
(p16)	mov val1[0]=r0
(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
	br.ctop.dptk 2b
	;;
	sub len=enddst,dst1,1		// precompute len (enddst - dst1 - 1,
					// loaded into ar.lc at .failure_in1bis)
	br.cond.dptk.many .failure_in1bis
	;;

	//
	// Here we handle the head & tail part when we check for alignment.
	// The following code handles only the load failures. The
	// main difficulty comes from the fact that loads/stores are
	// scheduled. So when you fail on a load, the stores corresponding
	// to previous successful loads must be executed.
	//
	// However some simplifications are possible given the way
	// things work.
	//
	// 1) HEAD
	// Theory of operation:
	//
	//  Page A   | Page B
	//  ---------|-----
	//          1|8 x
	//        1 2|8 x
	//          4|8 x
	//        1 4|8 x
	//        2 4|8 x
	//      1 2 4|8 x
	//           |1
	//           |2 x
	//           |4 x
	//
	// page_size >= 4k (2^12).  (x means 4, 2, 1)
	// Here we suppose Page A exists and Page B does not.
	//
	// As we move towards eight byte alignment we may encounter faults.
	// The numbers on each page show the size of the load (current alignment).
	//
	// Key point:
	//	- if you fail on 1, 2, 4 then you have never executed any smaller
	//	  size loads, e.g. failing ld4 means no ld1 nor ld2 executed
	//	  before.
	//
	// This allows us to simplify the cleanup code, because basically you
	// only have to worry about "pending" stores in the case of a failing
	// ld8(). Given the way the code is written today, this means only
	// worry about st2, st4. There we can use the information encapsulated
	// into the predicates.
	//
	// Other key point:
	//	- if you fail on the ld8 in the head, it means you went straight
	//	  to it, i.e. 8byte alignment within an unexisting page.
	// Again this comes from the fact that if you crossed just for the ld8 then
	// you are 8byte aligned but also 16byte aligned, therefore you would
	// either go for the 16byte copy loop OR the ld8 in the tail part.
	// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
	// because it would mean you had 15bytes to copy in which case you
	// would have defaulted to the byte by byte copy.
	//
	//
	// 2) TAIL
	// Here we know we have less than 16 bytes AND we are either 8 or 16 byte
	// aligned.
	//
	// Key point:
	// This means that we either:
	//		- are right on a page boundary
	//	OR
	//		- are at more than 16 bytes from a page boundary with
	//		  at most 15 bytes to copy: no chance of crossing.
	//
	// This allows us to assume that if we fail on a load we haven't possibly
	// executed any of the previous (tail) ones, so we don't need to do
	// any stores. For instance, if we fail on ld2, this means we had
	// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
	//
	// This means that we are in a situation similar to a fault in the
	// head part. That's nice!
	//
.failure_in1:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
	sub len=endsrc,src1,1
	//
	// we know that ret0 can never be zero at this point
	// because we failed while trying to do a load, i.e. there is still
	// some work to do.
	// The failure_in1bis and length problem is taken care of at the
	// calling side.
	//
	;;
.failure_in1bis:		// also entered from .failure_in_pipe2,
				// .failure_in2 and .failure_in3
	mov ar.lc=len		// Continue with a stupid byte store.
	;;
5:
	st1 [dst1]=r0,1
	br.cloop.dptk 5b
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// Here we simply restart the loop but instead
	// of doing loads we fill the pipeline with zeroes
	// We can't simply store r0 because we may have valid
	// data in transit in the pipeline.
	// ar.lc and ar.ec are setup correctly at this point
	//
	// we MUST use src1/endsrc here and not dst1/enddst because
	// of the pipeline effect.
	//
.failure_in3:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
	;;
2:
(p16)	mov val1[0]=r0
(p16)	mov val2[0]=r0
(EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16
(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
	br.ctop.dptk 2b
	;;
	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
	sub len=enddst,dst1,1		// precompute len
(p6)	br.cond.dptk .failure_in1bis
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// Reached from the zero-filling variant of the BODY() loops once
	// their pipeline has been flushed: report the bytes not copied and,
	// if dst1 has not reached enddst, zero the remainder through
	// .failure_in1bis.
	//
.failure_in2:
	sub ret0=endsrc,src1
	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
	sub len=enddst,dst1,1		// precompute len
(p6)	br.cond.dptk .failure_in1bis
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// handling of failures on stores: that's the easy part
	//
.failure_out:
	sub ret0=enddst,dst1	// bytes of the destination not written
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc

	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp
END(__copy_user)
EXPORT_SYMBOL(__copy_user)