/*
 *
 * Optimized version of the copy_user() routine.
 * It is used to copy data across the kernel/user boundary.
 *
 * The source and destination are always on opposite sides of
 * the boundary. When reading from user space we must catch
 * faults on loads. When writing to user space we must catch
 * errors on stores. Note that because of the nature of the copy
 * we don't need to worry about overlapping regions.
 *
 *
 * Inputs:
 *	in0	address of source buffer
 *	in1	address of destination buffer
 *	in2	number of bytes to copy
 *
 * Outputs:
 *	ret0	0 in case of success. The number of bytes NOT copied in
 *		case of error.
 *
 * Copyright (C) 2000-2001 Hewlett-Packard Co
 *	Stephane Eranian <eranian@hpl.hp.com>
 *
 * Fixme:
 *	- handle the case where we have more than 16 bytes and the
 *	  alignments are different.
 *	- more benchmarking
 *	- fix extraneous stop bit introduced by the EX() macro.
 */

#include <asm/asmmacro.h>

//
// Tuneable parameters
//
#define COPY_BREAK	16	// we do byte copy below (must be >=16)
#define PIPE_DEPTH	21	// pipe depth

#define EPI		p[PIPE_DEPTH-1]

//
// arguments
//
#define dst		in0
#define src		in1
#define len		in2

//
// local registers
//
#define t1		r2	// rshift in bytes
#define t2		r3	// lshift in bytes
#define rshift		r14	// right shift in bits
#define lshift		r15	// left shift in bits
#define word1		r16
#define word2		r17
#define cnt		r18
#define len2		r19
#define saved_lc	r20
#define saved_pr	r21
#define tmp		r22
#define val		r23
#define src1		r24
#define dst1		r25
#define src2		r26
#define dst2		r27
#define len1		r28
#define enddst		r29
#define endsrc		r30
#define saved_pfs	r31

GLOBAL_ENTRY(__copy_user)
	.prologue
	.save ar.pfs, saved_pfs
	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)

	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
	.rotp p[PIPE_DEPTH]

	adds len2=-1,len	// br.ctop is repeat/until
	mov ret0=r0

	;;			// RAW of cfm when len=0
	cmp.eq p8,p0=r0,len	// check for zero length
	.save ar.lc, saved_lc
	mov saved_lc=ar.lc	// preserve ar.lc (slow)
(p8)	br.ret.spnt.many rp	// empty memcpy()
	;;
	add enddst=dst,len	// first byte after end of destination
	add endsrc=src,len	// first byte after end of source
	.save pr, saved_pr
	mov saved_pr=pr		// preserve predicates

	.body

	mov dst1=dst		// copy because of rotation
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false

	mov src1=src		// copy because of rotation
	mov ar.lc=len2		// initialize lc for small count
	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy

	xor tmp=src,dst		// same alignment test prepare
(p10)	br.cond.dptk .long_copy_user
	;;			// RAW pr.rot/p16 ?
	//
	// Now we do the byte by byte loop with software pipeline
	//
	// p7 is necessarily false by now
1:
	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 1b
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.pfs=saved_pfs		// restore ar.ec
	br.ret.sptk.many rp		// end of short memcpy

	//
	// Not 8-byte aligned
	//
.diff_align_copy_user:
	// At this point we know we have more than 16 bytes to copy
	// and also that src and dest do _not_ have the same alignment.
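	//
	// Rough C-level sketch of the strategy (illustrative only; s, d and
	// n are names made up for exposition; the real code below derives
	// the head length from the src/dst offsets and skips the head
	// entirely when dst1 is already 8-byte aligned):
	//
	//	unsigned char *s = (unsigned char *)src;
	//	unsigned char *d = (unsigned char *)dst;
	//	unsigned long n = len;
	//	while (n && ((unsigned long)d & 7)) {	/* head: align dst */
	//		*d++ = *s++;
	//		n--;
	//	}
	//	/* body: 8 bytes/iteration via shrp, see .word_copy_user */
	//	/* tail: remaining bytes, copied byte by byte again      */
	//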
	and src2=0x7,src1			// src offset
	and dst2=0x7,dst1			// dst offset
	;;
	// The basic idea is that we copy byte-by-byte at the head so
	// that we can reach 8-byte alignment for both src1 and dst1.
	// Then copy the body using a software pipelined 8-byte copy,
	// shifting the two back-to-back words right and left, then copy
	// the tail by copying byte-by-byte.
	//
	// Fault handling. If the byte-by-byte at the head fails on the
	// load, then restart and finish the pipeline by copying zeros
	// to the dst1. Then copy zeros for the rest of dst1.
	// If the 8-byte software pipeline fails on the load, do the same as
	// failure_in3 does. If the byte-by-byte at the tail fails, it is
	// handled simply by failure_in_pipe1.
	//
	// Case p14 means the source has more bytes in the first word (by
	// the shifted part), whereas case p15 needs to copy some bytes from
	// the 2nd word of the source that form the tail of the 1st word of
	// the destination.
	//

	//
	// Optimization. If dst1 is 8-byte aligned (quite common), we don't need
	// to copy the head to dst1 before starting the 8-byte copy software
	// pipeline. We know src1 is not 8-byte aligned in this case.
	//
	cmp.eq p14,p15=r0,dst2
(p15)	br.cond.spnt 1f
	;;
	sub t1=8,src2
	mov t2=src2
	;;
	shl rshift=t2,3
	sub len1=len,t1		// set len1
	;;
	sub lshift=64,rshift
	;;
	br.cond.spnt .word_copy_user
	;;
1:
	cmp.leu	p14,p15=src2,dst2
	sub t1=dst2,src2
	;;
	.pred.rel "mutex", p14, p15
(p14)	sub word1=8,src2	// (8 - src offset)
(p15)	sub t1=r0,t1		// absolute value
(p15)	sub word1=8,dst2	// (8 - dst offset)
	;;
	// For the case p14, we don't need to copy the shifted part to
	// the 1st word of destination.
	sub t2=8,t1
(p14)	sub word1=word1,t1
	;;
	sub len1=len,word1	// resulting len
(p15)	shl rshift=t1,3		// in bits
(p14)	shl rshift=t2,3
	;;
(p14)	sub len1=len1,t1
	adds cnt=-1,word1
	;;
	sub lshift=64,rshift
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false
	mov ar.lc=cnt
	;;
2:
	EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 2b
	;;
	clrrrb
	;;
.word_copy_user:
	cmp.gtu p9,p0=16,len1
(p9)	br.cond.spnt 4f		// if (16 > len1) skip 8-byte copy
	;;
	shr.u cnt=len1,3	// number of 64-bit words
	;;
	adds cnt=-1,cnt
	;;
	.pred.rel "mutex", p14, p15
(p14)	sub src1=src1,t2
(p15)	sub src1=src1,t1
	//
	// Now both src1 and dst1 point to an 8-byte aligned address. And
	// we have more than 8 bytes to copy.
	//
	mov ar.lc=cnt
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false
	;;
3:
	//
	// The pipeline consists of 3 stages:
	// 1 (p16):	Load a word from src1
	// 2 (EPI_1):	Shift right pair, saving to tmp
	// 3 (EPI):	Store tmp to dst1
	//
	// To make it simple, use at least 2 (p16) loops to set up val1[n]
	// because we need 2 back-to-back val1[] to get tmp.
	// Note that this implies EPI_2 must be p18 or greater.
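	//
	// In C terms, stage 2 computes roughly the following (illustrative
	// sketch; w_lo is the word loaded first, i.e. from the lower source
	// address, w_hi the one loaded right after it, and rshift here is a
	// non-zero multiple of 8):
	//
	//	uint64_t w_lo = val1[PIPE_DEPTH-1];
	//	uint64_t w_hi = val1[PIPE_DEPTH-2];
	//	tmp = (w_lo >> rshift) | (w_hi << lshift);  /* lshift == 64 - rshift */
	//
	// which is what 'shrp' extracts from the 128-bit concatenation of
	// the two source words.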
	//

#define EPI_1		p[PIPE_DEPTH-2]
#define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
#define CASE(pred, shift)	\
	(pred) br.cond.spnt .copy_user_bit##shift
#define BODY(rshift)						\
.copy_user_bit##rshift:						\
1:								\
	EX(.failure_out,(EPI) st8 [dst1]=tmp,8);		\
(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
	EX(3f,(p16) ld8 val1[1]=[src1],8);			\
(p16)	mov val1[0]=r0;						\
	br.ctop.dptk 1b;					\
	;;							\
	br.cond.sptk.many .diff_align_do_tail;			\
2:								\
(EPI)	st8 [dst1]=tmp,8;					\
(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift;	\
3:								\
(p16)	mov val1[1]=r0;						\
(p16)	mov val1[0]=r0;						\
	br.ctop.dptk 2b;					\
	;;							\
	br.cond.sptk.many .failure_in2

	//
	// Since the 'shrp' instruction requires a fixed (immediate) shift
	// count into the 128-bit concatenation of the two words, we need
	// to provide the 7 cases below.
	//
	SWITCH(p6, 8)
	SWITCH(p7, 16)
	SWITCH(p8, 24)
	SWITCH(p9, 32)
	SWITCH(p10, 40)
	SWITCH(p11, 48)
	SWITCH(p12, 56)
	;;
	CASE(p6, 8)
	CASE(p7, 16)
	CASE(p8, 24)
	CASE(p9, 32)
	CASE(p10, 40)
	CASE(p11, 48)
	CASE(p12, 56)
	;;
	BODY(8)
	BODY(16)
	BODY(24)
	BODY(32)
	BODY(40)
	BODY(48)
	BODY(56)
	;;
.diff_align_do_tail:
	.pred.rel "mutex", p14, p15
(p14)	sub src1=src1,t1
(p14)	adds dst1=-8,dst1
(p15)	sub dst1=dst1,t1
	;;
4:
	// Tail correction.
	//
	// The problem with this pipelined loop is that the last word is not
	// loaded, and thus part of the last word written is not correct.
	// To fix that, we simply copy the tail byte by byte.

	sub len1=endsrc,src1,1
	clrrrb
	;;
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true all others are false
	mov ar.lc=len1
	;;
5:
	EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
	EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 5b
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// Beginning of long memcpy (i.e. > 16 bytes)
	//
.long_copy_user:
	tbit.nz p6,p7=src1,0	// odd alignment
	and tmp=7,tmp
	;;
	cmp.eq p10,p8=r0,tmp
	mov len1=len		// copy because of rotation
(p8)	br.cond.dpnt .diff_align_copy_user
	;;
	// At this point we know we have more than 16 bytes to copy
	// and also that both src and dest have the same alignment
	// which may not be the one we want. So for now we must move
	// forward slowly until we reach 16-byte alignment: no need to
	// worry about reaching the end of the buffer.
	//
	EX(.failure_in1,(p6) ld1 val1[0]=[src1],1)	// 1-byte aligned
(p6)	adds len1=-1,len1;;
	tbit.nz p7,p0=src1,1
	;;
	EX(.failure_in1,(p7) ld2 val1[1]=[src1],2)	// 2-byte aligned
(p7)	adds len1=-2,len1;;
	tbit.nz p8,p0=src1,2
	;;
	//
	// Stop bit not required after ld4 because if we fail on ld4
	// we have never executed the ld1, therefore st1 is not executed.
	//
	EX(.failure_in1,(p8) ld4 val2[0]=[src1],4)	// 4-byte aligned
	;;
	EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
	tbit.nz p9,p0=src1,3
	;;
	//
	// Stop bit not required after ld8 because if we fail on ld8
	// we have never executed the ld2, therefore st2 is not executed.
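	//
	// For reference, the head alignment ramp-up around this point is,
	// as a C-level sketch (illustrative only; s and d are made-up names
	// for the current source/destination positions, which share the
	// same alignment here):
	//
	//	if ((unsigned long)s & 1) { *d++ = *s++; }			/* (p6) ld1/st1 */
	//	if ((unsigned long)s & 2) { memcpy(d, s, 2); s += 2; d += 2; }	/* (p7) ld2/st2 */
	//	if ((unsigned long)s & 4) { memcpy(d, s, 4); s += 4; d += 4; }	/* (p8) ld4/st4 */
	//	if ((unsigned long)s & 8) { memcpy(d, s, 8); s += 8; d += 8; }	/* (p9) ld8/st8 */
	//	/* s and d are now 16-byte aligned */
	//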
	//
	EX(.failure_in1,(p9) ld8 val2[1]=[src1],8)	// 8-byte aligned
	EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
(p8)	adds len1=-4,len1
	;;
	EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
(p9)	adds len1=-8,len1;;
	shr.u cnt=len1,4	// number of 128-bit (2x64bit) words
	;;
	EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
	tbit.nz p6,p0=len1,3
	cmp.eq p7,p0=r0,cnt
	adds tmp=-1,cnt		// br.ctop is repeat/until
(p7)	br.cond.dpnt .dotail	// we have less than 16 bytes left
	;;
	adds src2=8,src1
	adds dst2=8,dst1
	mov ar.lc=tmp
	;;
	//
	// 16 bytes/iteration
	//
2:
	EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
(p16)	ld8 val2[0]=[src2],16

	EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
	br.ctop.dptk 2b
	;;			// RAW on src1 when fall through from loop
	//
	// Tail correction based on len only
	//
	// No matter where we come from (loop or test) the src1 pointer
	// is 16 byte aligned AND we have less than 16 bytes to copy.
	//
.dotail:
	EX(.failure_in1,(p6) ld8 val1[0]=[src1],8)	// at least 8 bytes
	tbit.nz p7,p0=len1,2
	;;
	EX(.failure_in1,(p7) ld4 val1[1]=[src1],4)	// at least 4 bytes
	tbit.nz p8,p0=len1,1
	;;
	EX(.failure_in1,(p8) ld2 val2[0]=[src1],2)	// at least 2 bytes
	tbit.nz p9,p0=len1,0
	;;
	EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
	;;
	EX(.failure_in1,(p9) ld1 val2[1]=[src1])	// only 1 byte left
	mov ar.lc=saved_lc
	;;
	EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
	mov pr=saved_pr,0xffffffffffff0000
	;;
	EX(.failure_out, (p8) st2 [dst1]=val2[0],2)
	mov ar.pfs=saved_pfs
	;;
	EX(.failure_out, (p9) st1 [dst1]=val2[1])
	br.ret.sptk.many rp


	//
	// Here we handle the case where the byte by byte copy fails
	// on the load.
	// Several factors make the zeroing of the rest of the buffer kind of
	// tricky:
	// - the pipeline: loads/stores are not in sync (pipeline)
	//
	//   In the same loop iteration, the dst1 pointer does not directly
	//   reflect where the faulty load was.
	//
	// - pipeline effect
	//   When you get a fault on a load, you may have valid data from
	//   previous loads not yet stored, still in transit. Such data must
	//   be stored normally before moving on to zeroing the rest.
	//
	// - single/multi dispersal independence.
	//
	// solution:
	// - we don't disrupt the pipeline, i.e. data in transit in
	//   the software pipeline will eventually be moved to memory.
	//   We simply replace the load with a simple mov and keep the
	//   pipeline going. We can't really do this inline because
	//   p16 is always reset to 1 when lc > 0.
	//
.failure_in_pipe1:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
1:
(p16)	mov val1[0]=r0
(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
	br.ctop.dptk 1b
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// This is the case where the byte by byte copy fails on the load
	// when we copy the head. We need to finish the pipeline and copy
	// zeros for the rest of the destination. Since this happens
	// at the top we still need to fill the body and tail.
.failure_in_pipe2:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
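	//
	// From the caller's point of view the recovery below behaves roughly
	// like this C sketch (illustrative only; 'copied' is a made-up name
	// for the number of source bytes read before the fault):
	//
	//	memset((char *)dst + copied, 0, len - copied);
	//	return len - copied;		/* bytes NOT copied */
	//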
2:
(p16)	mov val1[0]=r0
(EPI)	st1 [dst1]=val1[PIPE_DEPTH-1],1
	br.ctop.dptk 2b
	;;
	sub len=enddst,dst1,1	// precompute len
	br.cond.dptk.many .failure_in1bis
	;;

	//
	// Here we handle the head & tail part when we check for alignment.
	// The following code handles only the load failures. The
	// main difficulty comes from the fact that loads/stores are
	// scheduled. So when you fail on a load, the stores corresponding
	// to previous successful loads must be executed.
	//
	// However some simplifications are possible given the way
	// things work.
	//
	// 1) HEAD
	// Theory of operation:
	//
	//	Page A		| Page B
	//	----------------|-------
	//		       1|8 x
	//		     1 2|8 x
	//		       4|8 x
	//		     1 4|8 x
	//		     2 4|8 x
	//		   1 2 4|8 x
	//			|1
	//			|2 x
	//			|4 x
	//
	//	page_size >= 4k (2^12).  (x means 4, 2, 1)
	//	Here we suppose Page A exists and Page B does not.
	//
	// As we move towards eight byte alignment we may encounter faults.
	// The numbers on each page show the size of the load (current alignment).
	//
	// Key point:
	//	- if you fail on 1, 2, 4 then you have never executed any smaller
	//	  size loads, e.g. failing ld4 means no ld1 nor ld2 executed
	//	  before.
	//
	// This allows us to simplify the cleanup code, because basically you
	// only have to worry about "pending" stores in the case of a failing
	// ld8(). Given the way the code is written today, this means only
	// worrying about st2, st4. There we can use the information encapsulated
	// into the predicates.
	//
	// Other key point:
	//	- if you fail on the ld8 in the head, it means you went straight
	//	  to it, i.e. 8-byte alignment within a non-existent page.
	// Again this comes from the fact that if you crossed just for the ld8 then
	// you are 8-byte aligned but also 16-byte aligned, therefore you would
	// either go for the 16-byte copy loop OR the ld8 in the tail part.
	// The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
	// because it would mean you had 15 bytes to copy in which case you
	// would have defaulted to the byte by byte copy.
	//
	//
	// 2) TAIL
	// Here we know we have less than 16 bytes AND we are either 8 or 16 byte
	// aligned.
	//
	// Key point:
	// This means that we either:
	//	- are right on a page boundary
	//	OR
	//	- are at more than 16 bytes from a page boundary with
	//	  at most 15 bytes to copy: no chance of crossing.
	//
	// This allows us to assume that if we fail on a load we haven't possibly
	// executed any of the previous (tail) ones, so we don't need to do
	// any stores. For instance, if we fail on ld2, this means we had
	// 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
	//
	// This means that we are in a situation similar to a fault in the
	// head part. That's nice!
	//
.failure_in1:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
	sub len=endsrc,src1,1
	//
	// we know that ret0 can never be zero at this point
	// because we failed while trying to do a load, i.e. there is still
	// some work to do.
	// The failure_in1bis and length problem is taken care of at the
	// calling side.
	//
	;;
.failure_in1bis:		// from (.failure_in3)
	mov ar.lc=len		// Continue with a stupid byte store.
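	//
	// The loop below is a plain zero-fill; in C terms roughly
	// (illustrative sketch, with 'lc' standing for the ar.lc value
	// set above, i.e. the byte count minus one):
	//
	//	do { *dst1++ = 0; } while (lc--);
	//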
	;;
5:
	st1 [dst1]=r0,1
	br.cloop.dptk 5b
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// Here we simply restart the loop, but instead
	// of doing loads we fill the pipeline with zeroes.
	// We can't simply store r0 because we may have valid
	// data in transit in the pipeline.
	// ar.lc and ar.ec are set up correctly at this point
	//
	// we MUST use src1/endsrc here and not dst1/enddst because
	// of the pipeline effect.
	//
.failure_in3:
	sub ret0=endsrc,src1	// number of bytes to zero, i.e. not copied
	;;
2:
(p16)	mov val1[0]=r0
(p16)	mov val2[0]=r0
(EPI)	st8 [dst1]=val1[PIPE_DEPTH-1],16
(EPI)	st8 [dst2]=val2[PIPE_DEPTH-1],16
	br.ctop.dptk 2b
	;;
	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
	sub len=enddst,dst1,1		// precompute len
(p6)	br.cond.dptk .failure_in1bis
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

.failure_in2:
	sub ret0=endsrc,src1
	cmp.ne p6,p0=dst1,enddst	// Do we need to finish the tail ?
	sub len=enddst,dst1,1		// precompute len
(p6)	br.cond.dptk .failure_in1bis
	;;
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc
	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp

	//
	// handling of failures on stores: that's the easy part
	//
.failure_out:
	sub ret0=enddst,dst1
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.lc=saved_lc

	mov ar.pfs=saved_pfs
	br.ret.sptk.many rp
END(__copy_user)