1/* SPDX-License-Identifier: GPL-2.0 */ 2/* 3 * 4 * Optimized version of the copy_user() routine. 5 * It is used to copy date across the kernel/user boundary. 6 * 7 * The source and destination are always on opposite side of 8 * the boundary. When reading from user space we must catch 9 * faults on loads. When writing to user space we must catch 10 * errors on stores. Note that because of the nature of the copy 11 * we don't need to worry about overlapping regions. 12 * 13 * 14 * Inputs: 15 * in0 address of source buffer 16 * in1 address of destination buffer 17 * in2 number of bytes to copy 18 * 19 * Outputs: 20 * ret0 0 in case of success. The number of bytes NOT copied in 21 * case of error. 22 * 23 * Copyright (C) 2000-2001 Hewlett-Packard Co 24 * Stephane Eranian <eranian@hpl.hp.com> 25 * 26 * Fixme: 27 * - handle the case where we have more than 16 bytes and the alignment 28 * are different. 29 * - more benchmarking 30 * - fix extraneous stop bit introduced by the EX() macro. 31 */ 32 33#include <linux/export.h> 34#include <asm/asmmacro.h> 35 36// 37// Tuneable parameters 38// 39#define COPY_BREAK 16 // we do byte copy below (must be >=16) 40#define PIPE_DEPTH 21 // pipe depth 41 42#define EPI p[PIPE_DEPTH-1] 43 44// 45// arguments 46// 47#define dst in0 48#define src in1 49#define len in2 50 51// 52// local registers 53// 54#define t1 r2 // rshift in bytes 55#define t2 r3 // lshift in bytes 56#define rshift r14 // right shift in bits 57#define lshift r15 // left shift in bits 58#define word1 r16 59#define word2 r17 60#define cnt r18 61#define len2 r19 62#define saved_lc r20 63#define saved_pr r21 64#define tmp r22 65#define val r23 66#define src1 r24 67#define dst1 r25 68#define src2 r26 69#define dst2 r27 70#define len1 r28 71#define enddst r29 72#define endsrc r30 73#define saved_pfs r31 74 75GLOBAL_ENTRY(__copy_user) 76 .prologue 77 .save ar.pfs, saved_pfs 78 alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7) 79 80 .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH] 81 .rotp p[PIPE_DEPTH] 82 83 adds len2=-1,len // br.ctop is repeat/until 84 mov ret0=r0 85 86 ;; // RAW of cfm when len=0 87 cmp.eq p8,p0=r0,len // check for zero length 88 .save ar.lc, saved_lc 89 mov saved_lc=ar.lc // preserve ar.lc (slow) 90(p8) br.ret.spnt.many rp // empty mempcy() 91 ;; 92 add enddst=dst,len // first byte after end of source 93 add endsrc=src,len // first byte after end of destination 94 .save pr, saved_pr 95 mov saved_pr=pr // preserve predicates 96 97 .body 98 99 mov dst1=dst // copy because of rotation 100 mov ar.ec=PIPE_DEPTH 101 mov pr.rot=1<<16 // p16=true all others are false 102 103 mov src1=src // copy because of rotation 104 mov ar.lc=len2 // initialize lc for small count 105 cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy 106 107 xor tmp=src,dst // same alignment test prepare 108(p10) br.cond.dptk .long_copy_user 109 ;; // RAW pr.rot/p16 ? 110 // 111 // Now we do the byte by byte loop with software pipeline 112 // 113 // p7 is necessarily false by now 1141: 115 EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1) 116 EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) 117 br.ctop.dptk.few 1b 118 ;; 119 mov ar.lc=saved_lc 120 mov pr=saved_pr,0xffffffffffff0000 121 mov ar.pfs=saved_pfs // restore ar.ec 122 br.ret.sptk.many rp // end of short memcpy 123 124 // 125 // Not 8-byte aligned 126 // 127.diff_align_copy_user: 128 // At this point we know we have more than 16 bytes to copy 129 // and also that src and dest do _not_ have the same alignment. 130 and src2=0x7,src1 // src offset 131 and dst2=0x7,dst1 // dst offset 132 ;; 133 // The basic idea is that we copy byte-by-byte at the head so 134 // that we can reach 8-byte alignment for both src1 and dst1. 135 // Then copy the body using software pipelined 8-byte copy, 136 // shifting the two back-to-back words right and left, then copy 137 // the tail by copying byte-by-byte. 138 // 139 // Fault handling. If the byte-by-byte at the head fails on the 140 // load, then restart and finish the pipleline by copying zeros 141 // to the dst1. Then copy zeros for the rest of dst1. 142 // If 8-byte software pipeline fails on the load, do the same as 143 // failure_in3 does. If the byte-by-byte at the tail fails, it is 144 // handled simply by failure_in_pipe1. 145 // 146 // The case p14 represents the source has more bytes in the 147 // the first word (by the shifted part), whereas the p15 needs to 148 // copy some bytes from the 2nd word of the source that has the 149 // tail of the 1st of the destination. 150 // 151 152 // 153 // Optimization. If dst1 is 8-byte aligned (quite common), we don't need 154 // to copy the head to dst1, to start 8-byte copy software pipeline. 155 // We know src1 is not 8-byte aligned in this case. 156 // 157 cmp.eq p14,p15=r0,dst2 158(p15) br.cond.spnt 1f 159 ;; 160 sub t1=8,src2 161 mov t2=src2 162 ;; 163 shl rshift=t2,3 164 sub len1=len,t1 // set len1 165 ;; 166 sub lshift=64,rshift 167 ;; 168 br.cond.spnt .word_copy_user 169 ;; 1701: 171 cmp.leu p14,p15=src2,dst2 172 sub t1=dst2,src2 173 ;; 174 .pred.rel "mutex", p14, p15 175(p14) sub word1=8,src2 // (8 - src offset) 176(p15) sub t1=r0,t1 // absolute value 177(p15) sub word1=8,dst2 // (8 - dst offset) 178 ;; 179 // For the case p14, we don't need to copy the shifted part to 180 // the 1st word of destination. 181 sub t2=8,t1 182(p14) sub word1=word1,t1 183 ;; 184 sub len1=len,word1 // resulting len 185(p15) shl rshift=t1,3 // in bits 186(p14) shl rshift=t2,3 187 ;; 188(p14) sub len1=len1,t1 189 adds cnt=-1,word1 190 ;; 191 sub lshift=64,rshift 192 mov ar.ec=PIPE_DEPTH 193 mov pr.rot=1<<16 // p16=true all others are false 194 mov ar.lc=cnt 195 ;; 1962: 197 EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1) 198 EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) 199 br.ctop.dptk.few 2b 200 ;; 201 clrrrb 202 ;; 203.word_copy_user: 204 cmp.gtu p9,p0=16,len1 205(p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy 206 ;; 207 shr.u cnt=len1,3 // number of 64-bit words 208 ;; 209 adds cnt=-1,cnt 210 ;; 211 .pred.rel "mutex", p14, p15 212(p14) sub src1=src1,t2 213(p15) sub src1=src1,t1 214 // 215 // Now both src1 and dst1 point to an 8-byte aligned address. And 216 // we have more than 8 bytes to copy. 217 // 218 mov ar.lc=cnt 219 mov ar.ec=PIPE_DEPTH 220 mov pr.rot=1<<16 // p16=true all others are false 221 ;; 2223: 223 // 224 // The pipleline consists of 3 stages: 225 // 1 (p16): Load a word from src1 226 // 2 (EPI_1): Shift right pair, saving to tmp 227 // 3 (EPI): Store tmp to dst1 228 // 229 // To make it simple, use at least 2 (p16) loops to set up val1[n] 230 // because we need 2 back-to-back val1[] to get tmp. 231 // Note that this implies EPI_2 must be p18 or greater. 232 // 233 234#define EPI_1 p[PIPE_DEPTH-2] 235#define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift 236#define CASE(pred, shift) \ 237 (pred) br.cond.spnt .copy_user_bit##shift 238#define BODY(rshift) \ 239.copy_user_bit##rshift: \ 2401: \ 241 EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \ 242(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \ 243 EX(3f,(p16) ld8 val1[1]=[src1],8); \ 244(p16) mov val1[0]=r0; \ 245 br.ctop.dptk 1b; \ 246 ;; \ 247 br.cond.sptk.many .diff_align_do_tail; \ 2482: \ 249(EPI) st8 [dst1]=tmp,8; \ 250(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \ 2513: \ 252(p16) mov val1[1]=r0; \ 253(p16) mov val1[0]=r0; \ 254 br.ctop.dptk 2b; \ 255 ;; \ 256 br.cond.sptk.many .failure_in2 257 258 // 259 // Since the instruction 'shrp' requires a fixed 128-bit value 260 // specifying the bits to shift, we need to provide 7 cases 261 // below. 262 // 263 SWITCH(p6, 8) 264 SWITCH(p7, 16) 265 SWITCH(p8, 24) 266 SWITCH(p9, 32) 267 SWITCH(p10, 40) 268 SWITCH(p11, 48) 269 SWITCH(p12, 56) 270 ;; 271 CASE(p6, 8) 272 CASE(p7, 16) 273 CASE(p8, 24) 274 CASE(p9, 32) 275 CASE(p10, 40) 276 CASE(p11, 48) 277 CASE(p12, 56) 278 ;; 279 BODY(8) 280 BODY(16) 281 BODY(24) 282 BODY(32) 283 BODY(40) 284 BODY(48) 285 BODY(56) 286 ;; 287.diff_align_do_tail: 288 .pred.rel "mutex", p14, p15 289(p14) sub src1=src1,t1 290(p14) adds dst1=-8,dst1 291(p15) sub dst1=dst1,t1 292 ;; 2934: 294 // Tail correction. 295 // 296 // The problem with this piplelined loop is that the last word is not 297 // loaded and thus parf of the last word written is not correct. 298 // To fix that, we simply copy the tail byte by byte. 299 300 sub len1=endsrc,src1,1 301 clrrrb 302 ;; 303 mov ar.ec=PIPE_DEPTH 304 mov pr.rot=1<<16 // p16=true all others are false 305 mov ar.lc=len1 306 ;; 3075: 308 EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1) 309 EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1) 310 br.ctop.dptk.few 5b 311 ;; 312 mov ar.lc=saved_lc 313 mov pr=saved_pr,0xffffffffffff0000 314 mov ar.pfs=saved_pfs 315 br.ret.sptk.many rp 316 317 // 318 // Beginning of long mempcy (i.e. > 16 bytes) 319 // 320.long_copy_user: 321 tbit.nz p6,p7=src1,0 // odd alignment 322 and tmp=7,tmp 323 ;; 324 cmp.eq p10,p8=r0,tmp 325 mov len1=len // copy because of rotation 326(p8) br.cond.dpnt .diff_align_copy_user 327 ;; 328 // At this point we know we have more than 16 bytes to copy 329 // and also that both src and dest have the same alignment 330 // which may not be the one we want. So for now we must move 331 // forward slowly until we reach 16byte alignment: no need to 332 // worry about reaching the end of buffer. 333 // 334 EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned 335(p6) adds len1=-1,len1;; 336 tbit.nz p7,p0=src1,1 337 ;; 338 EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned 339(p7) adds len1=-2,len1;; 340 tbit.nz p8,p0=src1,2 341 ;; 342 // 343 // Stop bit not required after ld4 because if we fail on ld4 344 // we have never executed the ld1, therefore st1 is not executed. 345 // 346 EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned 347 ;; 348 EX(.failure_out,(p6) st1 [dst1]=val1[0],1) 349 tbit.nz p9,p0=src1,3 350 ;; 351 // 352 // Stop bit not required after ld8 because if we fail on ld8 353 // we have never executed the ld2, therefore st2 is not executed. 354 // 355 EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned 356 EX(.failure_out,(p7) st2 [dst1]=val1[1],2) 357(p8) adds len1=-4,len1 358 ;; 359 EX(.failure_out, (p8) st4 [dst1]=val2[0],4) 360(p9) adds len1=-8,len1;; 361 shr.u cnt=len1,4 // number of 128-bit (2x64bit) words 362 ;; 363 EX(.failure_out, (p9) st8 [dst1]=val2[1],8) 364 tbit.nz p6,p0=len1,3 365 cmp.eq p7,p0=r0,cnt 366 adds tmp=-1,cnt // br.ctop is repeat/until 367(p7) br.cond.dpnt .dotail // we have less than 16 bytes left 368 ;; 369 adds src2=8,src1 370 adds dst2=8,dst1 371 mov ar.lc=tmp 372 ;; 373 // 374 // 16bytes/iteration 375 // 3762: 377 EX(.failure_in3,(p16) ld8 val1[0]=[src1],16) 378(p16) ld8 val2[0]=[src2],16 379 380 EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16) 381(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16 382 br.ctop.dptk 2b 383 ;; // RAW on src1 when fall through from loop 384 // 385 // Tail correction based on len only 386 // 387 // No matter where we come from (loop or test) the src1 pointer 388 // is 16 byte aligned AND we have less than 16 bytes to copy. 389 // 390.dotail: 391 EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes 392 tbit.nz p7,p0=len1,2 393 ;; 394 EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes 395 tbit.nz p8,p0=len1,1 396 ;; 397 EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes 398 tbit.nz p9,p0=len1,0 399 ;; 400 EX(.failure_out, (p6) st8 [dst1]=val1[0],8) 401 ;; 402 EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left 403 mov ar.lc=saved_lc 404 ;; 405 EX(.failure_out,(p7) st4 [dst1]=val1[1],4) 406 mov pr=saved_pr,0xffffffffffff0000 407 ;; 408 EX(.failure_out, (p8) st2 [dst1]=val2[0],2) 409 mov ar.pfs=saved_pfs 410 ;; 411 EX(.failure_out, (p9) st1 [dst1]=val2[1]) 412 br.ret.sptk.many rp 413 414 415 // 416 // Here we handle the case where the byte by byte copy fails 417 // on the load. 418 // Several factors make the zeroing of the rest of the buffer kind of 419 // tricky: 420 // - the pipeline: loads/stores are not in sync (pipeline) 421 // 422 // In the same loop iteration, the dst1 pointer does not directly 423 // reflect where the faulty load was. 424 // 425 // - pipeline effect 426 // When you get a fault on load, you may have valid data from 427 // previous loads not yet store in transit. Such data must be 428 // store normally before moving onto zeroing the rest. 429 // 430 // - single/multi dispersal independence. 431 // 432 // solution: 433 // - we don't disrupt the pipeline, i.e. data in transit in 434 // the software pipeline will be eventually move to memory. 435 // We simply replace the load with a simple mov and keep the 436 // pipeline going. We can't really do this inline because 437 // p16 is always reset to 1 when lc > 0. 438 // 439.failure_in_pipe1: 440 sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied 4411: 442(p16) mov val1[0]=r0 443(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1 444 br.ctop.dptk 1b 445 ;; 446 mov pr=saved_pr,0xffffffffffff0000 447 mov ar.lc=saved_lc 448 mov ar.pfs=saved_pfs 449 br.ret.sptk.many rp 450 451 // 452 // This is the case where the byte by byte copy fails on the load 453 // when we copy the head. We need to finish the pipeline and copy 454 // zeros for the rest of the destination. Since this happens 455 // at the top we still need to fill the body and tail. 456.failure_in_pipe2: 457 sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied 4582: 459(p16) mov val1[0]=r0 460(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1 461 br.ctop.dptk 2b 462 ;; 463 sub len=enddst,dst1,1 // precompute len 464 br.cond.dptk.many .failure_in1bis 465 ;; 466 467 // 468 // Here we handle the head & tail part when we check for alignment. 469 // The following code handles only the load failures. The 470 // main diffculty comes from the fact that loads/stores are 471 // scheduled. So when you fail on a load, the stores corresponding 472 // to previous successful loads must be executed. 473 // 474 // However some simplifications are possible given the way 475 // things work. 476 // 477 // 1) HEAD 478 // Theory of operation: 479 // 480 // Page A | Page B 481 // ---------|----- 482 // 1|8 x 483 // 1 2|8 x 484 // 4|8 x 485 // 1 4|8 x 486 // 2 4|8 x 487 // 1 2 4|8 x 488 // |1 489 // |2 x 490 // |4 x 491 // 492 // page_size >= 4k (2^12). (x means 4, 2, 1) 493 // Here we suppose Page A exists and Page B does not. 494 // 495 // As we move towards eight byte alignment we may encounter faults. 496 // The numbers on each page show the size of the load (current alignment). 497 // 498 // Key point: 499 // - if you fail on 1, 2, 4 then you have never executed any smaller 500 // size loads, e.g. failing ld4 means no ld1 nor ld2 executed 501 // before. 502 // 503 // This allows us to simplify the cleanup code, because basically you 504 // only have to worry about "pending" stores in the case of a failing 505 // ld8(). Given the way the code is written today, this means only 506 // worry about st2, st4. There we can use the information encapsulated 507 // into the predicates. 508 // 509 // Other key point: 510 // - if you fail on the ld8 in the head, it means you went straight 511 // to it, i.e. 8byte alignment within an unexisting page. 512 // Again this comes from the fact that if you crossed just for the ld8 then 513 // you are 8byte aligned but also 16byte align, therefore you would 514 // either go for the 16byte copy loop OR the ld8 in the tail part. 515 // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible 516 // because it would mean you had 15bytes to copy in which case you 517 // would have defaulted to the byte by byte copy. 518 // 519 // 520 // 2) TAIL 521 // Here we now we have less than 16 bytes AND we are either 8 or 16 byte 522 // aligned. 523 // 524 // Key point: 525 // This means that we either: 526 // - are right on a page boundary 527 // OR 528 // - are at more than 16 bytes from a page boundary with 529 // at most 15 bytes to copy: no chance of crossing. 530 // 531 // This allows us to assume that if we fail on a load we haven't possibly 532 // executed any of the previous (tail) ones, so we don't need to do 533 // any stores. For instance, if we fail on ld2, this means we had 534 // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4. 535 // 536 // This means that we are in a situation similar the a fault in the 537 // head part. That's nice! 538 // 539.failure_in1: 540 sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied 541 sub len=endsrc,src1,1 542 // 543 // we know that ret0 can never be zero at this point 544 // because we failed why trying to do a load, i.e. there is still 545 // some work to do. 546 // The failure_in1bis and length problem is taken care of at the 547 // calling side. 548 // 549 ;; 550.failure_in1bis: // from (.failure_in3) 551 mov ar.lc=len // Continue with a stupid byte store. 552 ;; 5535: 554 st1 [dst1]=r0,1 555 br.cloop.dptk 5b 556 ;; 557 mov pr=saved_pr,0xffffffffffff0000 558 mov ar.lc=saved_lc 559 mov ar.pfs=saved_pfs 560 br.ret.sptk.many rp 561 562 // 563 // Here we simply restart the loop but instead 564 // of doing loads we fill the pipeline with zeroes 565 // We can't simply store r0 because we may have valid 566 // data in transit in the pipeline. 567 // ar.lc and ar.ec are setup correctly at this point 568 // 569 // we MUST use src1/endsrc here and not dst1/enddst because 570 // of the pipeline effect. 571 // 572.failure_in3: 573 sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied 574 ;; 5752: 576(p16) mov val1[0]=r0 577(p16) mov val2[0]=r0 578(EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16 579(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16 580 br.ctop.dptk 2b 581 ;; 582 cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ? 583 sub len=enddst,dst1,1 // precompute len 584(p6) br.cond.dptk .failure_in1bis 585 ;; 586 mov pr=saved_pr,0xffffffffffff0000 587 mov ar.lc=saved_lc 588 mov ar.pfs=saved_pfs 589 br.ret.sptk.many rp 590 591.failure_in2: 592 sub ret0=endsrc,src1 593 cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ? 594 sub len=enddst,dst1,1 // precompute len 595(p6) br.cond.dptk .failure_in1bis 596 ;; 597 mov pr=saved_pr,0xffffffffffff0000 598 mov ar.lc=saved_lc 599 mov ar.pfs=saved_pfs 600 br.ret.sptk.many rp 601 602 // 603 // handling of failures on stores: that's the easy part 604 // 605.failure_out: 606 sub ret0=enddst,dst1 607 mov pr=saved_pr,0xffffffffffff0000 608 mov ar.lc=saved_lc 609 610 mov ar.pfs=saved_pfs 611 br.ret.sptk.many rp 612END(__copy_user) 613EXPORT_SYMBOL(__copy_user) 614