1 // SPDX-License-Identifier: GPL-2.0-only 2 #include <crypto/hash.h> 3 #include <linux/export.h> 4 #include <linux/bvec.h> 5 #include <linux/fault-inject-usercopy.h> 6 #include <linux/uio.h> 7 #include <linux/pagemap.h> 8 #include <linux/highmem.h> 9 #include <linux/slab.h> 10 #include <linux/vmalloc.h> 11 #include <linux/splice.h> 12 #include <linux/compat.h> 13 #include <net/checksum.h> 14 #include <linux/scatterlist.h> 15 #include <linux/instrumented.h> 16 17 #define PIPE_PARANOIA /* for now */ 18 19 /* covers iovec and kvec alike */ 20 #define iterate_iovec(i, n, __v, __p, skip, STEP) { \ 21 size_t left; \ 22 size_t wanted = n; \ 23 do { \ 24 __v.iov_len = min(n, __p->iov_len - skip); \ 25 if (likely(__v.iov_len)) { \ 26 __v.iov_base = __p->iov_base + skip; \ 27 left = (STEP); \ 28 __v.iov_len -= left; \ 29 skip += __v.iov_len; \ 30 n -= __v.iov_len; \ 31 if (skip < __p->iov_len) \ 32 break; \ 33 } \ 34 __p++; \ 35 skip = 0; \ 36 } while (n); \ 37 n = wanted - n; \ 38 } 39 40 #define iterate_bvec(i, n, __v, __bi, skip, STEP) { \ 41 struct bvec_iter __start; \ 42 __start.bi_size = n; \ 43 __start.bi_bvec_done = skip; \ 44 __start.bi_idx = 0; \ 45 for_each_bvec(__v, i->bvec, __bi, __start) { \ 46 (void)(STEP); \ 47 } \ 48 } 49 50 #define iterate_xarray(i, n, __v, skip, STEP) { \ 51 struct page *head = NULL; \ 52 size_t wanted = n, seg, offset; \ 53 loff_t start = i->xarray_start + skip; \ 54 pgoff_t index = start >> PAGE_SHIFT; \ 55 int j; \ 56 \ 57 XA_STATE(xas, i->xarray, index); \ 58 \ 59 rcu_read_lock(); \ 60 xas_for_each(&xas, head, ULONG_MAX) { \ 61 if (xas_retry(&xas, head)) \ 62 continue; \ 63 if (WARN_ON(xa_is_value(head))) \ 64 break; \ 65 if (WARN_ON(PageHuge(head))) \ 66 break; \ 67 for (j = (head->index < index) ? index - head->index : 0; \ 68 j < thp_nr_pages(head); j++) { \ 69 __v.bv_page = head + j; \ 70 offset = (i->xarray_start + skip) & ~PAGE_MASK; \ 71 seg = PAGE_SIZE - offset; \ 72 __v.bv_offset = offset; \ 73 __v.bv_len = min(n, seg); \ 74 (void)(STEP); \ 75 n -= __v.bv_len; \ 76 skip += __v.bv_len; \ 77 if (n == 0) \ 78 break; \ 79 } \ 80 if (n == 0) \ 81 break; \ 82 } \ 83 rcu_read_unlock(); \ 84 n = wanted - n; \ 85 } 86 87 #define iterate_and_advance(i, n, v, I, B, K, X) { \ 88 if (unlikely(i->count < n)) \ 89 n = i->count; \ 90 if (likely(n)) { \ 91 size_t skip = i->iov_offset; \ 92 if (likely(iter_is_iovec(i))) { \ 93 const struct iovec *iov = i->iov; \ 94 struct iovec v; \ 95 iterate_iovec(i, n, v, iov, skip, (I)) \ 96 i->nr_segs -= iov - i->iov; \ 97 i->iov = iov; \ 98 } else if (iov_iter_is_bvec(i)) { \ 99 const struct bio_vec *bvec = i->bvec; \ 100 struct bio_vec v; \ 101 struct bvec_iter __bi; \ 102 iterate_bvec(i, n, v, __bi, skip, (B)) \ 103 i->bvec = __bvec_iter_bvec(i->bvec, __bi); \ 104 i->nr_segs -= i->bvec - bvec; \ 105 skip = __bi.bi_bvec_done; \ 106 } else if (iov_iter_is_kvec(i)) { \ 107 const struct kvec *kvec = i->kvec; \ 108 struct kvec v; \ 109 iterate_iovec(i, n, v, kvec, skip, \ 110 ((void)(K),0)) \ 111 i->nr_segs -= kvec - i->kvec; \ 112 i->kvec = kvec; \ 113 } else if (iov_iter_is_xarray(i)) { \ 114 struct bio_vec v; \ 115 iterate_xarray(i, n, v, skip, (X)) \ 116 } \ 117 i->count -= n; \ 118 i->iov_offset = skip; \ 119 } \ 120 } 121 122 static int copyout(void __user *to, const void *from, size_t n) 123 { 124 if (should_fail_usercopy()) 125 return n; 126 if (access_ok(to, n)) { 127 instrument_copy_to_user(to, from, n); 128 n = raw_copy_to_user(to, from, n); 129 } 130 return n; 131 } 132 133 static int copyin(void *to, const void __user *from, size_t n) 134 { 135 if (should_fail_usercopy()) 136 return n; 137 if (access_ok(from, n)) { 138 instrument_copy_from_user(to, from, n); 139 n = raw_copy_from_user(to, from, n); 140 } 141 return n; 142 } 143 144 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, 145 struct iov_iter *i) 146 { 147 size_t skip, copy, left, wanted; 148 const struct iovec *iov; 149 char __user *buf; 150 void *kaddr, *from; 151 152 if (unlikely(bytes > i->count)) 153 bytes = i->count; 154 155 if (unlikely(!bytes)) 156 return 0; 157 158 might_fault(); 159 wanted = bytes; 160 iov = i->iov; 161 skip = i->iov_offset; 162 buf = iov->iov_base + skip; 163 copy = min(bytes, iov->iov_len - skip); 164 165 if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) { 166 kaddr = kmap_atomic(page); 167 from = kaddr + offset; 168 169 /* first chunk, usually the only one */ 170 left = copyout(buf, from, copy); 171 copy -= left; 172 skip += copy; 173 from += copy; 174 bytes -= copy; 175 176 while (unlikely(!left && bytes)) { 177 iov++; 178 buf = iov->iov_base; 179 copy = min(bytes, iov->iov_len); 180 left = copyout(buf, from, copy); 181 copy -= left; 182 skip = copy; 183 from += copy; 184 bytes -= copy; 185 } 186 if (likely(!bytes)) { 187 kunmap_atomic(kaddr); 188 goto done; 189 } 190 offset = from - kaddr; 191 buf += copy; 192 kunmap_atomic(kaddr); 193 copy = min(bytes, iov->iov_len - skip); 194 } 195 /* Too bad - revert to non-atomic kmap */ 196 197 kaddr = kmap(page); 198 from = kaddr + offset; 199 left = copyout(buf, from, copy); 200 copy -= left; 201 skip += copy; 202 from += copy; 203 bytes -= copy; 204 while (unlikely(!left && bytes)) { 205 iov++; 206 buf = iov->iov_base; 207 copy = min(bytes, iov->iov_len); 208 left = copyout(buf, from, copy); 209 copy -= left; 210 skip = copy; 211 from += copy; 212 bytes -= copy; 213 } 214 kunmap(page); 215 216 done: 217 if (skip == iov->iov_len) { 218 iov++; 219 skip = 0; 220 } 221 i->count -= wanted - bytes; 222 i->nr_segs -= iov - i->iov; 223 i->iov = iov; 224 i->iov_offset = skip; 225 return wanted - bytes; 226 } 227 228 static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes, 229 struct iov_iter *i) 230 { 231 size_t skip, copy, left, wanted; 232 const struct iovec *iov; 233 char __user *buf; 234 void *kaddr, *to; 235 236 if (unlikely(bytes > i->count)) 237 bytes = i->count; 238 239 if (unlikely(!bytes)) 240 return 0; 241 242 might_fault(); 243 wanted = bytes; 244 iov = i->iov; 245 skip = i->iov_offset; 246 buf = iov->iov_base + skip; 247 copy = min(bytes, iov->iov_len - skip); 248 249 if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) { 250 kaddr = kmap_atomic(page); 251 to = kaddr + offset; 252 253 /* first chunk, usually the only one */ 254 left = copyin(to, buf, copy); 255 copy -= left; 256 skip += copy; 257 to += copy; 258 bytes -= copy; 259 260 while (unlikely(!left && bytes)) { 261 iov++; 262 buf = iov->iov_base; 263 copy = min(bytes, iov->iov_len); 264 left = copyin(to, buf, copy); 265 copy -= left; 266 skip = copy; 267 to += copy; 268 bytes -= copy; 269 } 270 if (likely(!bytes)) { 271 kunmap_atomic(kaddr); 272 goto done; 273 } 274 offset = to - kaddr; 275 buf += copy; 276 kunmap_atomic(kaddr); 277 copy = min(bytes, iov->iov_len - skip); 278 } 279 /* Too bad - revert to non-atomic kmap */ 280 281 kaddr = kmap(page); 282 to = kaddr + offset; 283 left = copyin(to, buf, copy); 284 copy -= left; 285 skip += copy; 286 to += copy; 287 bytes -= copy; 288 while (unlikely(!left && bytes)) { 289 iov++; 290 buf = iov->iov_base; 291 copy = min(bytes, iov->iov_len); 292 left = copyin(to, buf, copy); 293 copy -= left; 294 skip = copy; 295 to += copy; 296 bytes -= copy; 297 } 298 kunmap(page); 299 300 done: 301 if (skip == iov->iov_len) { 302 iov++; 303 skip = 0; 304 } 305 i->count -= wanted - bytes; 306 i->nr_segs -= iov - i->iov; 307 i->iov = iov; 308 i->iov_offset = skip; 309 return wanted - bytes; 310 } 311 312 #ifdef PIPE_PARANOIA 313 static bool sanity(const struct iov_iter *i) 314 { 315 struct pipe_inode_info *pipe = i->pipe; 316 unsigned int p_head = pipe->head; 317 unsigned int p_tail = pipe->tail; 318 unsigned int p_mask = pipe->ring_size - 1; 319 unsigned int p_occupancy = pipe_occupancy(p_head, p_tail); 320 unsigned int i_head = i->head; 321 unsigned int idx; 322 323 if (i->iov_offset) { 324 struct pipe_buffer *p; 325 if (unlikely(p_occupancy == 0)) 326 goto Bad; // pipe must be non-empty 327 if (unlikely(i_head != p_head - 1)) 328 goto Bad; // must be at the last buffer... 329 330 p = &pipe->bufs[i_head & p_mask]; 331 if (unlikely(p->offset + p->len != i->iov_offset)) 332 goto Bad; // ... at the end of segment 333 } else { 334 if (i_head != p_head) 335 goto Bad; // must be right after the last buffer 336 } 337 return true; 338 Bad: 339 printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset); 340 printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n", 341 p_head, p_tail, pipe->ring_size); 342 for (idx = 0; idx < pipe->ring_size; idx++) 343 printk(KERN_ERR "[%p %p %d %d]\n", 344 pipe->bufs[idx].ops, 345 pipe->bufs[idx].page, 346 pipe->bufs[idx].offset, 347 pipe->bufs[idx].len); 348 WARN_ON(1); 349 return false; 350 } 351 #else 352 #define sanity(i) true 353 #endif 354 355 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, 356 struct iov_iter *i) 357 { 358 struct pipe_inode_info *pipe = i->pipe; 359 struct pipe_buffer *buf; 360 unsigned int p_tail = pipe->tail; 361 unsigned int p_mask = pipe->ring_size - 1; 362 unsigned int i_head = i->head; 363 size_t off; 364 365 if (unlikely(bytes > i->count)) 366 bytes = i->count; 367 368 if (unlikely(!bytes)) 369 return 0; 370 371 if (!sanity(i)) 372 return 0; 373 374 off = i->iov_offset; 375 buf = &pipe->bufs[i_head & p_mask]; 376 if (off) { 377 if (offset == off && buf->page == page) { 378 /* merge with the last one */ 379 buf->len += bytes; 380 i->iov_offset += bytes; 381 goto out; 382 } 383 i_head++; 384 buf = &pipe->bufs[i_head & p_mask]; 385 } 386 if (pipe_full(i_head, p_tail, pipe->max_usage)) 387 return 0; 388 389 buf->ops = &page_cache_pipe_buf_ops; 390 get_page(page); 391 buf->page = page; 392 buf->offset = offset; 393 buf->len = bytes; 394 395 pipe->head = i_head + 1; 396 i->iov_offset = offset + bytes; 397 i->head = i_head; 398 out: 399 i->count -= bytes; 400 return bytes; 401 } 402 403 /* 404 * Fault in one or more iovecs of the given iov_iter, to a maximum length of 405 * bytes. For each iovec, fault in each page that constitutes the iovec. 406 * 407 * Return 0 on success, or non-zero if the memory could not be accessed (i.e. 408 * because it is an invalid address). 409 */ 410 int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes) 411 { 412 if (iter_is_iovec(i)) { 413 const struct iovec *p; 414 size_t skip; 415 416 if (bytes > i->count) 417 bytes = i->count; 418 for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) { 419 size_t len = min(bytes, p->iov_len - skip); 420 int err; 421 422 if (unlikely(!len)) 423 continue; 424 err = fault_in_pages_readable(p->iov_base + skip, len); 425 if (unlikely(err)) 426 return err; 427 bytes -= len; 428 } 429 } 430 return 0; 431 } 432 EXPORT_SYMBOL(iov_iter_fault_in_readable); 433 434 void iov_iter_init(struct iov_iter *i, unsigned int direction, 435 const struct iovec *iov, unsigned long nr_segs, 436 size_t count) 437 { 438 WARN_ON(direction & ~(READ | WRITE)); 439 WARN_ON_ONCE(uaccess_kernel()); 440 *i = (struct iov_iter) { 441 .iter_type = ITER_IOVEC, 442 .data_source = direction, 443 .iov = iov, 444 .nr_segs = nr_segs, 445 .iov_offset = 0, 446 .count = count 447 }; 448 } 449 EXPORT_SYMBOL(iov_iter_init); 450 451 static inline bool allocated(struct pipe_buffer *buf) 452 { 453 return buf->ops == &default_pipe_buf_ops; 454 } 455 456 static inline void data_start(const struct iov_iter *i, 457 unsigned int *iter_headp, size_t *offp) 458 { 459 unsigned int p_mask = i->pipe->ring_size - 1; 460 unsigned int iter_head = i->head; 461 size_t off = i->iov_offset; 462 463 if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) || 464 off == PAGE_SIZE)) { 465 iter_head++; 466 off = 0; 467 } 468 *iter_headp = iter_head; 469 *offp = off; 470 } 471 472 static size_t push_pipe(struct iov_iter *i, size_t size, 473 int *iter_headp, size_t *offp) 474 { 475 struct pipe_inode_info *pipe = i->pipe; 476 unsigned int p_tail = pipe->tail; 477 unsigned int p_mask = pipe->ring_size - 1; 478 unsigned int iter_head; 479 size_t off; 480 ssize_t left; 481 482 if (unlikely(size > i->count)) 483 size = i->count; 484 if (unlikely(!size)) 485 return 0; 486 487 left = size; 488 data_start(i, &iter_head, &off); 489 *iter_headp = iter_head; 490 *offp = off; 491 if (off) { 492 left -= PAGE_SIZE - off; 493 if (left <= 0) { 494 pipe->bufs[iter_head & p_mask].len += size; 495 return size; 496 } 497 pipe->bufs[iter_head & p_mask].len = PAGE_SIZE; 498 iter_head++; 499 } 500 while (!pipe_full(iter_head, p_tail, pipe->max_usage)) { 501 struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask]; 502 struct page *page = alloc_page(GFP_USER); 503 if (!page) 504 break; 505 506 buf->ops = &default_pipe_buf_ops; 507 buf->page = page; 508 buf->offset = 0; 509 buf->len = min_t(ssize_t, left, PAGE_SIZE); 510 left -= buf->len; 511 iter_head++; 512 pipe->head = iter_head; 513 514 if (left == 0) 515 return size; 516 } 517 return size - left; 518 } 519 520 static size_t copy_pipe_to_iter(const void *addr, size_t bytes, 521 struct iov_iter *i) 522 { 523 struct pipe_inode_info *pipe = i->pipe; 524 unsigned int p_mask = pipe->ring_size - 1; 525 unsigned int i_head; 526 size_t n, off; 527 528 if (!sanity(i)) 529 return 0; 530 531 bytes = n = push_pipe(i, bytes, &i_head, &off); 532 if (unlikely(!n)) 533 return 0; 534 do { 535 size_t chunk = min_t(size_t, n, PAGE_SIZE - off); 536 memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk); 537 i->head = i_head; 538 i->iov_offset = off + chunk; 539 n -= chunk; 540 addr += chunk; 541 off = 0; 542 i_head++; 543 } while (n); 544 i->count -= bytes; 545 return bytes; 546 } 547 548 static __wsum csum_and_memcpy(void *to, const void *from, size_t len, 549 __wsum sum, size_t off) 550 { 551 __wsum next = csum_partial_copy_nocheck(from, to, len); 552 return csum_block_add(sum, next, off); 553 } 554 555 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes, 556 struct csum_state *csstate, 557 struct iov_iter *i) 558 { 559 struct pipe_inode_info *pipe = i->pipe; 560 unsigned int p_mask = pipe->ring_size - 1; 561 __wsum sum = csstate->csum; 562 size_t off = csstate->off; 563 unsigned int i_head; 564 size_t n, r; 565 566 if (!sanity(i)) 567 return 0; 568 569 bytes = n = push_pipe(i, bytes, &i_head, &r); 570 if (unlikely(!n)) 571 return 0; 572 do { 573 size_t chunk = min_t(size_t, n, PAGE_SIZE - r); 574 char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page); 575 sum = csum_and_memcpy(p + r, addr, chunk, sum, off); 576 kunmap_atomic(p); 577 i->head = i_head; 578 i->iov_offset = r + chunk; 579 n -= chunk; 580 off += chunk; 581 addr += chunk; 582 r = 0; 583 i_head++; 584 } while (n); 585 i->count -= bytes; 586 csstate->csum = sum; 587 csstate->off = off; 588 return bytes; 589 } 590 591 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) 592 { 593 const char *from = addr; 594 if (unlikely(iov_iter_is_pipe(i))) 595 return copy_pipe_to_iter(addr, bytes, i); 596 if (iter_is_iovec(i)) 597 might_fault(); 598 iterate_and_advance(i, bytes, v, 599 copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len), 600 memcpy_to_page(v.bv_page, v.bv_offset, 601 (from += v.bv_len) - v.bv_len, v.bv_len), 602 memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len), 603 memcpy_to_page(v.bv_page, v.bv_offset, 604 (from += v.bv_len) - v.bv_len, v.bv_len) 605 ) 606 607 return bytes; 608 } 609 EXPORT_SYMBOL(_copy_to_iter); 610 611 #ifdef CONFIG_ARCH_HAS_COPY_MC 612 static int copyout_mc(void __user *to, const void *from, size_t n) 613 { 614 if (access_ok(to, n)) { 615 instrument_copy_to_user(to, from, n); 616 n = copy_mc_to_user((__force void *) to, from, n); 617 } 618 return n; 619 } 620 621 static unsigned long copy_mc_to_page(struct page *page, size_t offset, 622 const char *from, size_t len) 623 { 624 unsigned long ret; 625 char *to; 626 627 to = kmap_atomic(page); 628 ret = copy_mc_to_kernel(to + offset, from, len); 629 kunmap_atomic(to); 630 631 return ret; 632 } 633 634 static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, 635 struct iov_iter *i) 636 { 637 struct pipe_inode_info *pipe = i->pipe; 638 unsigned int p_mask = pipe->ring_size - 1; 639 unsigned int i_head; 640 size_t n, off, xfer = 0; 641 642 if (!sanity(i)) 643 return 0; 644 645 bytes = n = push_pipe(i, bytes, &i_head, &off); 646 if (unlikely(!n)) 647 return 0; 648 do { 649 size_t chunk = min_t(size_t, n, PAGE_SIZE - off); 650 unsigned long rem; 651 652 rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page, 653 off, addr, chunk); 654 i->head = i_head; 655 i->iov_offset = off + chunk - rem; 656 xfer += chunk - rem; 657 if (rem) 658 break; 659 n -= chunk; 660 addr += chunk; 661 off = 0; 662 i_head++; 663 } while (n); 664 i->count -= xfer; 665 return xfer; 666 } 667 668 /** 669 * _copy_mc_to_iter - copy to iter with source memory error exception handling 670 * @addr: source kernel address 671 * @bytes: total transfer length 672 * @iter: destination iterator 673 * 674 * The pmem driver deploys this for the dax operation 675 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the 676 * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes 677 * successfully copied. 678 * 679 * The main differences between this and typical _copy_to_iter(). 680 * 681 * * Typical tail/residue handling after a fault retries the copy 682 * byte-by-byte until the fault happens again. Re-triggering machine 683 * checks is potentially fatal so the implementation uses source 684 * alignment and poison alignment assumptions to avoid re-triggering 685 * hardware exceptions. 686 * 687 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies. 688 * Compare to copy_to_iter() where only ITER_IOVEC attempts might return 689 * a short copy. 690 */ 691 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) 692 { 693 const char *from = addr; 694 unsigned long rem, curr_addr, s_addr = (unsigned long) addr; 695 696 if (unlikely(iov_iter_is_pipe(i))) 697 return copy_mc_pipe_to_iter(addr, bytes, i); 698 if (iter_is_iovec(i)) 699 might_fault(); 700 iterate_and_advance(i, bytes, v, 701 copyout_mc(v.iov_base, (from += v.iov_len) - v.iov_len, 702 v.iov_len), 703 ({ 704 rem = copy_mc_to_page(v.bv_page, v.bv_offset, 705 (from += v.bv_len) - v.bv_len, v.bv_len); 706 if (rem) { 707 curr_addr = (unsigned long) from; 708 bytes = curr_addr - s_addr - rem; 709 return bytes; 710 } 711 }), 712 ({ 713 rem = copy_mc_to_kernel(v.iov_base, (from += v.iov_len) 714 - v.iov_len, v.iov_len); 715 if (rem) { 716 curr_addr = (unsigned long) from; 717 bytes = curr_addr - s_addr - rem; 718 return bytes; 719 } 720 }), 721 ({ 722 rem = copy_mc_to_page(v.bv_page, v.bv_offset, 723 (from += v.bv_len) - v.bv_len, v.bv_len); 724 if (rem) { 725 curr_addr = (unsigned long) from; 726 bytes = curr_addr - s_addr - rem; 727 rcu_read_unlock(); 728 i->iov_offset += bytes; 729 i->count -= bytes; 730 return bytes; 731 } 732 }) 733 ) 734 735 return bytes; 736 } 737 EXPORT_SYMBOL_GPL(_copy_mc_to_iter); 738 #endif /* CONFIG_ARCH_HAS_COPY_MC */ 739 740 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) 741 { 742 char *to = addr; 743 if (unlikely(iov_iter_is_pipe(i))) { 744 WARN_ON(1); 745 return 0; 746 } 747 if (iter_is_iovec(i)) 748 might_fault(); 749 iterate_and_advance(i, bytes, v, 750 copyin((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), 751 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, 752 v.bv_offset, v.bv_len), 753 memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), 754 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, 755 v.bv_offset, v.bv_len) 756 ) 757 758 return bytes; 759 } 760 EXPORT_SYMBOL(_copy_from_iter); 761 762 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) 763 { 764 char *to = addr; 765 if (unlikely(iov_iter_is_pipe(i))) { 766 WARN_ON(1); 767 return 0; 768 } 769 iterate_and_advance(i, bytes, v, 770 __copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len, 771 v.iov_base, v.iov_len), 772 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, 773 v.bv_offset, v.bv_len), 774 memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), 775 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, 776 v.bv_offset, v.bv_len) 777 ) 778 779 return bytes; 780 } 781 EXPORT_SYMBOL(_copy_from_iter_nocache); 782 783 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE 784 /** 785 * _copy_from_iter_flushcache - write destination through cpu cache 786 * @addr: destination kernel address 787 * @bytes: total transfer length 788 * @iter: source iterator 789 * 790 * The pmem driver arranges for filesystem-dax to use this facility via 791 * dax_copy_from_iter() for ensuring that writes to persistent memory 792 * are flushed through the CPU cache. It is differentiated from 793 * _copy_from_iter_nocache() in that guarantees all data is flushed for 794 * all iterator types. The _copy_from_iter_nocache() only attempts to 795 * bypass the cache for the ITER_IOVEC case, and on some archs may use 796 * instructions that strand dirty-data in the cache. 797 */ 798 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) 799 { 800 char *to = addr; 801 if (unlikely(iov_iter_is_pipe(i))) { 802 WARN_ON(1); 803 return 0; 804 } 805 iterate_and_advance(i, bytes, v, 806 __copy_from_user_flushcache((to += v.iov_len) - v.iov_len, 807 v.iov_base, v.iov_len), 808 memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page, 809 v.bv_offset, v.bv_len), 810 memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base, 811 v.iov_len), 812 memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page, 813 v.bv_offset, v.bv_len) 814 ) 815 816 return bytes; 817 } 818 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); 819 #endif 820 821 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) 822 { 823 struct page *head; 824 size_t v = n + offset; 825 826 /* 827 * The general case needs to access the page order in order 828 * to compute the page size. 829 * However, we mostly deal with order-0 pages and thus can 830 * avoid a possible cache line miss for requests that fit all 831 * page orders. 832 */ 833 if (n <= v && v <= PAGE_SIZE) 834 return true; 835 836 head = compound_head(page); 837 v += (page - head) << PAGE_SHIFT; 838 839 if (likely(n <= v && v <= (page_size(head)))) 840 return true; 841 WARN_ON(1); 842 return false; 843 } 844 845 static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes, 846 struct iov_iter *i) 847 { 848 if (likely(iter_is_iovec(i))) 849 return copy_page_to_iter_iovec(page, offset, bytes, i); 850 if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) { 851 void *kaddr = kmap_atomic(page); 852 size_t wanted = copy_to_iter(kaddr + offset, bytes, i); 853 kunmap_atomic(kaddr); 854 return wanted; 855 } 856 if (iov_iter_is_pipe(i)) 857 return copy_page_to_iter_pipe(page, offset, bytes, i); 858 if (unlikely(iov_iter_is_discard(i))) { 859 if (unlikely(i->count < bytes)) 860 bytes = i->count; 861 i->count -= bytes; 862 return bytes; 863 } 864 WARN_ON(1); 865 return 0; 866 } 867 868 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, 869 struct iov_iter *i) 870 { 871 size_t res = 0; 872 if (unlikely(!page_copy_sane(page, offset, bytes))) 873 return 0; 874 page += offset / PAGE_SIZE; // first subpage 875 offset %= PAGE_SIZE; 876 while (1) { 877 size_t n = __copy_page_to_iter(page, offset, 878 min(bytes, (size_t)PAGE_SIZE - offset), i); 879 res += n; 880 bytes -= n; 881 if (!bytes || !n) 882 break; 883 offset += n; 884 if (offset == PAGE_SIZE) { 885 page++; 886 offset = 0; 887 } 888 } 889 return res; 890 } 891 EXPORT_SYMBOL(copy_page_to_iter); 892 893 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, 894 struct iov_iter *i) 895 { 896 if (unlikely(!page_copy_sane(page, offset, bytes))) 897 return 0; 898 if (likely(iter_is_iovec(i))) 899 return copy_page_from_iter_iovec(page, offset, bytes, i); 900 if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) { 901 void *kaddr = kmap_atomic(page); 902 size_t wanted = _copy_from_iter(kaddr + offset, bytes, i); 903 kunmap_atomic(kaddr); 904 return wanted; 905 } 906 WARN_ON(1); 907 return 0; 908 } 909 EXPORT_SYMBOL(copy_page_from_iter); 910 911 static size_t pipe_zero(size_t bytes, struct iov_iter *i) 912 { 913 struct pipe_inode_info *pipe = i->pipe; 914 unsigned int p_mask = pipe->ring_size - 1; 915 unsigned int i_head; 916 size_t n, off; 917 918 if (!sanity(i)) 919 return 0; 920 921 bytes = n = push_pipe(i, bytes, &i_head, &off); 922 if (unlikely(!n)) 923 return 0; 924 925 do { 926 size_t chunk = min_t(size_t, n, PAGE_SIZE - off); 927 memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk); 928 i->head = i_head; 929 i->iov_offset = off + chunk; 930 n -= chunk; 931 off = 0; 932 i_head++; 933 } while (n); 934 i->count -= bytes; 935 return bytes; 936 } 937 938 size_t iov_iter_zero(size_t bytes, struct iov_iter *i) 939 { 940 if (unlikely(iov_iter_is_pipe(i))) 941 return pipe_zero(bytes, i); 942 iterate_and_advance(i, bytes, v, 943 clear_user(v.iov_base, v.iov_len), 944 memzero_page(v.bv_page, v.bv_offset, v.bv_len), 945 memset(v.iov_base, 0, v.iov_len), 946 memzero_page(v.bv_page, v.bv_offset, v.bv_len) 947 ) 948 949 return bytes; 950 } 951 EXPORT_SYMBOL(iov_iter_zero); 952 953 size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes, 954 struct iov_iter *i) 955 { 956 char *kaddr = kmap_atomic(page), *p = kaddr + offset; 957 if (unlikely(!page_copy_sane(page, offset, bytes))) { 958 kunmap_atomic(kaddr); 959 return 0; 960 } 961 if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { 962 kunmap_atomic(kaddr); 963 WARN_ON(1); 964 return 0; 965 } 966 iterate_and_advance(i, bytes, v, 967 copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), 968 memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page, 969 v.bv_offset, v.bv_len), 970 memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), 971 memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page, 972 v.bv_offset, v.bv_len) 973 ) 974 kunmap_atomic(kaddr); 975 return bytes; 976 } 977 EXPORT_SYMBOL(copy_page_from_iter_atomic); 978 979 static inline void pipe_truncate(struct iov_iter *i) 980 { 981 struct pipe_inode_info *pipe = i->pipe; 982 unsigned int p_tail = pipe->tail; 983 unsigned int p_head = pipe->head; 984 unsigned int p_mask = pipe->ring_size - 1; 985 986 if (!pipe_empty(p_head, p_tail)) { 987 struct pipe_buffer *buf; 988 unsigned int i_head = i->head; 989 size_t off = i->iov_offset; 990 991 if (off) { 992 buf = &pipe->bufs[i_head & p_mask]; 993 buf->len = off - buf->offset; 994 i_head++; 995 } 996 while (p_head != i_head) { 997 p_head--; 998 pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]); 999 } 1000 1001 pipe->head = p_head; 1002 } 1003 } 1004 1005 static void pipe_advance(struct iov_iter *i, size_t size) 1006 { 1007 struct pipe_inode_info *pipe = i->pipe; 1008 if (size) { 1009 struct pipe_buffer *buf; 1010 unsigned int p_mask = pipe->ring_size - 1; 1011 unsigned int i_head = i->head; 1012 size_t off = i->iov_offset, left = size; 1013 1014 if (off) /* make it relative to the beginning of buffer */ 1015 left += off - pipe->bufs[i_head & p_mask].offset; 1016 while (1) { 1017 buf = &pipe->bufs[i_head & p_mask]; 1018 if (left <= buf->len) 1019 break; 1020 left -= buf->len; 1021 i_head++; 1022 } 1023 i->head = i_head; 1024 i->iov_offset = buf->offset + left; 1025 } 1026 i->count -= size; 1027 /* ... and discard everything past that point */ 1028 pipe_truncate(i); 1029 } 1030 1031 static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) 1032 { 1033 struct bvec_iter bi; 1034 1035 bi.bi_size = i->count; 1036 bi.bi_bvec_done = i->iov_offset; 1037 bi.bi_idx = 0; 1038 bvec_iter_advance(i->bvec, &bi, size); 1039 1040 i->bvec += bi.bi_idx; 1041 i->nr_segs -= bi.bi_idx; 1042 i->count = bi.bi_size; 1043 i->iov_offset = bi.bi_bvec_done; 1044 } 1045 1046 static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) 1047 { 1048 const struct iovec *iov, *end; 1049 1050 if (!i->count) 1051 return; 1052 i->count -= size; 1053 1054 size += i->iov_offset; // from beginning of current segment 1055 for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) { 1056 if (likely(size < iov->iov_len)) 1057 break; 1058 size -= iov->iov_len; 1059 } 1060 i->iov_offset = size; 1061 i->nr_segs -= iov - i->iov; 1062 i->iov = iov; 1063 } 1064 1065 void iov_iter_advance(struct iov_iter *i, size_t size) 1066 { 1067 if (unlikely(i->count < size)) 1068 size = i->count; 1069 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { 1070 /* iovec and kvec have identical layouts */ 1071 iov_iter_iovec_advance(i, size); 1072 } else if (iov_iter_is_bvec(i)) { 1073 iov_iter_bvec_advance(i, size); 1074 } else if (iov_iter_is_pipe(i)) { 1075 pipe_advance(i, size); 1076 } else if (unlikely(iov_iter_is_xarray(i))) { 1077 i->iov_offset += size; 1078 i->count -= size; 1079 } else if (iov_iter_is_discard(i)) { 1080 i->count -= size; 1081 } 1082 } 1083 EXPORT_SYMBOL(iov_iter_advance); 1084 1085 void iov_iter_revert(struct iov_iter *i, size_t unroll) 1086 { 1087 if (!unroll) 1088 return; 1089 if (WARN_ON(unroll > MAX_RW_COUNT)) 1090 return; 1091 i->count += unroll; 1092 if (unlikely(iov_iter_is_pipe(i))) { 1093 struct pipe_inode_info *pipe = i->pipe; 1094 unsigned int p_mask = pipe->ring_size - 1; 1095 unsigned int i_head = i->head; 1096 size_t off = i->iov_offset; 1097 while (1) { 1098 struct pipe_buffer *b = &pipe->bufs[i_head & p_mask]; 1099 size_t n = off - b->offset; 1100 if (unroll < n) { 1101 off -= unroll; 1102 break; 1103 } 1104 unroll -= n; 1105 if (!unroll && i_head == i->start_head) { 1106 off = 0; 1107 break; 1108 } 1109 i_head--; 1110 b = &pipe->bufs[i_head & p_mask]; 1111 off = b->offset + b->len; 1112 } 1113 i->iov_offset = off; 1114 i->head = i_head; 1115 pipe_truncate(i); 1116 return; 1117 } 1118 if (unlikely(iov_iter_is_discard(i))) 1119 return; 1120 if (unroll <= i->iov_offset) { 1121 i->iov_offset -= unroll; 1122 return; 1123 } 1124 unroll -= i->iov_offset; 1125 if (iov_iter_is_xarray(i)) { 1126 BUG(); /* We should never go beyond the start of the specified 1127 * range since we might then be straying into pages that 1128 * aren't pinned. 1129 */ 1130 } else if (iov_iter_is_bvec(i)) { 1131 const struct bio_vec *bvec = i->bvec; 1132 while (1) { 1133 size_t n = (--bvec)->bv_len; 1134 i->nr_segs++; 1135 if (unroll <= n) { 1136 i->bvec = bvec; 1137 i->iov_offset = n - unroll; 1138 return; 1139 } 1140 unroll -= n; 1141 } 1142 } else { /* same logics for iovec and kvec */ 1143 const struct iovec *iov = i->iov; 1144 while (1) { 1145 size_t n = (--iov)->iov_len; 1146 i->nr_segs++; 1147 if (unroll <= n) { 1148 i->iov = iov; 1149 i->iov_offset = n - unroll; 1150 return; 1151 } 1152 unroll -= n; 1153 } 1154 } 1155 } 1156 EXPORT_SYMBOL(iov_iter_revert); 1157 1158 /* 1159 * Return the count of just the current iov_iter segment. 1160 */ 1161 size_t iov_iter_single_seg_count(const struct iov_iter *i) 1162 { 1163 if (i->nr_segs > 1) { 1164 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) 1165 return min(i->count, i->iov->iov_len - i->iov_offset); 1166 if (iov_iter_is_bvec(i)) 1167 return min(i->count, i->bvec->bv_len - i->iov_offset); 1168 } 1169 return i->count; 1170 } 1171 EXPORT_SYMBOL(iov_iter_single_seg_count); 1172 1173 void iov_iter_kvec(struct iov_iter *i, unsigned int direction, 1174 const struct kvec *kvec, unsigned long nr_segs, 1175 size_t count) 1176 { 1177 WARN_ON(direction & ~(READ | WRITE)); 1178 *i = (struct iov_iter){ 1179 .iter_type = ITER_KVEC, 1180 .data_source = direction, 1181 .kvec = kvec, 1182 .nr_segs = nr_segs, 1183 .iov_offset = 0, 1184 .count = count 1185 }; 1186 } 1187 EXPORT_SYMBOL(iov_iter_kvec); 1188 1189 void iov_iter_bvec(struct iov_iter *i, unsigned int direction, 1190 const struct bio_vec *bvec, unsigned long nr_segs, 1191 size_t count) 1192 { 1193 WARN_ON(direction & ~(READ | WRITE)); 1194 *i = (struct iov_iter){ 1195 .iter_type = ITER_BVEC, 1196 .data_source = direction, 1197 .bvec = bvec, 1198 .nr_segs = nr_segs, 1199 .iov_offset = 0, 1200 .count = count 1201 }; 1202 } 1203 EXPORT_SYMBOL(iov_iter_bvec); 1204 1205 void iov_iter_pipe(struct iov_iter *i, unsigned int direction, 1206 struct pipe_inode_info *pipe, 1207 size_t count) 1208 { 1209 BUG_ON(direction != READ); 1210 WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size)); 1211 *i = (struct iov_iter){ 1212 .iter_type = ITER_PIPE, 1213 .data_source = false, 1214 .pipe = pipe, 1215 .head = pipe->head, 1216 .start_head = pipe->head, 1217 .iov_offset = 0, 1218 .count = count 1219 }; 1220 } 1221 EXPORT_SYMBOL(iov_iter_pipe); 1222 1223 /** 1224 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray 1225 * @i: The iterator to initialise. 1226 * @direction: The direction of the transfer. 1227 * @xarray: The xarray to access. 1228 * @start: The start file position. 1229 * @count: The size of the I/O buffer in bytes. 1230 * 1231 * Set up an I/O iterator to either draw data out of the pages attached to an 1232 * inode or to inject data into those pages. The pages *must* be prevented 1233 * from evaporation, either by taking a ref on them or locking them by the 1234 * caller. 1235 */ 1236 void iov_iter_xarray(struct iov_iter *i, unsigned int direction, 1237 struct xarray *xarray, loff_t start, size_t count) 1238 { 1239 BUG_ON(direction & ~1); 1240 *i = (struct iov_iter) { 1241 .iter_type = ITER_XARRAY, 1242 .data_source = direction, 1243 .xarray = xarray, 1244 .xarray_start = start, 1245 .count = count, 1246 .iov_offset = 0 1247 }; 1248 } 1249 EXPORT_SYMBOL(iov_iter_xarray); 1250 1251 /** 1252 * iov_iter_discard - Initialise an I/O iterator that discards data 1253 * @i: The iterator to initialise. 1254 * @direction: The direction of the transfer. 1255 * @count: The size of the I/O buffer in bytes. 1256 * 1257 * Set up an I/O iterator that just discards everything that's written to it. 1258 * It's only available as a READ iterator. 1259 */ 1260 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) 1261 { 1262 BUG_ON(direction != READ); 1263 *i = (struct iov_iter){ 1264 .iter_type = ITER_DISCARD, 1265 .data_source = false, 1266 .count = count, 1267 .iov_offset = 0 1268 }; 1269 } 1270 EXPORT_SYMBOL(iov_iter_discard); 1271 1272 static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) 1273 { 1274 unsigned long res = 0; 1275 size_t size = i->count; 1276 size_t skip = i->iov_offset; 1277 unsigned k; 1278 1279 for (k = 0; k < i->nr_segs; k++, skip = 0) { 1280 size_t len = i->iov[k].iov_len - skip; 1281 if (len) { 1282 res |= (unsigned long)i->iov[k].iov_base + skip; 1283 if (len > size) 1284 len = size; 1285 res |= len; 1286 size -= len; 1287 if (!size) 1288 break; 1289 } 1290 } 1291 return res; 1292 } 1293 1294 static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) 1295 { 1296 unsigned res = 0; 1297 size_t size = i->count; 1298 unsigned skip = i->iov_offset; 1299 unsigned k; 1300 1301 for (k = 0; k < i->nr_segs; k++, skip = 0) { 1302 size_t len = i->bvec[k].bv_len - skip; 1303 res |= (unsigned long)i->bvec[k].bv_offset + skip; 1304 if (len > size) 1305 len = size; 1306 res |= len; 1307 size -= len; 1308 if (!size) 1309 break; 1310 } 1311 return res; 1312 } 1313 1314 unsigned long iov_iter_alignment(const struct iov_iter *i) 1315 { 1316 /* iovec and kvec have identical layouts */ 1317 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) 1318 return iov_iter_alignment_iovec(i); 1319 1320 if (iov_iter_is_bvec(i)) 1321 return iov_iter_alignment_bvec(i); 1322 1323 if (iov_iter_is_pipe(i)) { 1324 unsigned int p_mask = i->pipe->ring_size - 1; 1325 size_t size = i->count; 1326 1327 if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask])) 1328 return size | i->iov_offset; 1329 return size; 1330 } 1331 1332 if (iov_iter_is_xarray(i)) 1333 return (i->xarray_start + i->iov_offset) | i->count; 1334 1335 return 0; 1336 } 1337 EXPORT_SYMBOL(iov_iter_alignment); 1338 1339 unsigned long iov_iter_gap_alignment(const struct iov_iter *i) 1340 { 1341 unsigned long res = 0; 1342 unsigned long v = 0; 1343 size_t size = i->count; 1344 unsigned k; 1345 1346 if (WARN_ON(!iter_is_iovec(i))) 1347 return ~0U; 1348 1349 for (k = 0; k < i->nr_segs; k++) { 1350 if (i->iov[k].iov_len) { 1351 unsigned long base = (unsigned long)i->iov[k].iov_base; 1352 if (v) // if not the first one 1353 res |= base | v; // this start | previous end 1354 v = base + i->iov[k].iov_len; 1355 if (size <= i->iov[k].iov_len) 1356 break; 1357 size -= i->iov[k].iov_len; 1358 } 1359 } 1360 return res; 1361 } 1362 EXPORT_SYMBOL(iov_iter_gap_alignment); 1363 1364 static inline ssize_t __pipe_get_pages(struct iov_iter *i, 1365 size_t maxsize, 1366 struct page **pages, 1367 int iter_head, 1368 size_t *start) 1369 { 1370 struct pipe_inode_info *pipe = i->pipe; 1371 unsigned int p_mask = pipe->ring_size - 1; 1372 ssize_t n = push_pipe(i, maxsize, &iter_head, start); 1373 if (!n) 1374 return -EFAULT; 1375 1376 maxsize = n; 1377 n += *start; 1378 while (n > 0) { 1379 get_page(*pages++ = pipe->bufs[iter_head & p_mask].page); 1380 iter_head++; 1381 n -= PAGE_SIZE; 1382 } 1383 1384 return maxsize; 1385 } 1386 1387 static ssize_t pipe_get_pages(struct iov_iter *i, 1388 struct page **pages, size_t maxsize, unsigned maxpages, 1389 size_t *start) 1390 { 1391 unsigned int iter_head, npages; 1392 size_t capacity; 1393 1394 if (!sanity(i)) 1395 return -EFAULT; 1396 1397 data_start(i, &iter_head, start); 1398 /* Amount of free space: some of this one + all after this one */ 1399 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe); 1400 capacity = min(npages, maxpages) * PAGE_SIZE - *start; 1401 1402 return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start); 1403 } 1404 1405 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, 1406 pgoff_t index, unsigned int nr_pages) 1407 { 1408 XA_STATE(xas, xa, index); 1409 struct page *page; 1410 unsigned int ret = 0; 1411 1412 rcu_read_lock(); 1413 for (page = xas_load(&xas); page; page = xas_next(&xas)) { 1414 if (xas_retry(&xas, page)) 1415 continue; 1416 1417 /* Has the page moved or been split? */ 1418 if (unlikely(page != xas_reload(&xas))) { 1419 xas_reset(&xas); 1420 continue; 1421 } 1422 1423 pages[ret] = find_subpage(page, xas.xa_index); 1424 get_page(pages[ret]); 1425 if (++ret == nr_pages) 1426 break; 1427 } 1428 rcu_read_unlock(); 1429 return ret; 1430 } 1431 1432 static ssize_t iter_xarray_get_pages(struct iov_iter *i, 1433 struct page **pages, size_t maxsize, 1434 unsigned maxpages, size_t *_start_offset) 1435 { 1436 unsigned nr, offset; 1437 pgoff_t index, count; 1438 size_t size = maxsize, actual; 1439 loff_t pos; 1440 1441 if (!size || !maxpages) 1442 return 0; 1443 1444 pos = i->xarray_start + i->iov_offset; 1445 index = pos >> PAGE_SHIFT; 1446 offset = pos & ~PAGE_MASK; 1447 *_start_offset = offset; 1448 1449 count = 1; 1450 if (size > PAGE_SIZE - offset) { 1451 size -= PAGE_SIZE - offset; 1452 count += size >> PAGE_SHIFT; 1453 size &= ~PAGE_MASK; 1454 if (size) 1455 count++; 1456 } 1457 1458 if (count > maxpages) 1459 count = maxpages; 1460 1461 nr = iter_xarray_populate_pages(pages, i->xarray, index, count); 1462 if (nr == 0) 1463 return 0; 1464 1465 actual = PAGE_SIZE * nr; 1466 actual -= offset; 1467 if (nr == count && size > 0) { 1468 unsigned last_offset = (nr > 1) ? 0 : offset; 1469 actual -= PAGE_SIZE - (last_offset + size); 1470 } 1471 return actual; 1472 } 1473 1474 /* must be done on non-empty ITER_IOVEC one */ 1475 static unsigned long first_iovec_segment(const struct iov_iter *i, 1476 size_t *size, size_t *start, 1477 size_t maxsize, unsigned maxpages) 1478 { 1479 size_t skip; 1480 long k; 1481 1482 for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { 1483 unsigned long addr = (unsigned long)i->iov[k].iov_base + skip; 1484 size_t len = i->iov[k].iov_len - skip; 1485 1486 if (unlikely(!len)) 1487 continue; 1488 if (len > maxsize) 1489 len = maxsize; 1490 len += (*start = addr % PAGE_SIZE); 1491 if (len > maxpages * PAGE_SIZE) 1492 len = maxpages * PAGE_SIZE; 1493 *size = len; 1494 return addr & PAGE_MASK; 1495 } 1496 BUG(); // if it had been empty, we wouldn't get called 1497 } 1498 1499 /* must be done on non-empty ITER_BVEC one */ 1500 static struct page *first_bvec_segment(const struct iov_iter *i, 1501 size_t *size, size_t *start, 1502 size_t maxsize, unsigned maxpages) 1503 { 1504 struct page *page; 1505 size_t skip = i->iov_offset, len; 1506 1507 len = i->bvec->bv_len - skip; 1508 if (len > maxsize) 1509 len = maxsize; 1510 skip += i->bvec->bv_offset; 1511 page = i->bvec->bv_page + skip / PAGE_SIZE; 1512 len += (*start = skip % PAGE_SIZE); 1513 if (len > maxpages * PAGE_SIZE) 1514 len = maxpages * PAGE_SIZE; 1515 *size = len; 1516 return page; 1517 } 1518 1519 ssize_t iov_iter_get_pages(struct iov_iter *i, 1520 struct page **pages, size_t maxsize, unsigned maxpages, 1521 size_t *start) 1522 { 1523 size_t len; 1524 int n, res; 1525 1526 if (maxsize > i->count) 1527 maxsize = i->count; 1528 if (!maxsize) 1529 return 0; 1530 1531 if (likely(iter_is_iovec(i))) { 1532 unsigned long addr; 1533 1534 addr = first_iovec_segment(i, &len, start, maxsize, maxpages); 1535 n = DIV_ROUND_UP(len, PAGE_SIZE); 1536 res = get_user_pages_fast(addr, n, 1537 iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, 1538 pages); 1539 if (unlikely(res < 0)) 1540 return res; 1541 return (res == n ? len : res * PAGE_SIZE) - *start; 1542 } 1543 if (iov_iter_is_bvec(i)) { 1544 struct page *page; 1545 1546 page = first_bvec_segment(i, &len, start, maxsize, maxpages); 1547 n = DIV_ROUND_UP(len, PAGE_SIZE); 1548 while (n--) 1549 get_page(*pages++ = page++); 1550 return len - *start; 1551 } 1552 if (iov_iter_is_pipe(i)) 1553 return pipe_get_pages(i, pages, maxsize, maxpages, start); 1554 if (iov_iter_is_xarray(i)) 1555 return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); 1556 return -EFAULT; 1557 } 1558 EXPORT_SYMBOL(iov_iter_get_pages); 1559 1560 static struct page **get_pages_array(size_t n) 1561 { 1562 return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL); 1563 } 1564 1565 static ssize_t pipe_get_pages_alloc(struct iov_iter *i, 1566 struct page ***pages, size_t maxsize, 1567 size_t *start) 1568 { 1569 struct page **p; 1570 unsigned int iter_head, npages; 1571 ssize_t n; 1572 1573 if (!sanity(i)) 1574 return -EFAULT; 1575 1576 data_start(i, &iter_head, start); 1577 /* Amount of free space: some of this one + all after this one */ 1578 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe); 1579 n = npages * PAGE_SIZE - *start; 1580 if (maxsize > n) 1581 maxsize = n; 1582 else 1583 npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE); 1584 p = get_pages_array(npages); 1585 if (!p) 1586 return -ENOMEM; 1587 n = __pipe_get_pages(i, maxsize, p, iter_head, start); 1588 if (n > 0) 1589 *pages = p; 1590 else 1591 kvfree(p); 1592 return n; 1593 } 1594 1595 static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i, 1596 struct page ***pages, size_t maxsize, 1597 size_t *_start_offset) 1598 { 1599 struct page **p; 1600 unsigned nr, offset; 1601 pgoff_t index, count; 1602 size_t size = maxsize, actual; 1603 loff_t pos; 1604 1605 if (!size) 1606 return 0; 1607 1608 pos = i->xarray_start + i->iov_offset; 1609 index = pos >> PAGE_SHIFT; 1610 offset = pos & ~PAGE_MASK; 1611 *_start_offset = offset; 1612 1613 count = 1; 1614 if (size > PAGE_SIZE - offset) { 1615 size -= PAGE_SIZE - offset; 1616 count += size >> PAGE_SHIFT; 1617 size &= ~PAGE_MASK; 1618 if (size) 1619 count++; 1620 } 1621 1622 p = get_pages_array(count); 1623 if (!p) 1624 return -ENOMEM; 1625 *pages = p; 1626 1627 nr = iter_xarray_populate_pages(p, i->xarray, index, count); 1628 if (nr == 0) 1629 return 0; 1630 1631 actual = PAGE_SIZE * nr; 1632 actual -= offset; 1633 if (nr == count && size > 0) { 1634 unsigned last_offset = (nr > 1) ? 0 : offset; 1635 actual -= PAGE_SIZE - (last_offset + size); 1636 } 1637 return actual; 1638 } 1639 1640 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, 1641 struct page ***pages, size_t maxsize, 1642 size_t *start) 1643 { 1644 struct page **p; 1645 size_t len; 1646 int n, res; 1647 1648 if (maxsize > i->count) 1649 maxsize = i->count; 1650 if (!maxsize) 1651 return 0; 1652 1653 if (likely(iter_is_iovec(i))) { 1654 unsigned long addr; 1655 1656 addr = first_iovec_segment(i, &len, start, maxsize, ~0U); 1657 n = DIV_ROUND_UP(len, PAGE_SIZE); 1658 p = get_pages_array(n); 1659 if (!p) 1660 return -ENOMEM; 1661 res = get_user_pages_fast(addr, n, 1662 iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p); 1663 if (unlikely(res < 0)) { 1664 kvfree(p); 1665 return res; 1666 } 1667 *pages = p; 1668 return (res == n ? len : res * PAGE_SIZE) - *start; 1669 } 1670 if (iov_iter_is_bvec(i)) { 1671 struct page *page; 1672 1673 page = first_bvec_segment(i, &len, start, maxsize, ~0U); 1674 n = DIV_ROUND_UP(len, PAGE_SIZE); 1675 *pages = p = get_pages_array(n); 1676 if (!p) 1677 return -ENOMEM; 1678 while (n--) 1679 get_page(*p++ = page++); 1680 return len - *start; 1681 } 1682 if (iov_iter_is_pipe(i)) 1683 return pipe_get_pages_alloc(i, pages, maxsize, start); 1684 if (iov_iter_is_xarray(i)) 1685 return iter_xarray_get_pages_alloc(i, pages, maxsize, start); 1686 return -EFAULT; 1687 } 1688 EXPORT_SYMBOL(iov_iter_get_pages_alloc); 1689 1690 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, 1691 struct iov_iter *i) 1692 { 1693 char *to = addr; 1694 __wsum sum, next; 1695 size_t off = 0; 1696 sum = *csum; 1697 if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { 1698 WARN_ON(1); 1699 return 0; 1700 } 1701 iterate_and_advance(i, bytes, v, ({ 1702 next = csum_and_copy_from_user(v.iov_base, 1703 (to += v.iov_len) - v.iov_len, 1704 v.iov_len); 1705 if (next) { 1706 sum = csum_block_add(sum, next, off); 1707 off += v.iov_len; 1708 } 1709 next ? 0 : v.iov_len; 1710 }), ({ 1711 char *p = kmap_atomic(v.bv_page); 1712 sum = csum_and_memcpy((to += v.bv_len) - v.bv_len, 1713 p + v.bv_offset, v.bv_len, 1714 sum, off); 1715 kunmap_atomic(p); 1716 off += v.bv_len; 1717 }),({ 1718 sum = csum_and_memcpy((to += v.iov_len) - v.iov_len, 1719 v.iov_base, v.iov_len, 1720 sum, off); 1721 off += v.iov_len; 1722 }), ({ 1723 char *p = kmap_atomic(v.bv_page); 1724 sum = csum_and_memcpy((to += v.bv_len) - v.bv_len, 1725 p + v.bv_offset, v.bv_len, 1726 sum, off); 1727 kunmap_atomic(p); 1728 off += v.bv_len; 1729 }) 1730 ) 1731 *csum = sum; 1732 return bytes; 1733 } 1734 EXPORT_SYMBOL(csum_and_copy_from_iter); 1735 1736 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, 1737 struct iov_iter *i) 1738 { 1739 struct csum_state *csstate = _csstate; 1740 const char *from = addr; 1741 __wsum sum, next; 1742 size_t off; 1743 1744 if (unlikely(iov_iter_is_pipe(i))) 1745 return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i); 1746 1747 sum = csum_shift(csstate->csum, csstate->off); 1748 off = 0; 1749 if (unlikely(iov_iter_is_discard(i))) { 1750 WARN_ON(1); /* for now */ 1751 return 0; 1752 } 1753 iterate_and_advance(i, bytes, v, ({ 1754 next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, 1755 v.iov_base, 1756 v.iov_len); 1757 if (next) { 1758 sum = csum_block_add(sum, next, off); 1759 off += v.iov_len; 1760 } 1761 next ? 0 : v.iov_len; 1762 }), ({ 1763 char *p = kmap_atomic(v.bv_page); 1764 sum = csum_and_memcpy(p + v.bv_offset, 1765 (from += v.bv_len) - v.bv_len, 1766 v.bv_len, sum, off); 1767 kunmap_atomic(p); 1768 off += v.bv_len; 1769 }),({ 1770 sum = csum_and_memcpy(v.iov_base, 1771 (from += v.iov_len) - v.iov_len, 1772 v.iov_len, sum, off); 1773 off += v.iov_len; 1774 }), ({ 1775 char *p = kmap_atomic(v.bv_page); 1776 sum = csum_and_memcpy(p + v.bv_offset, 1777 (from += v.bv_len) - v.bv_len, 1778 v.bv_len, sum, off); 1779 kunmap_atomic(p); 1780 off += v.bv_len; 1781 }) 1782 ) 1783 csstate->csum = csum_shift(sum, csstate->off); 1784 csstate->off += bytes; 1785 return bytes; 1786 } 1787 EXPORT_SYMBOL(csum_and_copy_to_iter); 1788 1789 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, 1790 struct iov_iter *i) 1791 { 1792 #ifdef CONFIG_CRYPTO_HASH 1793 struct ahash_request *hash = hashp; 1794 struct scatterlist sg; 1795 size_t copied; 1796 1797 copied = copy_to_iter(addr, bytes, i); 1798 sg_init_one(&sg, addr, copied); 1799 ahash_request_set_crypt(hash, &sg, NULL, copied); 1800 crypto_ahash_update(hash); 1801 return copied; 1802 #else 1803 return 0; 1804 #endif 1805 } 1806 EXPORT_SYMBOL(hash_and_copy_to_iter); 1807 1808 static int iov_npages(const struct iov_iter *i, int maxpages) 1809 { 1810 size_t skip = i->iov_offset, size = i->count; 1811 const struct iovec *p; 1812 int npages = 0; 1813 1814 for (p = i->iov; size; skip = 0, p++) { 1815 unsigned offs = offset_in_page(p->iov_base + skip); 1816 size_t len = min(p->iov_len - skip, size); 1817 1818 if (len) { 1819 size -= len; 1820 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); 1821 if (unlikely(npages > maxpages)) 1822 return maxpages; 1823 } 1824 } 1825 return npages; 1826 } 1827 1828 static int bvec_npages(const struct iov_iter *i, int maxpages) 1829 { 1830 size_t skip = i->iov_offset, size = i->count; 1831 const struct bio_vec *p; 1832 int npages = 0; 1833 1834 for (p = i->bvec; size; skip = 0, p++) { 1835 unsigned offs = (p->bv_offset + skip) % PAGE_SIZE; 1836 size_t len = min(p->bv_len - skip, size); 1837 1838 size -= len; 1839 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); 1840 if (unlikely(npages > maxpages)) 1841 return maxpages; 1842 } 1843 return npages; 1844 } 1845 1846 int iov_iter_npages(const struct iov_iter *i, int maxpages) 1847 { 1848 if (unlikely(!i->count)) 1849 return 0; 1850 /* iovec and kvec have identical layouts */ 1851 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) 1852 return iov_npages(i, maxpages); 1853 if (iov_iter_is_bvec(i)) 1854 return bvec_npages(i, maxpages); 1855 if (iov_iter_is_pipe(i)) { 1856 unsigned int iter_head; 1857 int npages; 1858 size_t off; 1859 1860 if (!sanity(i)) 1861 return 0; 1862 1863 data_start(i, &iter_head, &off); 1864 /* some of this one + all after this one */ 1865 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe); 1866 return min(npages, maxpages); 1867 } 1868 if (iov_iter_is_xarray(i)) { 1869 unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; 1870 int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); 1871 return min(npages, maxpages); 1872 } 1873 return 0; 1874 } 1875 EXPORT_SYMBOL(iov_iter_npages); 1876 1877 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) 1878 { 1879 *new = *old; 1880 if (unlikely(iov_iter_is_pipe(new))) { 1881 WARN_ON(1); 1882 return NULL; 1883 } 1884 if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new))) 1885 return NULL; 1886 if (iov_iter_is_bvec(new)) 1887 return new->bvec = kmemdup(new->bvec, 1888 new->nr_segs * sizeof(struct bio_vec), 1889 flags); 1890 else 1891 /* iovec and kvec have identical layout */ 1892 return new->iov = kmemdup(new->iov, 1893 new->nr_segs * sizeof(struct iovec), 1894 flags); 1895 } 1896 EXPORT_SYMBOL(dup_iter); 1897 1898 static int copy_compat_iovec_from_user(struct iovec *iov, 1899 const struct iovec __user *uvec, unsigned long nr_segs) 1900 { 1901 const struct compat_iovec __user *uiov = 1902 (const struct compat_iovec __user *)uvec; 1903 int ret = -EFAULT, i; 1904 1905 if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) 1906 return -EFAULT; 1907 1908 for (i = 0; i < nr_segs; i++) { 1909 compat_uptr_t buf; 1910 compat_ssize_t len; 1911 1912 unsafe_get_user(len, &uiov[i].iov_len, uaccess_end); 1913 unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end); 1914 1915 /* check for compat_size_t not fitting in compat_ssize_t .. */ 1916 if (len < 0) { 1917 ret = -EINVAL; 1918 goto uaccess_end; 1919 } 1920 iov[i].iov_base = compat_ptr(buf); 1921 iov[i].iov_len = len; 1922 } 1923 1924 ret = 0; 1925 uaccess_end: 1926 user_access_end(); 1927 return ret; 1928 } 1929 1930 static int copy_iovec_from_user(struct iovec *iov, 1931 const struct iovec __user *uvec, unsigned long nr_segs) 1932 { 1933 unsigned long seg; 1934 1935 if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec))) 1936 return -EFAULT; 1937 for (seg = 0; seg < nr_segs; seg++) { 1938 if ((ssize_t)iov[seg].iov_len < 0) 1939 return -EINVAL; 1940 } 1941 1942 return 0; 1943 } 1944 1945 struct iovec *iovec_from_user(const struct iovec __user *uvec, 1946 unsigned long nr_segs, unsigned long fast_segs, 1947 struct iovec *fast_iov, bool compat) 1948 { 1949 struct iovec *iov = fast_iov; 1950 int ret; 1951 1952 /* 1953 * SuS says "The readv() function *may* fail if the iovcnt argument was 1954 * less than or equal to 0, or greater than {IOV_MAX}. Linux has 1955 * traditionally returned zero for zero segments, so... 1956 */ 1957 if (nr_segs == 0) 1958 return iov; 1959 if (nr_segs > UIO_MAXIOV) 1960 return ERR_PTR(-EINVAL); 1961 if (nr_segs > fast_segs) { 1962 iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); 1963 if (!iov) 1964 return ERR_PTR(-ENOMEM); 1965 } 1966 1967 if (compat) 1968 ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); 1969 else 1970 ret = copy_iovec_from_user(iov, uvec, nr_segs); 1971 if (ret) { 1972 if (iov != fast_iov) 1973 kfree(iov); 1974 return ERR_PTR(ret); 1975 } 1976 1977 return iov; 1978 } 1979 1980 ssize_t __import_iovec(int type, const struct iovec __user *uvec, 1981 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, 1982 struct iov_iter *i, bool compat) 1983 { 1984 ssize_t total_len = 0; 1985 unsigned long seg; 1986 struct iovec *iov; 1987 1988 iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); 1989 if (IS_ERR(iov)) { 1990 *iovp = NULL; 1991 return PTR_ERR(iov); 1992 } 1993 1994 /* 1995 * According to the Single Unix Specification we should return EINVAL if 1996 * an element length is < 0 when cast to ssize_t or if the total length 1997 * would overflow the ssize_t return value of the system call. 1998 * 1999 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the 2000 * overflow case. 2001 */ 2002 for (seg = 0; seg < nr_segs; seg++) { 2003 ssize_t len = (ssize_t)iov[seg].iov_len; 2004 2005 if (!access_ok(iov[seg].iov_base, len)) { 2006 if (iov != *iovp) 2007 kfree(iov); 2008 *iovp = NULL; 2009 return -EFAULT; 2010 } 2011 2012 if (len > MAX_RW_COUNT - total_len) { 2013 len = MAX_RW_COUNT - total_len; 2014 iov[seg].iov_len = len; 2015 } 2016 total_len += len; 2017 } 2018 2019 iov_iter_init(i, type, iov, nr_segs, total_len); 2020 if (iov == *iovp) 2021 *iovp = NULL; 2022 else 2023 *iovp = iov; 2024 return total_len; 2025 } 2026 2027 /** 2028 * import_iovec() - Copy an array of &struct iovec from userspace 2029 * into the kernel, check that it is valid, and initialize a new 2030 * &struct iov_iter iterator to access it. 2031 * 2032 * @type: One of %READ or %WRITE. 2033 * @uvec: Pointer to the userspace array. 2034 * @nr_segs: Number of elements in userspace array. 2035 * @fast_segs: Number of elements in @iov. 2036 * @iovp: (input and output parameter) Pointer to pointer to (usually small 2037 * on-stack) kernel array. 2038 * @i: Pointer to iterator that will be initialized on success. 2039 * 2040 * If the array pointed to by *@iov is large enough to hold all @nr_segs, 2041 * then this function places %NULL in *@iov on return. Otherwise, a new 2042 * array will be allocated and the result placed in *@iov. This means that 2043 * the caller may call kfree() on *@iov regardless of whether the small 2044 * on-stack array was used or not (and regardless of whether this function 2045 * returns an error or not). 2046 * 2047 * Return: Negative error code on error, bytes imported on success 2048 */ 2049 ssize_t import_iovec(int type, const struct iovec __user *uvec, 2050 unsigned nr_segs, unsigned fast_segs, 2051 struct iovec **iovp, struct iov_iter *i) 2052 { 2053 return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, 2054 in_compat_syscall()); 2055 } 2056 EXPORT_SYMBOL(import_iovec); 2057 2058 int import_single_range(int rw, void __user *buf, size_t len, 2059 struct iovec *iov, struct iov_iter *i) 2060 { 2061 if (len > MAX_RW_COUNT) 2062 len = MAX_RW_COUNT; 2063 if (unlikely(!access_ok(buf, len))) 2064 return -EFAULT; 2065 2066 iov->iov_base = buf; 2067 iov->iov_len = len; 2068 iov_iter_init(i, rw, iov, 1, len); 2069 return 0; 2070 } 2071 EXPORT_SYMBOL(import_single_range); 2072