1 // SPDX-License-Identifier: GPL-2.0-only 2 #include <crypto/hash.h> 3 #include <linux/export.h> 4 #include <linux/bvec.h> 5 #include <linux/fault-inject-usercopy.h> 6 #include <linux/uio.h> 7 #include <linux/pagemap.h> 8 #include <linux/highmem.h> 9 #include <linux/slab.h> 10 #include <linux/vmalloc.h> 11 #include <linux/splice.h> 12 #include <linux/compat.h> 13 #include <net/checksum.h> 14 #include <linux/scatterlist.h> 15 #include <linux/instrumented.h> 16 17 #define PIPE_PARANOIA /* for now */ 18 19 /* covers iovec and kvec alike */ 20 #define iterate_iovec(i, n, base, len, off, __p, STEP) { \ 21 size_t off = 0; \ 22 size_t skip = i->iov_offset; \ 23 do { \ 24 len = min(n, __p->iov_len - skip); \ 25 if (likely(len)) { \ 26 base = __p->iov_base + skip; \ 27 len -= (STEP); \ 28 off += len; \ 29 skip += len; \ 30 n -= len; \ 31 if (skip < __p->iov_len) \ 32 break; \ 33 } \ 34 __p++; \ 35 skip = 0; \ 36 } while (n); \ 37 i->iov_offset = skip; \ 38 n = off; \ 39 } 40 41 #define iterate_bvec(i, n, base, len, off, p, STEP) { \ 42 size_t off = 0; \ 43 unsigned skip = i->iov_offset; \ 44 while (n) { \ 45 unsigned offset = p->bv_offset + skip; \ 46 unsigned left; \ 47 void *kaddr = kmap_local_page(p->bv_page + \ 48 offset / PAGE_SIZE); \ 49 base = kaddr + offset % PAGE_SIZE; \ 50 len = min(min(n, (size_t)(p->bv_len - skip)), \ 51 (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \ 52 left = (STEP); \ 53 kunmap_local(kaddr); \ 54 len -= left; \ 55 off += len; \ 56 skip += len; \ 57 if (skip == p->bv_len) { \ 58 skip = 0; \ 59 p++; \ 60 } \ 61 n -= len; \ 62 if (left) \ 63 break; \ 64 } \ 65 i->iov_offset = skip; \ 66 n = off; \ 67 } 68 69 #define iterate_xarray(i, n, base, len, __off, STEP) { \ 70 __label__ __out; \ 71 size_t __off = 0; \ 72 struct page *head = NULL; \ 73 loff_t start = i->xarray_start + i->iov_offset; \ 74 unsigned offset = start % PAGE_SIZE; \ 75 pgoff_t index = start / PAGE_SIZE; \ 76 int j; \ 77 \ 78 XA_STATE(xas, i->xarray, index); \ 79 \ 80 
rcu_read_lock(); \ 81 xas_for_each(&xas, head, ULONG_MAX) { \ 82 unsigned left; \ 83 if (xas_retry(&xas, head)) \ 84 continue; \ 85 if (WARN_ON(xa_is_value(head))) \ 86 break; \ 87 if (WARN_ON(PageHuge(head))) \ 88 break; \ 89 for (j = (head->index < index) ? index - head->index : 0; \ 90 j < thp_nr_pages(head); j++) { \ 91 void *kaddr = kmap_local_page(head + j); \ 92 base = kaddr + offset; \ 93 len = PAGE_SIZE - offset; \ 94 len = min(n, len); \ 95 left = (STEP); \ 96 kunmap_local(kaddr); \ 97 len -= left; \ 98 __off += len; \ 99 n -= len; \ 100 if (left || n == 0) \ 101 goto __out; \ 102 offset = 0; \ 103 } \ 104 } \ 105 __out: \ 106 rcu_read_unlock(); \ 107 i->iov_offset += __off; \ 108 n = __off; \ 109 } 110 111 #define __iterate_and_advance(i, n, base, len, off, I, K) { \ 112 if (unlikely(i->count < n)) \ 113 n = i->count; \ 114 if (likely(n)) { \ 115 if (likely(iter_is_iovec(i))) { \ 116 const struct iovec *iov = i->iov; \ 117 void __user *base; \ 118 size_t len; \ 119 iterate_iovec(i, n, base, len, off, \ 120 iov, (I)) \ 121 i->nr_segs -= iov - i->iov; \ 122 i->iov = iov; \ 123 } else if (iov_iter_is_bvec(i)) { \ 124 const struct bio_vec *bvec = i->bvec; \ 125 void *base; \ 126 size_t len; \ 127 iterate_bvec(i, n, base, len, off, \ 128 bvec, (K)) \ 129 i->nr_segs -= bvec - i->bvec; \ 130 i->bvec = bvec; \ 131 } else if (iov_iter_is_kvec(i)) { \ 132 const struct kvec *kvec = i->kvec; \ 133 void *base; \ 134 size_t len; \ 135 iterate_iovec(i, n, base, len, off, \ 136 kvec, (K)) \ 137 i->nr_segs -= kvec - i->kvec; \ 138 i->kvec = kvec; \ 139 } else if (iov_iter_is_xarray(i)) { \ 140 void *base; \ 141 size_t len; \ 142 iterate_xarray(i, n, base, len, off, \ 143 (K)) \ 144 } \ 145 i->count -= n; \ 146 } \ 147 } 148 #define iterate_and_advance(i, n, base, len, off, I, K) \ 149 __iterate_and_advance(i, n, base, len, off, I, ((void)(K),0)) 150 151 static int copyout(void __user *to, const void *from, size_t n) 152 { 153 if (should_fail_usercopy()) 154 return n; 155 
if (access_ok(to, n)) { 156 instrument_copy_to_user(to, from, n); 157 n = raw_copy_to_user(to, from, n); 158 } 159 return n; 160 } 161 162 static int copyin(void *to, const void __user *from, size_t n) 163 { 164 if (should_fail_usercopy()) 165 return n; 166 if (access_ok(from, n)) { 167 instrument_copy_from_user(to, from, n); 168 n = raw_copy_from_user(to, from, n); 169 } 170 return n; 171 } 172 173 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, 174 struct iov_iter *i) 175 { 176 size_t skip, copy, left, wanted; 177 const struct iovec *iov; 178 char __user *buf; 179 void *kaddr, *from; 180 181 if (unlikely(bytes > i->count)) 182 bytes = i->count; 183 184 if (unlikely(!bytes)) 185 return 0; 186 187 might_fault(); 188 wanted = bytes; 189 iov = i->iov; 190 skip = i->iov_offset; 191 buf = iov->iov_base + skip; 192 copy = min(bytes, iov->iov_len - skip); 193 194 if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) { 195 kaddr = kmap_atomic(page); 196 from = kaddr + offset; 197 198 /* first chunk, usually the only one */ 199 left = copyout(buf, from, copy); 200 copy -= left; 201 skip += copy; 202 from += copy; 203 bytes -= copy; 204 205 while (unlikely(!left && bytes)) { 206 iov++; 207 buf = iov->iov_base; 208 copy = min(bytes, iov->iov_len); 209 left = copyout(buf, from, copy); 210 copy -= left; 211 skip = copy; 212 from += copy; 213 bytes -= copy; 214 } 215 if (likely(!bytes)) { 216 kunmap_atomic(kaddr); 217 goto done; 218 } 219 offset = from - kaddr; 220 buf += copy; 221 kunmap_atomic(kaddr); 222 copy = min(bytes, iov->iov_len - skip); 223 } 224 /* Too bad - revert to non-atomic kmap */ 225 226 kaddr = kmap(page); 227 from = kaddr + offset; 228 left = copyout(buf, from, copy); 229 copy -= left; 230 skip += copy; 231 from += copy; 232 bytes -= copy; 233 while (unlikely(!left && bytes)) { 234 iov++; 235 buf = iov->iov_base; 236 copy = min(bytes, iov->iov_len); 237 left = copyout(buf, from, copy); 238 copy -= left; 
239 skip = copy; 240 from += copy; 241 bytes -= copy; 242 } 243 kunmap(page); 244 245 done: 246 if (skip == iov->iov_len) { 247 iov++; 248 skip = 0; 249 } 250 i->count -= wanted - bytes; 251 i->nr_segs -= iov - i->iov; 252 i->iov = iov; 253 i->iov_offset = skip; 254 return wanted - bytes; 255 } 256 257 static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes, 258 struct iov_iter *i) 259 { 260 size_t skip, copy, left, wanted; 261 const struct iovec *iov; 262 char __user *buf; 263 void *kaddr, *to; 264 265 if (unlikely(bytes > i->count)) 266 bytes = i->count; 267 268 if (unlikely(!bytes)) 269 return 0; 270 271 might_fault(); 272 wanted = bytes; 273 iov = i->iov; 274 skip = i->iov_offset; 275 buf = iov->iov_base + skip; 276 copy = min(bytes, iov->iov_len - skip); 277 278 if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) { 279 kaddr = kmap_atomic(page); 280 to = kaddr + offset; 281 282 /* first chunk, usually the only one */ 283 left = copyin(to, buf, copy); 284 copy -= left; 285 skip += copy; 286 to += copy; 287 bytes -= copy; 288 289 while (unlikely(!left && bytes)) { 290 iov++; 291 buf = iov->iov_base; 292 copy = min(bytes, iov->iov_len); 293 left = copyin(to, buf, copy); 294 copy -= left; 295 skip = copy; 296 to += copy; 297 bytes -= copy; 298 } 299 if (likely(!bytes)) { 300 kunmap_atomic(kaddr); 301 goto done; 302 } 303 offset = to - kaddr; 304 buf += copy; 305 kunmap_atomic(kaddr); 306 copy = min(bytes, iov->iov_len - skip); 307 } 308 /* Too bad - revert to non-atomic kmap */ 309 310 kaddr = kmap(page); 311 to = kaddr + offset; 312 left = copyin(to, buf, copy); 313 copy -= left; 314 skip += copy; 315 to += copy; 316 bytes -= copy; 317 while (unlikely(!left && bytes)) { 318 iov++; 319 buf = iov->iov_base; 320 copy = min(bytes, iov->iov_len); 321 left = copyin(to, buf, copy); 322 copy -= left; 323 skip = copy; 324 to += copy; 325 bytes -= copy; 326 } 327 kunmap(page); 328 329 done: 330 if (skip == iov->iov_len) { 
331 iov++; 332 skip = 0; 333 } 334 i->count -= wanted - bytes; 335 i->nr_segs -= iov - i->iov; 336 i->iov = iov; 337 i->iov_offset = skip; 338 return wanted - bytes; 339 } 340 341 #ifdef PIPE_PARANOIA 342 static bool sanity(const struct iov_iter *i) 343 { 344 struct pipe_inode_info *pipe = i->pipe; 345 unsigned int p_head = pipe->head; 346 unsigned int p_tail = pipe->tail; 347 unsigned int p_mask = pipe->ring_size - 1; 348 unsigned int p_occupancy = pipe_occupancy(p_head, p_tail); 349 unsigned int i_head = i->head; 350 unsigned int idx; 351 352 if (i->iov_offset) { 353 struct pipe_buffer *p; 354 if (unlikely(p_occupancy == 0)) 355 goto Bad; // pipe must be non-empty 356 if (unlikely(i_head != p_head - 1)) 357 goto Bad; // must be at the last buffer... 358 359 p = &pipe->bufs[i_head & p_mask]; 360 if (unlikely(p->offset + p->len != i->iov_offset)) 361 goto Bad; // ... at the end of segment 362 } else { 363 if (i_head != p_head) 364 goto Bad; // must be right after the last buffer 365 } 366 return true; 367 Bad: 368 printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset); 369 printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n", 370 p_head, p_tail, pipe->ring_size); 371 for (idx = 0; idx < pipe->ring_size; idx++) 372 printk(KERN_ERR "[%p %p %d %d]\n", 373 pipe->bufs[idx].ops, 374 pipe->bufs[idx].page, 375 pipe->bufs[idx].offset, 376 pipe->bufs[idx].len); 377 WARN_ON(1); 378 return false; 379 } 380 #else 381 #define sanity(i) true 382 #endif 383 384 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, 385 struct iov_iter *i) 386 { 387 struct pipe_inode_info *pipe = i->pipe; 388 struct pipe_buffer *buf; 389 unsigned int p_tail = pipe->tail; 390 unsigned int p_mask = pipe->ring_size - 1; 391 unsigned int i_head = i->head; 392 size_t off; 393 394 if (unlikely(bytes > i->count)) 395 bytes = i->count; 396 397 if (unlikely(!bytes)) 398 return 0; 399 400 if (!sanity(i)) 401 return 0; 402 403 off = i->iov_offset; 404 buf = 
&pipe->bufs[i_head & p_mask]; 405 if (off) { 406 if (offset == off && buf->page == page) { 407 /* merge with the last one */ 408 buf->len += bytes; 409 i->iov_offset += bytes; 410 goto out; 411 } 412 i_head++; 413 buf = &pipe->bufs[i_head & p_mask]; 414 } 415 if (pipe_full(i_head, p_tail, pipe->max_usage)) 416 return 0; 417 418 buf->ops = &page_cache_pipe_buf_ops; 419 get_page(page); 420 buf->page = page; 421 buf->offset = offset; 422 buf->len = bytes; 423 424 pipe->head = i_head + 1; 425 i->iov_offset = offset + bytes; 426 i->head = i_head; 427 out: 428 i->count -= bytes; 429 return bytes; 430 } 431 432 /* 433 * Fault in one or more iovecs of the given iov_iter, to a maximum length of 434 * bytes. For each iovec, fault in each page that constitutes the iovec. 435 * 436 * Return 0 on success, or non-zero if the memory could not be accessed (i.e. 437 * because it is an invalid address). 438 */ 439 int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes) 440 { 441 if (iter_is_iovec(i)) { 442 const struct iovec *p; 443 size_t skip; 444 445 if (bytes > i->count) 446 bytes = i->count; 447 for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) { 448 size_t len = min(bytes, p->iov_len - skip); 449 int err; 450 451 if (unlikely(!len)) 452 continue; 453 err = fault_in_pages_readable(p->iov_base + skip, len); 454 if (unlikely(err)) 455 return err; 456 bytes -= len; 457 } 458 } 459 return 0; 460 } 461 EXPORT_SYMBOL(iov_iter_fault_in_readable); 462 463 void iov_iter_init(struct iov_iter *i, unsigned int direction, 464 const struct iovec *iov, unsigned long nr_segs, 465 size_t count) 466 { 467 WARN_ON(direction & ~(READ | WRITE)); 468 WARN_ON_ONCE(uaccess_kernel()); 469 *i = (struct iov_iter) { 470 .iter_type = ITER_IOVEC, 471 .data_source = direction, 472 .iov = iov, 473 .nr_segs = nr_segs, 474 .iov_offset = 0, 475 .count = count 476 }; 477 } 478 EXPORT_SYMBOL(iov_iter_init); 479 480 static inline bool allocated(struct pipe_buffer *buf) 481 { 482 return 
buf->ops == &default_pipe_buf_ops; 483 } 484 485 static inline void data_start(const struct iov_iter *i, 486 unsigned int *iter_headp, size_t *offp) 487 { 488 unsigned int p_mask = i->pipe->ring_size - 1; 489 unsigned int iter_head = i->head; 490 size_t off = i->iov_offset; 491 492 if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) || 493 off == PAGE_SIZE)) { 494 iter_head++; 495 off = 0; 496 } 497 *iter_headp = iter_head; 498 *offp = off; 499 } 500 501 static size_t push_pipe(struct iov_iter *i, size_t size, 502 int *iter_headp, size_t *offp) 503 { 504 struct pipe_inode_info *pipe = i->pipe; 505 unsigned int p_tail = pipe->tail; 506 unsigned int p_mask = pipe->ring_size - 1; 507 unsigned int iter_head; 508 size_t off; 509 ssize_t left; 510 511 if (unlikely(size > i->count)) 512 size = i->count; 513 if (unlikely(!size)) 514 return 0; 515 516 left = size; 517 data_start(i, &iter_head, &off); 518 *iter_headp = iter_head; 519 *offp = off; 520 if (off) { 521 left -= PAGE_SIZE - off; 522 if (left <= 0) { 523 pipe->bufs[iter_head & p_mask].len += size; 524 return size; 525 } 526 pipe->bufs[iter_head & p_mask].len = PAGE_SIZE; 527 iter_head++; 528 } 529 while (!pipe_full(iter_head, p_tail, pipe->max_usage)) { 530 struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask]; 531 struct page *page = alloc_page(GFP_USER); 532 if (!page) 533 break; 534 535 buf->ops = &default_pipe_buf_ops; 536 buf->page = page; 537 buf->offset = 0; 538 buf->len = min_t(ssize_t, left, PAGE_SIZE); 539 left -= buf->len; 540 iter_head++; 541 pipe->head = iter_head; 542 543 if (left == 0) 544 return size; 545 } 546 return size - left; 547 } 548 549 static size_t copy_pipe_to_iter(const void *addr, size_t bytes, 550 struct iov_iter *i) 551 { 552 struct pipe_inode_info *pipe = i->pipe; 553 unsigned int p_mask = pipe->ring_size - 1; 554 unsigned int i_head; 555 size_t n, off; 556 557 if (!sanity(i)) 558 return 0; 559 560 bytes = n = push_pipe(i, bytes, &i_head, &off); 561 if (unlikely(!n)) 562 
return 0; 563 do { 564 size_t chunk = min_t(size_t, n, PAGE_SIZE - off); 565 memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk); 566 i->head = i_head; 567 i->iov_offset = off + chunk; 568 n -= chunk; 569 addr += chunk; 570 off = 0; 571 i_head++; 572 } while (n); 573 i->count -= bytes; 574 return bytes; 575 } 576 577 static __wsum csum_and_memcpy(void *to, const void *from, size_t len, 578 __wsum sum, size_t off) 579 { 580 __wsum next = csum_partial_copy_nocheck(from, to, len); 581 return csum_block_add(sum, next, off); 582 } 583 584 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes, 585 struct csum_state *csstate, 586 struct iov_iter *i) 587 { 588 struct pipe_inode_info *pipe = i->pipe; 589 unsigned int p_mask = pipe->ring_size - 1; 590 __wsum sum = csstate->csum; 591 size_t off = csstate->off; 592 unsigned int i_head; 593 size_t n, r; 594 595 if (!sanity(i)) 596 return 0; 597 598 bytes = n = push_pipe(i, bytes, &i_head, &r); 599 if (unlikely(!n)) 600 return 0; 601 do { 602 size_t chunk = min_t(size_t, n, PAGE_SIZE - r); 603 char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page); 604 sum = csum_and_memcpy(p + r, addr, chunk, sum, off); 605 kunmap_local(p); 606 i->head = i_head; 607 i->iov_offset = r + chunk; 608 n -= chunk; 609 off += chunk; 610 addr += chunk; 611 r = 0; 612 i_head++; 613 } while (n); 614 i->count -= bytes; 615 csstate->csum = sum; 616 csstate->off = off; 617 return bytes; 618 } 619 620 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) 621 { 622 if (unlikely(iov_iter_is_pipe(i))) 623 return copy_pipe_to_iter(addr, bytes, i); 624 if (iter_is_iovec(i)) 625 might_fault(); 626 iterate_and_advance(i, bytes, base, len, off, 627 copyout(base, addr + off, len), 628 memcpy(base, addr + off, len) 629 ) 630 631 return bytes; 632 } 633 EXPORT_SYMBOL(_copy_to_iter); 634 635 #ifdef CONFIG_ARCH_HAS_COPY_MC 636 static int copyout_mc(void __user *to, const void *from, size_t n) 637 { 638 if 
(access_ok(to, n)) { 639 instrument_copy_to_user(to, from, n); 640 n = copy_mc_to_user((__force void *) to, from, n); 641 } 642 return n; 643 } 644 645 static unsigned long copy_mc_to_page(struct page *page, size_t offset, 646 const char *from, size_t len) 647 { 648 unsigned long ret; 649 char *to; 650 651 to = kmap_atomic(page); 652 ret = copy_mc_to_kernel(to + offset, from, len); 653 kunmap_atomic(to); 654 655 return ret; 656 } 657 658 static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, 659 struct iov_iter *i) 660 { 661 struct pipe_inode_info *pipe = i->pipe; 662 unsigned int p_mask = pipe->ring_size - 1; 663 unsigned int i_head; 664 size_t n, off, xfer = 0; 665 666 if (!sanity(i)) 667 return 0; 668 669 bytes = n = push_pipe(i, bytes, &i_head, &off); 670 if (unlikely(!n)) 671 return 0; 672 do { 673 size_t chunk = min_t(size_t, n, PAGE_SIZE - off); 674 unsigned long rem; 675 676 rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page, 677 off, addr, chunk); 678 i->head = i_head; 679 i->iov_offset = off + chunk - rem; 680 xfer += chunk - rem; 681 if (rem) 682 break; 683 n -= chunk; 684 addr += chunk; 685 off = 0; 686 i_head++; 687 } while (n); 688 i->count -= xfer; 689 return xfer; 690 } 691 692 /** 693 * _copy_mc_to_iter - copy to iter with source memory error exception handling 694 * @addr: source kernel address 695 * @bytes: total transfer length 696 * @iter: destination iterator 697 * 698 * The pmem driver deploys this for the dax operation 699 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the 700 * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes 701 * successfully copied. 702 * 703 * The main differences between this and typical _copy_to_iter(). 704 * 705 * * Typical tail/residue handling after a fault retries the copy 706 * byte-by-byte until the fault happens again. 
Re-triggering machine 707 * checks is potentially fatal so the implementation uses source 708 * alignment and poison alignment assumptions to avoid re-triggering 709 * hardware exceptions. 710 * 711 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies. 712 * Compare to copy_to_iter() where only ITER_IOVEC attempts might return 713 * a short copy. 714 */ 715 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) 716 { 717 if (unlikely(iov_iter_is_pipe(i))) 718 return copy_mc_pipe_to_iter(addr, bytes, i); 719 if (iter_is_iovec(i)) 720 might_fault(); 721 __iterate_and_advance(i, bytes, base, len, off, 722 copyout_mc(base, addr + off, len), 723 copy_mc_to_kernel(base, addr + off, len) 724 ) 725 726 return bytes; 727 } 728 EXPORT_SYMBOL_GPL(_copy_mc_to_iter); 729 #endif /* CONFIG_ARCH_HAS_COPY_MC */ 730 731 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) 732 { 733 if (unlikely(iov_iter_is_pipe(i))) { 734 WARN_ON(1); 735 return 0; 736 } 737 if (iter_is_iovec(i)) 738 might_fault(); 739 iterate_and_advance(i, bytes, base, len, off, 740 copyin(addr + off, base, len), 741 memcpy(addr + off, base, len) 742 ) 743 744 return bytes; 745 } 746 EXPORT_SYMBOL(_copy_from_iter); 747 748 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) 749 { 750 if (unlikely(iov_iter_is_pipe(i))) { 751 WARN_ON(1); 752 return 0; 753 } 754 iterate_and_advance(i, bytes, base, len, off, 755 __copy_from_user_inatomic_nocache(addr + off, base, len), 756 memcpy(addr + off, base, len) 757 ) 758 759 return bytes; 760 } 761 EXPORT_SYMBOL(_copy_from_iter_nocache); 762 763 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE 764 /** 765 * _copy_from_iter_flushcache - write destination through cpu cache 766 * @addr: destination kernel address 767 * @bytes: total transfer length 768 * @iter: source iterator 769 * 770 * The pmem driver arranges for filesystem-dax to use this facility via 771 * dax_copy_from_iter() for ensuring that writes to 
persistent memory 772 * are flushed through the CPU cache. It is differentiated from 773 * _copy_from_iter_nocache() in that guarantees all data is flushed for 774 * all iterator types. The _copy_from_iter_nocache() only attempts to 775 * bypass the cache for the ITER_IOVEC case, and on some archs may use 776 * instructions that strand dirty-data in the cache. 777 */ 778 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) 779 { 780 if (unlikely(iov_iter_is_pipe(i))) { 781 WARN_ON(1); 782 return 0; 783 } 784 iterate_and_advance(i, bytes, base, len, off, 785 __copy_from_user_flushcache(addr + off, base, len), 786 memcpy_flushcache(addr + off, base, len) 787 ) 788 789 return bytes; 790 } 791 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); 792 #endif 793 794 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) 795 { 796 struct page *head; 797 size_t v = n + offset; 798 799 /* 800 * The general case needs to access the page order in order 801 * to compute the page size. 802 * However, we mostly deal with order-0 pages and thus can 803 * avoid a possible cache line miss for requests that fit all 804 * page orders. 
805 */ 806 if (n <= v && v <= PAGE_SIZE) 807 return true; 808 809 head = compound_head(page); 810 v += (page - head) << PAGE_SHIFT; 811 812 if (likely(n <= v && v <= (page_size(head)))) 813 return true; 814 WARN_ON(1); 815 return false; 816 } 817 818 static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes, 819 struct iov_iter *i) 820 { 821 if (likely(iter_is_iovec(i))) 822 return copy_page_to_iter_iovec(page, offset, bytes, i); 823 if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) { 824 void *kaddr = kmap_local_page(page); 825 size_t wanted = _copy_to_iter(kaddr + offset, bytes, i); 826 kunmap_local(kaddr); 827 return wanted; 828 } 829 if (iov_iter_is_pipe(i)) 830 return copy_page_to_iter_pipe(page, offset, bytes, i); 831 if (unlikely(iov_iter_is_discard(i))) { 832 if (unlikely(i->count < bytes)) 833 bytes = i->count; 834 i->count -= bytes; 835 return bytes; 836 } 837 WARN_ON(1); 838 return 0; 839 } 840 841 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, 842 struct iov_iter *i) 843 { 844 size_t res = 0; 845 if (unlikely(!page_copy_sane(page, offset, bytes))) 846 return 0; 847 page += offset / PAGE_SIZE; // first subpage 848 offset %= PAGE_SIZE; 849 while (1) { 850 size_t n = __copy_page_to_iter(page, offset, 851 min(bytes, (size_t)PAGE_SIZE - offset), i); 852 res += n; 853 bytes -= n; 854 if (!bytes || !n) 855 break; 856 offset += n; 857 if (offset == PAGE_SIZE) { 858 page++; 859 offset = 0; 860 } 861 } 862 return res; 863 } 864 EXPORT_SYMBOL(copy_page_to_iter); 865 866 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, 867 struct iov_iter *i) 868 { 869 if (unlikely(!page_copy_sane(page, offset, bytes))) 870 return 0; 871 if (likely(iter_is_iovec(i))) 872 return copy_page_from_iter_iovec(page, offset, bytes, i); 873 if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) { 874 void *kaddr = kmap_local_page(page); 875 size_t wanted = 
_copy_from_iter(kaddr + offset, bytes, i); 876 kunmap_local(kaddr); 877 return wanted; 878 } 879 WARN_ON(1); 880 return 0; 881 } 882 EXPORT_SYMBOL(copy_page_from_iter); 883 884 static size_t pipe_zero(size_t bytes, struct iov_iter *i) 885 { 886 struct pipe_inode_info *pipe = i->pipe; 887 unsigned int p_mask = pipe->ring_size - 1; 888 unsigned int i_head; 889 size_t n, off; 890 891 if (!sanity(i)) 892 return 0; 893 894 bytes = n = push_pipe(i, bytes, &i_head, &off); 895 if (unlikely(!n)) 896 return 0; 897 898 do { 899 size_t chunk = min_t(size_t, n, PAGE_SIZE - off); 900 memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk); 901 i->head = i_head; 902 i->iov_offset = off + chunk; 903 n -= chunk; 904 off = 0; 905 i_head++; 906 } while (n); 907 i->count -= bytes; 908 return bytes; 909 } 910 911 size_t iov_iter_zero(size_t bytes, struct iov_iter *i) 912 { 913 if (unlikely(iov_iter_is_pipe(i))) 914 return pipe_zero(bytes, i); 915 iterate_and_advance(i, bytes, base, len, count, 916 clear_user(base, len), 917 memset(base, 0, len) 918 ) 919 920 return bytes; 921 } 922 EXPORT_SYMBOL(iov_iter_zero); 923 924 size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes, 925 struct iov_iter *i) 926 { 927 char *kaddr = kmap_atomic(page), *p = kaddr + offset; 928 if (unlikely(!page_copy_sane(page, offset, bytes))) { 929 kunmap_atomic(kaddr); 930 return 0; 931 } 932 if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) { 933 kunmap_atomic(kaddr); 934 WARN_ON(1); 935 return 0; 936 } 937 iterate_and_advance(i, bytes, base, len, off, 938 copyin(p + off, base, len), 939 memcpy(p + off, base, len) 940 ) 941 kunmap_atomic(kaddr); 942 return bytes; 943 } 944 EXPORT_SYMBOL(copy_page_from_iter_atomic); 945 946 static inline void pipe_truncate(struct iov_iter *i) 947 { 948 struct pipe_inode_info *pipe = i->pipe; 949 unsigned int p_tail = pipe->tail; 950 unsigned int p_head = pipe->head; 951 unsigned int p_mask = pipe->ring_size - 1; 952 953 if 
(!pipe_empty(p_head, p_tail)) { 954 struct pipe_buffer *buf; 955 unsigned int i_head = i->head; 956 size_t off = i->iov_offset; 957 958 if (off) { 959 buf = &pipe->bufs[i_head & p_mask]; 960 buf->len = off - buf->offset; 961 i_head++; 962 } 963 while (p_head != i_head) { 964 p_head--; 965 pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]); 966 } 967 968 pipe->head = p_head; 969 } 970 } 971 972 static void pipe_advance(struct iov_iter *i, size_t size) 973 { 974 struct pipe_inode_info *pipe = i->pipe; 975 if (size) { 976 struct pipe_buffer *buf; 977 unsigned int p_mask = pipe->ring_size - 1; 978 unsigned int i_head = i->head; 979 size_t off = i->iov_offset, left = size; 980 981 if (off) /* make it relative to the beginning of buffer */ 982 left += off - pipe->bufs[i_head & p_mask].offset; 983 while (1) { 984 buf = &pipe->bufs[i_head & p_mask]; 985 if (left <= buf->len) 986 break; 987 left -= buf->len; 988 i_head++; 989 } 990 i->head = i_head; 991 i->iov_offset = buf->offset + left; 992 } 993 i->count -= size; 994 /* ... 
and discard everything past that point */ 995 pipe_truncate(i); 996 } 997 998 static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) 999 { 1000 struct bvec_iter bi; 1001 1002 bi.bi_size = i->count; 1003 bi.bi_bvec_done = i->iov_offset; 1004 bi.bi_idx = 0; 1005 bvec_iter_advance(i->bvec, &bi, size); 1006 1007 i->bvec += bi.bi_idx; 1008 i->nr_segs -= bi.bi_idx; 1009 i->count = bi.bi_size; 1010 i->iov_offset = bi.bi_bvec_done; 1011 } 1012 1013 static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) 1014 { 1015 const struct iovec *iov, *end; 1016 1017 if (!i->count) 1018 return; 1019 i->count -= size; 1020 1021 size += i->iov_offset; // from beginning of current segment 1022 for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) { 1023 if (likely(size < iov->iov_len)) 1024 break; 1025 size -= iov->iov_len; 1026 } 1027 i->iov_offset = size; 1028 i->nr_segs -= iov - i->iov; 1029 i->iov = iov; 1030 } 1031 1032 void iov_iter_advance(struct iov_iter *i, size_t size) 1033 { 1034 if (unlikely(i->count < size)) 1035 size = i->count; 1036 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { 1037 /* iovec and kvec have identical layouts */ 1038 iov_iter_iovec_advance(i, size); 1039 } else if (iov_iter_is_bvec(i)) { 1040 iov_iter_bvec_advance(i, size); 1041 } else if (iov_iter_is_pipe(i)) { 1042 pipe_advance(i, size); 1043 } else if (unlikely(iov_iter_is_xarray(i))) { 1044 i->iov_offset += size; 1045 i->count -= size; 1046 } else if (iov_iter_is_discard(i)) { 1047 i->count -= size; 1048 } 1049 } 1050 EXPORT_SYMBOL(iov_iter_advance); 1051 1052 void iov_iter_revert(struct iov_iter *i, size_t unroll) 1053 { 1054 if (!unroll) 1055 return; 1056 if (WARN_ON(unroll > MAX_RW_COUNT)) 1057 return; 1058 i->count += unroll; 1059 if (unlikely(iov_iter_is_pipe(i))) { 1060 struct pipe_inode_info *pipe = i->pipe; 1061 unsigned int p_mask = pipe->ring_size - 1; 1062 unsigned int i_head = i->head; 1063 size_t off = i->iov_offset; 1064 while (1) { 1065 struct 
pipe_buffer *b = &pipe->bufs[i_head & p_mask]; 1066 size_t n = off - b->offset; 1067 if (unroll < n) { 1068 off -= unroll; 1069 break; 1070 } 1071 unroll -= n; 1072 if (!unroll && i_head == i->start_head) { 1073 off = 0; 1074 break; 1075 } 1076 i_head--; 1077 b = &pipe->bufs[i_head & p_mask]; 1078 off = b->offset + b->len; 1079 } 1080 i->iov_offset = off; 1081 i->head = i_head; 1082 pipe_truncate(i); 1083 return; 1084 } 1085 if (unlikely(iov_iter_is_discard(i))) 1086 return; 1087 if (unroll <= i->iov_offset) { 1088 i->iov_offset -= unroll; 1089 return; 1090 } 1091 unroll -= i->iov_offset; 1092 if (iov_iter_is_xarray(i)) { 1093 BUG(); /* We should never go beyond the start of the specified 1094 * range since we might then be straying into pages that 1095 * aren't pinned. 1096 */ 1097 } else if (iov_iter_is_bvec(i)) { 1098 const struct bio_vec *bvec = i->bvec; 1099 while (1) { 1100 size_t n = (--bvec)->bv_len; 1101 i->nr_segs++; 1102 if (unroll <= n) { 1103 i->bvec = bvec; 1104 i->iov_offset = n - unroll; 1105 return; 1106 } 1107 unroll -= n; 1108 } 1109 } else { /* same logics for iovec and kvec */ 1110 const struct iovec *iov = i->iov; 1111 while (1) { 1112 size_t n = (--iov)->iov_len; 1113 i->nr_segs++; 1114 if (unroll <= n) { 1115 i->iov = iov; 1116 i->iov_offset = n - unroll; 1117 return; 1118 } 1119 unroll -= n; 1120 } 1121 } 1122 } 1123 EXPORT_SYMBOL(iov_iter_revert); 1124 1125 /* 1126 * Return the count of just the current iov_iter segment. 
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
	/* With a single segment (or pipe/xarray/discard) the answer is just
	 * the remaining byte count; otherwise cap it to what is left in the
	 * current segment. iovec and kvec share the same layout, so one
	 * branch covers both. */
	if (i->nr_segs > 1) {
		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
			return min(i->count, i->iov->iov_len - i->iov_offset);
		if (iov_iter_is_bvec(i))
			return min(i->count, i->bvec->bv_len - i->iov_offset);
	}
	return i->count;
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

/*
 * Initialise @i to iterate over an array of kernel-space kvec segments.
 * @direction is READ or WRITE (anything else triggers the WARN_ON).
 */
void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
			const struct kvec *kvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_KVEC,
		.data_source = direction,
		.kvec = kvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_kvec);

/*
 * Initialise @i to iterate over an array of bio_vec (page/offset/len)
 * segments. @direction is READ or WRITE.
 */
void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
			const struct bio_vec *bvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_BVEC,
		.data_source = direction,
		.bvec = bvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_bvec);

/*
 * Initialise @i over a pipe. Only READ is supported (BUG_ON otherwise);
 * the iterator records both the current head and the starting head of
 * the pipe ring so later sanity checks can detect concurrent changes.
 * The pipe is expected to have free slots (WARN_ON if already full).
 */
void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
			struct pipe_inode_info *pipe,
			size_t count)
{
	BUG_ON(direction != READ);
	WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
	*i = (struct iov_iter){
		.iter_type = ITER_PIPE,
		.data_source = false,
		.pipe = pipe,
		.head = pipe->head,
		.start_head = pipe->head,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_pipe);

/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
		     struct xarray *xarray, loff_t start, size_t count)
{
	BUG_ON(direction & ~1);
	*i = (struct iov_iter) {
		.iter_type = ITER_XARRAY,
		.data_source = direction,
		.xarray = xarray,
		.xarray_start = start,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_xarray);

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that just discards everything that's written to it.
 * It's only available as a READ iterator.
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
	BUG_ON(direction != READ);
	*i = (struct iov_iter){
		.iter_type = ITER_DISCARD,
		.data_source = false,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_discard);

/*
 * OR together every segment's (possibly offset) base address and its
 * count-capped length.  Any low bit set in the result means the iterator
 * is not aligned to that boundary.  Used for iovec and kvec (same layout).
 */
static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
	unsigned long res = 0;
	size_t size = i->count;
	size_t skip = i->iov_offset;	/* only the first segment is skipped into */
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->iov[k].iov_len - skip;
		if (len) {
			res |= (unsigned long)i->iov[k].iov_base + skip;
			if (len > size)
				len = size;
			res |= len;
			size -= len;
			if (!size)
				break;
		}
	}
	return res;
}

/*
 * Same OR-of-offsets-and-lengths trick for a bvec iterator; only the
 * in-page offset matters here, not the page address itself.
 */
static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
{
	unsigned res = 0;
	size_t size = i->count;
	unsigned skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->bvec[k].bv_len - skip;
		res |= (unsigned long)i->bvec[k].bv_offset + skip;
		if (len > size)
			len = size;
		res |= len;
		size -= len;
		if (!size)
			break;
	}
	return res;
}

/*
 * Return the combined alignment of all addresses and lengths covered by
 * the iterator: callers typically test the low bits of the result against
 * a block-size mask.
 */
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_iter_alignment_iovec(i);

	if (iov_iter_is_bvec(i))
		return iov_iter_alignment_bvec(i);

	if (iov_iter_is_pipe(i)) {
		unsigned int p_mask = i->pipe->ring_size - 1;
		size_t size = i->count;

		/* only a partially-filled current buffer contributes an offset */
		if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
			return size | i->iov_offset;
		return size;
	}

	if (iov_iter_is_xarray(i))
		return (i->xarray_start + i->iov_offset) | i->count;

	return 0;
}
EXPORT_SYMBOL(iov_iter_alignment);

/*
 * OR each segment's start address with the previous segment's end address:
 * set low bits in the result mean there are misaligned gaps between
 * consecutive segments.  Only defined for ITER_IOVEC (WARNs and returns
 * all-ones otherwise).
 */
unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
	unsigned long res = 0;
	unsigned long v = 0;
	size_t size = i->count;
	unsigned k;

	if (WARN_ON(!iter_is_iovec(i)))
		return ~0U;

	for (k = 0; k < i->nr_segs; k++) {
		if (i->iov[k].iov_len) {
			unsigned long base = (unsigned long)i->iov[k].iov_base;
			if (v) // if not the first one
				res |= base | v; // this start | previous end
			v = base + i->iov[k].iov_len;
			if (size <= i->iov[k].iov_len)
				break;
			size -= i->iov[k].iov_len;
		}
	}
	return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

/*
 * Grab references to the pipe buffer pages backing up to @maxsize bytes,
 * storing them in @pages.  push_pipe() (defined earlier in this file)
 * is relied on to allocate/extend the buffers and update @iter_head and
 * @*start; returns the number of bytes covered, or -EFAULT if no space
 * could be produced.
 */
static inline ssize_t __pipe_get_pages(struct iov_iter *i,
				size_t maxsize,
				struct page **pages,
				int iter_head,
				size_t *start)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	ssize_t n = push_pipe(i, maxsize, &iter_head, start);
	if (!n)
		return -EFAULT;

	maxsize = n;
	n += *start;	/* round the byte span up to whole ring slots */
	while (n > 0) {
		get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
		iter_head++;
		n -= PAGE_SIZE;
	}

	return maxsize;
}

/*
 * iov_iter_get_pages() back end for pipe iterators: cap @maxsize to the
 * free space actually available in the ring (and to @maxpages), then
 * delegate to __pipe_get_pages().
 */
static ssize_t pipe_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	unsigned int iter_head, npages;
	size_t capacity;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	capacity = min(npages, maxpages) * PAGE_SIZE - *start;

	return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
}

/*
 * Fill @pages with up to @nr_pages referenced subpages from @xa starting
 * at @index, walking under RCU and restarting on racing store/split.
 * Returns how many pages were collected (may be short of @nr_pages).
 */
static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
					  pgoff_t index, unsigned int nr_pages)
{
	XA_STATE(xas, xa, index);
	struct page *page;
	unsigned int ret = 0;

	rcu_read_lock();
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		if (xas_retry(&xas, page))
			continue;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas))) {
			xas_reset(&xas);
			continue;
		}

		pages[ret] = find_subpage(page, xas.xa_index);
		get_page(pages[ret]);
		if (++ret == nr_pages)
			break;
	}
	rcu_read_unlock();
	return ret;
}

/*
 * iov_iter_get_pages() back end for xarray iterators.  Computes how many
 * pages the [pos, pos + maxsize) byte range spans, populates @pages, and
 * returns the number of usable bytes (start offset and any unused tail of
 * the last page subtracted).
 */
static ssize_t iter_xarray_get_pages(struct iov_iter *i,
				     struct page **pages, size_t maxsize,
				     unsigned maxpages, size_t *_start_offset)
{
	unsigned nr, offset;
	pgoff_t index, count;
	size_t size = maxsize, actual;
	loff_t pos;

	if (!size || !maxpages)
		return 0;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	/* count = number of pages spanned; size ends up as bytes used in the last one */
	count = 1;
	if (size > PAGE_SIZE - offset) {
		size -= PAGE_SIZE - offset;
		count += size >> PAGE_SHIFT;
		size &= ~PAGE_MASK;
		if (size)
			count++;
	}

	if (count > maxpages)
		count = maxpages;

	nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
	if (nr == 0)
		return 0;

	actual = PAGE_SIZE * nr;
	actual -= offset;
	/* trim the unused tail of the final page, if we got all of them */
	if (nr == count && size > 0) {
		unsigned last_offset = (nr > 1) ? 0 : offset;
		actual -= PAGE_SIZE - (last_offset + size);
	}
	return actual;
}

/*
 * Find the first non-empty iovec segment and return its page-aligned user
 * address; *start gets the in-page offset and *size the page-span length
 * (capped by @maxsize and @maxpages).
 *
 * must be done on non-empty ITER_IOVEC one
 */
static unsigned long first_iovec_segment(const struct iov_iter *i,
					 size_t *size, size_t *start,
					 size_t maxsize, unsigned maxpages)
{
	size_t skip;
	long k;

	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
		unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
		size_t len = i->iov[k].iov_len - skip;

		if (unlikely(!len))
			continue;
		if (len > maxsize)
			len = maxsize;
		len += (*start = addr % PAGE_SIZE);
		if (len > maxpages * PAGE_SIZE)
			len = maxpages * PAGE_SIZE;
		*size = len;
		return addr & PAGE_MASK;
	}
	BUG(); // if it had been empty, we wouldn't get called
}

/*
 * Analogue of first_iovec_segment() for bvecs: return the first page of
 * the current segment, with *start / *size filled the same way.  Unlike
 * the iovec case, bvec segments are not allowed to be empty here.
 *
 * must be done on non-empty ITER_BVEC one
 */
static struct page *first_bvec_segment(const struct iov_iter *i,
				       size_t *size, size_t *start,
				       size_t maxsize, unsigned maxpages)
{
	struct page *page;
	size_t skip = i->iov_offset, len;

	len = i->bvec->bv_len - skip;
	if (len > maxsize)
		len = maxsize;
	skip += i->bvec->bv_offset;
	page = i->bvec->bv_page + skip / PAGE_SIZE;
	len += (*start = skip % PAGE_SIZE);
	if (len > maxpages * PAGE_SIZE)
		len = maxpages * PAGE_SIZE;
	*size = len;
	return page;
}

/*
 * Pin up to @maxpages pages backing the first segment of the iterator
 * into the caller-supplied @pages array.  Returns the number of usable
 * bytes (past *start) or a negative error.  Does not advance @i.
 */
ssize_t iov_iter_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	size_t len;
	int n, res;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;

	if (likely(iter_is_iovec(i))) {
		unsigned long addr;

		addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		/* a READ iter means we write into the pages, hence FOLL_WRITE */
		res = get_user_pages_fast(addr, n,
				iov_iter_rw(i) != WRITE ?
FOLL_WRITE : 0,
				pages);
		if (unlikely(res < 0))
			return res;
		/* partial pin: report only the bytes actually covered */
		return (res == n ? len : res * PAGE_SIZE) - *start;
	}
	if (iov_iter_is_bvec(i)) {
		struct page *page;

		page = first_bvec_segment(i, &len, start, maxsize, maxpages);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		while (n--)
			get_page(*pages++ = page++);
		return len - *start;
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages(i, pages, maxsize, maxpages, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
	return -EFAULT;
}
EXPORT_SYMBOL(iov_iter_get_pages);

/* Allocate an array of @n page pointers; caller frees with kvfree(). */
static struct page **get_pages_array(size_t n)
{
	return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
}

/*
 * iov_iter_get_pages_alloc() back end for pipe iterators: size the page
 * array from the available ring space, then fill it via __pipe_get_pages().
 * On success *pages owns the array; on failure it is freed here.
 */
static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;
	unsigned int iter_head, npages;
	ssize_t n;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	n = npages * PAGE_SIZE - *start;
	if (maxsize > n)
		maxsize = n;
	else
		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
	p = get_pages_array(npages);
	if (!p)
		return -ENOMEM;
	n = __pipe_get_pages(i, maxsize, p, iter_head, start);
	if (n > 0)
		*pages = p;
	else
		kvfree(p);
	return n;
}

/*
 * iov_iter_get_pages_alloc() back end for xarray iterators.  Same byte
 * accounting as iter_xarray_get_pages(), but the page array is allocated
 * here and handed to the caller via *pages (set before populating, so the
 * caller can free it even on a short/zero result).
 */
static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
					   struct page ***pages, size_t maxsize,
					   size_t *_start_offset)
{
	struct page **p;
	unsigned nr, offset;
	pgoff_t index, count;
	size_t size = maxsize, actual;
	loff_t pos;

	if (!size)
		return 0;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	/* count = number of pages spanned; size = bytes used in the last one */
	count = 1;
	if (size > PAGE_SIZE - offset) {
		size -= PAGE_SIZE - offset;
		count += size >> PAGE_SHIFT;
		size &= ~PAGE_MASK;
		if (size)
			count++;
	}

	p = get_pages_array(count);
	if (!p)
		return -ENOMEM;
	*pages = p;

	nr = iter_xarray_populate_pages(p, i->xarray, index, count);
	if (nr == 0)
		return 0;

	actual = PAGE_SIZE * nr;
	actual -= offset;
	if (nr == count && size > 0) {
		unsigned last_offset = (nr > 1) ? 0 : offset;
		actual -= PAGE_SIZE - (last_offset + size);
	}
	return actual;
}

/*
 * Like iov_iter_get_pages(), but allocates the page-pointer array itself
 * (returned via *pages, freed by the caller with kvfree()).  No @maxpages
 * cap — ~0U is passed to the segment helpers.
 */
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;
	size_t len;
	int n, res;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;

	if (likely(iter_is_iovec(i))) {
		unsigned long addr;

		addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		p = get_pages_array(n);
		if (!p)
			return -ENOMEM;
		/* a READ iter means we write into the pages, hence FOLL_WRITE */
		res = get_user_pages_fast(addr, n,
				iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0, p);
		if (unlikely(res < 0)) {
			kvfree(p);
			return res;
		}
		*pages = p;
		return (res == n ? len : res * PAGE_SIZE) - *start;
	}
	if (iov_iter_is_bvec(i)) {
		struct page *page;

		page = first_bvec_segment(i, &len, start, maxsize, ~0U);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		*pages = p = get_pages_array(n);
		if (!p)
			return -ENOMEM;
		while (n--)
			get_page(*p++ = page++);
		return len - *start;
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages_alloc(i, pages, maxsize, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
	return -EFAULT;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc);

/*
 * Copy @bytes from the iterator into @addr while folding the data into the
 * running checksum *csum.  Not supported on pipe/discard iterators.  A user
 * copy that faults part-way aborts the walk (the "next ? 0 : len" step
 * reports how much of this chunk was NOT consumed).
 */
size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	__wsum sum, next;
	sum = *csum;
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off, ({
		next = csum_and_copy_from_user(base, addr + off, len);
		sum = csum_block_add(sum, next, off);
		next ? 0 : len;
	}), ({
		sum = csum_and_memcpy(addr + off, base, len, sum, off);
	})
	)
	*csum = sum;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_from_iter);

/*
 * Copy @bytes from @addr into the iterator while accumulating the checksum
 * into the caller's csum_state (csum rotated by the running offset so
 * chunks can be folded at any alignment).  Pipes get a dedicated helper;
 * discard iterators are rejected.
 */
size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
			     struct iov_iter *i)
{
	struct csum_state *csstate = _csstate;
	__wsum sum, next;

	if (unlikely(iov_iter_is_pipe(i)))
		return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);

	sum = csum_shift(csstate->csum, csstate->off);
	if (unlikely(iov_iter_is_discard(i))) {
		WARN_ON(1);	/* for now */
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off, ({
		next = csum_and_copy_to_user(addr + off, base, len);
		sum = csum_block_add(sum, next, off);
		next ?
0 : len;
	}), ({
		sum = csum_and_memcpy(base, addr + off, len, sum, off);
	})
	)
	csstate->csum = csum_shift(sum, csstate->off);
	csstate->off += bytes;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_to_iter);

/*
 * Copy @bytes from @addr into the iterator, then feed the bytes that were
 * actually copied into the ahash request @hashp.  Returns the copied byte
 * count (0 when CONFIG_CRYPTO_HASH is not built in).
 * NOTE(review): the crypto_ahash_update() return value is ignored here.
 */
size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
		struct iov_iter *i)
{
#ifdef CONFIG_CRYPTO_HASH
	struct ahash_request *hash = hashp;
	struct scatterlist sg;
	size_t copied;

	copied = copy_to_iter(addr, bytes, i);
	sg_init_one(&sg, addr, copied);
	ahash_request_set_crypt(hash, &sg, NULL, copied);
	crypto_ahash_update(hash);
	return copied;
#else
	return 0;
#endif
}
EXPORT_SYMBOL(hash_and_copy_to_iter);

/*
 * Count the pages spanned by the remaining bytes of an iovec/kvec iterator,
 * clamped to @maxpages.  Per-segment page counts include the in-page offset
 * of the segment start.
 */
static int iov_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct iovec *p;
	int npages = 0;

	for (p = i->iov; size; skip = 0, p++) {
		unsigned offs = offset_in_page(p->iov_base + skip);
		size_t len = min(p->iov_len - skip, size);

		if (len) {
			size -= len;
			npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
			if (unlikely(npages > maxpages))
				return maxpages;
		}
	}
	return npages;
}

/*
 * Same page count for a bvec iterator; a segment may start at an arbitrary
 * offset into a multi-page bvec, hence the % PAGE_SIZE.
 */
static int bvec_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct bio_vec *p;
	int npages = 0;

	for (p = i->bvec; size; skip = 0, p++) {
		unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
		size_t len = min(p->bv_len - skip, size);

		size -= len;
		npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
		if (unlikely(npages > maxpages))
			return maxpages;
	}
	return npages;
}

/*
 * How many pages (at most @maxpages) does the rest of the iterator touch?
 * For pipes this is the free ring space; discard iterators report 0.
 */
int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
	if (unlikely(!i->count))
		return 0;
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_npages(i, maxpages);
	if (iov_iter_is_bvec(i))
		return bvec_npages(i, maxpages);
	if (iov_iter_is_pipe(i)) {
		unsigned int iter_head;
		int npages;
		size_t off;

		if (!sanity(i))
			return 0;

		data_start(i, &iter_head, &off);
		/* some of this one + all after this one */
		npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
		return min(npages, maxpages);
	}
	if (iov_iter_is_xarray(i)) {
		unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_npages);

/*
 * Duplicate an iterator, deep-copying its segment array so the copy stays
 * valid independently of the original.  Pipe iterators can't be duplicated
 * (WARN + NULL); discard/xarray need no segment copy and return NULL.
 * Returns the duplicated segment array (or NULL), for the caller to free.
 */
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
	*new = *old;
	if (unlikely(iov_iter_is_pipe(new))) {
		WARN_ON(1);
		return NULL;
	}
	if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
		return NULL;
	if (iov_iter_is_bvec(new))
		return new->bvec = kmemdup(new->bvec,
				    new->nr_segs * sizeof(struct bio_vec),
				    flags);
	else
		/* iovec and kvec have identical layout */
		return new->iov = kmemdup(new->iov,
				   new->nr_segs * sizeof(struct iovec),
				   flags);
}
EXPORT_SYMBOL(dup_iter);

/*
 * Copy @nr_segs compat (32-bit) iovecs from userspace into native @iov
 * entries, inside a single user_access_begin() window.  Returns 0, -EFAULT
 * on a faulting access, or -EINVAL on a negative length.
 */
static int copy_compat_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	const struct compat_iovec __user *uiov =
		(const struct compat_iovec __user *)uvec;
	int ret = -EFAULT, i;

	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
		return -EFAULT;

	for (i = 0; i < nr_segs; i++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

		/* check for compat_size_t not fitting in compat_ssize_t .. */
		if (len < 0) {
			ret = -EINVAL;
			goto uaccess_end;
		}
		iov[i].iov_base = compat_ptr(buf);
		iov[i].iov_len = len;
	}

	ret = 0;
uaccess_end:
	user_access_end();
	return ret;
}

/*
 * Native-width counterpart: bulk-copy the array, then reject any entry
 * whose length is negative when viewed as ssize_t.
 */
static int copy_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	unsigned long seg;

	if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
		return -EFAULT;
	for (seg = 0; seg < nr_segs; seg++) {
		if ((ssize_t)iov[seg].iov_len < 0)
			return -EINVAL;
	}

	return 0;
}

/*
 * Fetch @nr_segs iovecs from userspace into @fast_iov if they fit, else
 * into a freshly kmalloc'ed array.  Returns the array used (possibly
 * @fast_iov itself) or an ERR_PTR; a heap array is freed here on failure.
 */
struct iovec *iovec_from_user(const struct iovec __user *uvec,
		unsigned long nr_segs, unsigned long fast_segs,
		struct iovec *fast_iov, bool compat)
{
	struct iovec *iov = fast_iov;
	int ret;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument was
	 * less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return iov;
	if (nr_segs > UIO_MAXIOV)
		return ERR_PTR(-EINVAL);
	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!iov)
			return ERR_PTR(-ENOMEM);
	}

	if (compat)
		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
	else
		ret = copy_iovec_from_user(iov, uvec, nr_segs);
	if (ret) {
		if (iov != fast_iov)
			kfree(iov);
		return ERR_PTR(ret);
	}

	return iov;
}

/*
 * Worker for import_iovec(): pull the user iovec array in, validate each
 * base with access_ok(), cap the total at MAX_RW_COUNT, and initialise @i
 * over the result.  On return *iovp is NULL if the caller's fast array was
 * used (or on error), else the heap array the caller must kfree().
 */
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i, bool compat)
{
	ssize_t total_len = 0;
	unsigned long seg;
	struct iovec *iov;

	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
	if (IS_ERR(iov)) {
		*iovp = NULL;
		return PTR_ERR(iov);
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL if
	 * an element length is < 0 when cast to ssize_t or if the total length
	 * would overflow the ssize_t return value of the system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	for (seg = 0; seg < nr_segs; seg++) {
		ssize_t len = (ssize_t)iov[seg].iov_len;

		if (!access_ok(iov[seg].iov_base, len)) {
			if (iov != *iovp)
				kfree(iov);
			*iovp = NULL;
			return -EFAULT;
		}

		/* clamp the segment so the running total never exceeds MAX_RW_COUNT */
		if (len > MAX_RW_COUNT - total_len) {
			len = MAX_RW_COUNT - total_len;
			iov[seg].iov_len = len;
		}
		total_len += len;
	}

	iov_iter_init(i, type, iov, nr_segs, total_len);
	if (iov == *iovp)
		*iovp = NULL;
	else
		*iovp = iov;
	return total_len;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in @iov.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iov is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iov on return. Otherwise, a new
 * array will be allocated and the result placed in *@iov. This means that
 * the caller may call kfree() on *@iov regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs,
		 struct iovec **iovp, struct iov_iter *i)
{
	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
			      in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);

/*
 * Set up @iov/@i over a single user buffer.  @len is silently clamped to
 * MAX_RW_COUNT; returns -EFAULT if the (clamped) range fails access_ok().
 */
int import_single_range(int rw, void __user *buf, size_t len,
		 struct iovec *iov, struct iov_iter *i)
{
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	if (unlikely(!access_ok(buf, len)))
		return -EFAULT;

	iov->iov_base = buf;
	iov->iov_len = len;
	iov_iter_init(i, rw, iov, 1, len);
	return 0;
}
EXPORT_SYMBOL(import_single_range);