// SPDX-License-Identifier: GPL-2.0-only
#include <crypto/hash.h>
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <net/checksum.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>

#define PIPE_PARANOIA /* for now */

/* covers iovec and kvec alike */
#define iterate_iovec(i, n, base, len, off, __p, skip, STEP) {	\
	size_t off = 0;						\
	do {							\
		len = min(n, __p->iov_len - skip);		\
		if (likely(len)) {				\
			base = __p->iov_base + skip;		\
			len -= (STEP);				\
			off += len;				\
			skip += len;				\
			n -= len;				\
			if (skip < __p->iov_len)		\
				break;				\
		}						\
		__p++;						\
		skip = 0;					\
	} while (n);						\
	n = off;						\
}

#define iterate_bvec(i, n, base, len, off, p, skip, STEP) {	\
	size_t off = 0;						\
	while (n) {						\
		unsigned offset = p->bv_offset + skip;		\
		unsigned left;					\
		void *kaddr = kmap_local_page(p->bv_page +	\
					offset / PAGE_SIZE);	\
		base = kaddr + offset % PAGE_SIZE;		\
		len = min(min(n, p->bv_len - skip),		\
		     (size_t)(PAGE_SIZE - offset % PAGE_SIZE));	\
		left = (STEP);					\
		kunmap_local(kaddr);				\
		len -= left;					\
		off += len;					\
		skip += len;					\
		if (skip == p->bv_len) {			\
			skip = 0;				\
			p++;					\
		}						\
		n -= len;					\
		if (left)					\
			break;					\
	}							\
	n = off;						\
}

#define iterate_xarray(i, n, base, len, __off, skip, STEP) {	\
	__label__ __out;					\
	size_t __off = 0;					\
	struct page *head = NULL;				\
	size_t offset;						\
	loff_t start = i->xarray_start + skip;			\
	pgoff_t index = start >> PAGE_SHIFT;			\
	int j;							\
								\
	XA_STATE(xas, i->xarray, index);			\
								\
	rcu_read_lock();					\
	xas_for_each(&xas, head, ULONG_MAX) {			\
		unsigned left;					\
		if (xas_retry(&xas, head))			\
			continue;				\
		if (WARN_ON(xa_is_value(head)))			\
			break;					\
		if (WARN_ON(PageHuge(head)))			\
			break;					\
		for (j = (head->index < index) ? index - head->index : 0; \
		     j < thp_nr_pages(head); j++) {		\
			void *kaddr = kmap_local_page(head + j);	\
			offset = (start + __off) % PAGE_SIZE;	\
			base = kaddr + offset;			\
			len = PAGE_SIZE - offset;		\
			len = min(n, len);			\
			left = (STEP);				\
			kunmap_local(kaddr);			\
			len -= left;				\
			__off += len;				\
			n -= len;				\
			if (left || n == 0)			\
				goto __out;			\
		}						\
	}							\
__out:								\
	rcu_read_unlock();					\
	skip += __off;						\
	n = __off;						\
}

#define __iterate_and_advance(i, n, base, len, off, I, K) {	\
	if (unlikely(i->count < n))				\
		n = i->count;					\
	if (likely(n)) {					\
		size_t skip = i->iov_offset;			\
		if (likely(iter_is_iovec(i))) {			\
			const struct iovec *iov = i->iov;	\
			void __user *base;			\
			size_t len;				\
			iterate_iovec(i, n, base, len, off,	\
						iov, skip, (I))	\
			i->nr_segs -= iov - i->iov;		\
			i->iov = iov;				\
		} else if (iov_iter_is_bvec(i)) {		\
			const struct bio_vec *bvec = i->bvec;	\
			void *base;				\
			size_t len;				\
			iterate_bvec(i, n, base, len, off,	\
						bvec, skip, (K))	\
			i->nr_segs -= bvec - i->bvec;		\
			i->bvec = bvec;				\
		} else if (iov_iter_is_kvec(i)) {		\
			const struct kvec *kvec = i->kvec;	\
			void *base;				\
			size_t len;				\
			iterate_iovec(i, n, base, len, off,	\
						kvec, skip, (K))	\
			i->nr_segs -= kvec - i->kvec;		\
			i->kvec = kvec;				\
		} else if (iov_iter_is_xarray(i)) {		\
			void *base;				\
			size_t len;				\
			iterate_xarray(i, n, base, len, off,	\
							skip, (K))	\
		}						\
		i->count -= n;					\
		i->iov_offset = skip;				\
	}							\
}
#define iterate_and_advance(i, n, base, len, off, I, K) \
	__iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))

static int copyout(void __user *to, const void *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = raw_copy_to_user(to, from, n);
	}
	return n;
}

static int copyin(void *to, const void __user *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(from, n)) {
		instrument_copy_from_user(to, from, n);
		n = raw_copy_from_user(to, from, n);
	}
	return n;
}

static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t skip, copy, left, wanted;
	const struct iovec *iov;
	char __user *buf;
	void *kaddr, *from;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	might_fault();
	wanted = bytes;
	iov = i->iov;
	skip = i->iov_offset;
	buf = iov->iov_base + skip;
	copy = min(bytes, iov->iov_len - skip);

	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
		kaddr = kmap_atomic(page);
		from = kaddr + offset;

		/* first chunk, usually the only one */
		left = copyout(buf, from, copy);
		copy -= left;
		skip += copy;
		from += copy;
		bytes -= copy;

		while (unlikely(!left && bytes)) {
			iov++;
			buf = iov->iov_base;
			copy = min(bytes, iov->iov_len);
			left = copyout(buf, from, copy);
			copy -= left;
			skip = copy;
			from += copy;
			bytes -= copy;
		}
		if (likely(!bytes)) {
			kunmap_atomic(kaddr);
			goto done;
		}
		offset = from - kaddr;
		buf += copy;
		kunmap_atomic(kaddr);
		copy = min(bytes, iov->iov_len - skip);
	}
	/* Too bad - revert to non-atomic kmap */

	kaddr = kmap(page);
	from = kaddr + offset;
	left = copyout(buf, from, copy);
	copy -= left;
	skip += copy;
	from += copy;
	bytes -= copy;
	while (unlikely(!left && bytes)) {
		iov++;
		buf = iov->iov_base;
		copy = min(bytes, iov->iov_len);
		left = copyout(buf, from, copy);
		copy -= left;
		skip = copy;
		from += copy;
		bytes -= copy;
	}
	kunmap(page);

done:
	if (skip == iov->iov_len) {
		iov++;
		skip = 0;
	}
	i->count -= wanted - bytes;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
	i->iov_offset = skip;
	return wanted - bytes;
}

static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t skip, copy, left, wanted;
	const struct iovec *iov;
	char __user *buf;
	void *kaddr, *to;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	might_fault();
	wanted = bytes;
	iov = i->iov;
	skip = i->iov_offset;
	buf = iov->iov_base + skip;
	copy = min(bytes, iov->iov_len - skip);

	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
		kaddr = kmap_atomic(page);
		to = kaddr + offset;

		/* first chunk, usually the only one */
		left = copyin(to, buf, copy);
		copy -= left;
		skip += copy;
		to += copy;
		bytes -= copy;

		while (unlikely(!left && bytes)) {
			iov++;
			buf = iov->iov_base;
			copy = min(bytes, iov->iov_len);
			left = copyin(to, buf, copy);
			copy -= left;
			skip = copy;
			to += copy;
			bytes -= copy;
		}
		if (likely(!bytes)) {
			kunmap_atomic(kaddr);
			goto done;
		}
		offset = to - kaddr;
		buf += copy;
		kunmap_atomic(kaddr);
		copy = min(bytes, iov->iov_len - skip);
	}
	/* Too bad - revert to non-atomic kmap */

	kaddr = kmap(page);
	to = kaddr + offset;
	left = copyin(to, buf, copy);
	copy -= left;
	skip += copy;
	to += copy;
	bytes -= copy;
	while (unlikely(!left && bytes)) {
		iov++;
		buf = iov->iov_base;
		copy = min(bytes, iov->iov_len);
		left = copyin(to, buf, copy);
		copy -= left;
		skip = copy;
		to += copy;
		bytes -= copy;
	}
	kunmap(page);

done:
	if (skip == iov->iov_len) {
		iov++;
		skip = 0;
	}
	i->count -= wanted - bytes;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
	i->iov_offset = skip;
	return wanted - bytes;
}

#ifdef PIPE_PARANOIA
static bool sanity(const struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_head = pipe->head;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
	unsigned int i_head = i->head;
	unsigned int idx;

	if (i->iov_offset) {
		struct pipe_buffer *p;
		if (unlikely(p_occupancy == 0))
			goto Bad;	// pipe must be non-empty
		if (unlikely(i_head != p_head - 1))
			goto Bad;	// must be at the last buffer...

		p = &pipe->bufs[i_head & p_mask];
		if (unlikely(p->offset + p->len != i->iov_offset))
			goto Bad;	// ... at the end of segment
	} else {
		if (i_head != p_head)
			goto Bad;	// must be right after the last buffer
	}
	return true;
Bad:
	printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
	printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
			p_head, p_tail, pipe->ring_size);
	for (idx = 0; idx < pipe->ring_size; idx++)
		printk(KERN_ERR "[%p %p %d %d]\n",
			pipe->bufs[idx].ops,
			pipe->bufs[idx].page,
			pipe->bufs[idx].offset,
			pipe->bufs[idx].len);
	WARN_ON(1);
	return false;
}
#else
#define sanity(i) true
#endif

static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	struct pipe_buffer *buf;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head = i->head;
	size_t off;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	off = i->iov_offset;
	buf = &pipe->bufs[i_head & p_mask];
	if (off) {
		if (offset == off && buf->page == page) {
			/* merge with the last one */
			buf->len += bytes;
			i->iov_offset += bytes;
			goto out;
		}
		i_head++;
		buf = &pipe->bufs[i_head & p_mask];
	}
	if (pipe_full(i_head, p_tail, pipe->max_usage))
		return 0;

	buf->ops = &page_cache_pipe_buf_ops;
	get_page(page);
	buf->page = page;
	buf->offset = offset;
	buf->len = bytes;

	pipe->head = i_head + 1;
	i->iov_offset = offset + bytes;
	i->head = i_head;
out:
	i->count -= bytes;
	return bytes;
}

/*
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * bytes. For each iovec, fault in each page that constitutes the iovec.
 *
 * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
 * because it is an invalid address).
 */
int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
{
	if (iter_is_iovec(i)) {
		const struct iovec *p;
		size_t skip;

		if (bytes > i->count)
			bytes = i->count;
		for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
			size_t len = min(bytes, p->iov_len - skip);
			int err;

			if (unlikely(!len))
				continue;
			err = fault_in_pages_readable(p->iov_base + skip, len);
			if (unlikely(err))
				return err;
			bytes -= len;
		}
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_fault_in_readable);

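/*
 * Usage sketch (illustrative only, not compiled): the prefault-and-retry
 * pattern that write paths are expected to build around
 * iov_iter_fault_in_readable().  example_copy_stage() is a hypothetical
 * stand-in for the real copy step, which typically runs under
 * kmap_atomic() and therefore must not take page faults itself.
 */
#if 0
static ssize_t example_perform_write(struct iov_iter *i)
{
	ssize_t written = 0;

	while (iov_iter_count(i)) {
		size_t bytes = min_t(size_t, iov_iter_count(i), PAGE_SIZE);
		size_t copied;

		/* Fault the user pages in while we may still sleep. */
		if (iov_iter_fault_in_readable(i, bytes)) {
			written = written ? written : -EFAULT;
			break;
		}
		/* Atomic copy; a short copy means a page went away again. */
		copied = example_copy_stage(i, bytes);	/* hypothetical */
		if (!copied)
			break;
		written += copied;
	}
	return written;
}
#endif
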
void iov_iter_init(struct iov_iter *i, unsigned int direction,
			const struct iovec *iov, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	WARN_ON_ONCE(uaccess_kernel());
	*i = (struct iov_iter) {
		.iter_type = ITER_IOVEC,
		.data_source = direction,
		.iov = iov,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_init);

static inline bool allocated(struct pipe_buffer *buf)
{
	return buf->ops == &default_pipe_buf_ops;
}

static inline void data_start(const struct iov_iter *i,
			      unsigned int *iter_headp, size_t *offp)
{
	unsigned int p_mask = i->pipe->ring_size - 1;
	unsigned int iter_head = i->head;
	size_t off = i->iov_offset;

	if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
		    off == PAGE_SIZE)) {
		iter_head++;
		off = 0;
	}
	*iter_headp = iter_head;
	*offp = off;
}

static size_t push_pipe(struct iov_iter *i, size_t size,
			int *iter_headp, size_t *offp)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int iter_head;
	size_t off;
	ssize_t left;

	if (unlikely(size > i->count))
		size = i->count;
	if (unlikely(!size))
		return 0;

	left = size;
	data_start(i, &iter_head, &off);
	*iter_headp = iter_head;
	*offp = off;
	if (off) {
		left -= PAGE_SIZE - off;
		if (left <= 0) {
			pipe->bufs[iter_head & p_mask].len += size;
			return size;
		}
		pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
		iter_head++;
	}
	while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
		struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
		struct page *page = alloc_page(GFP_USER);
		if (!page)
			break;

		buf->ops = &default_pipe_buf_ops;
		buf->page = page;
		buf->offset = 0;
		buf->len = min_t(ssize_t, left, PAGE_SIZE);
		left -= buf->len;
		iter_head++;
		pipe->head = iter_head;

		if (left == 0)
			return size;
	}
	return size - left;
}

static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
				struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &off);
	if (unlikely(!n))
		return 0;
	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
		i->head = i_head;
		i->iov_offset = off + chunk;
		n -= chunk;
		addr += chunk;
		off = 0;
		i_head++;
	} while (n);
	i->count -= bytes;
	return bytes;
}

static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
			      __wsum sum, size_t off)
{
	__wsum next = csum_partial_copy_nocheck(from, to, len);
	return csum_block_add(sum, next, off);
}

static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
					 struct csum_state *csstate,
					 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	__wsum sum = csstate->csum;
	size_t off = csstate->off;
	unsigned int i_head;
	size_t n, r;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &r);
	if (unlikely(!n))
		return 0;
	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
		char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
		sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
		kunmap_atomic(p);
		i->head = i_head;
		i->iov_offset = r + chunk;
		n -= chunk;
		off += chunk;
		addr += chunk;
		r = 0;
		i_head++;
	} while (n);
	i->count -= bytes;
	csstate->csum = sum;
	csstate->off = off;
	return bytes;
}

size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_pipe_to_iter(addr, bytes, i);
	if (iter_is_iovec(i))
		might_fault();
	iterate_and_advance(i, bytes, base, len, off,
		copyout(base, addr + off, len),
		memcpy(base, addr + off, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_to_iter);

#ifdef CONFIG_ARCH_HAS_COPY_MC
static int copyout_mc(void __user *to, const void *from, size_t n)
{
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = copy_mc_to_user((__force void *) to, from, n);
	}
	return n;
}

static unsigned long copy_mc_to_page(struct page *page, size_t offset,
				     const char *from, size_t len)
{
	unsigned long ret;
	char *to;

	to = kmap_atomic(page);
	ret = copy_mc_to_kernel(to + offset, from, len);
	kunmap_atomic(to);

	return ret;
}

static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
				   struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off, xfer = 0;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &off);
	if (unlikely(!n))
		return 0;
	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		unsigned long rem;

		rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page,
				      off, addr, chunk);
		i->head = i_head;
		i->iov_offset = off + chunk - rem;
		xfer += chunk - rem;
		if (rem)
			break;
		n -= chunk;
		addr += chunk;
		off = 0;
		i_head++;
	} while (n);
	i->count -= xfer;
	return xfer;
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and typical _copy_to_iter() are:
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again. Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 *   a short copy.
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_mc_pipe_to_iter(addr, bytes, i);
	if (iter_is_iovec(i))
		might_fault();
	__iterate_and_advance(i, bytes, base, len, off,
		copyout_mc(base, addr + off, len),
		copy_mc_to_kernel(base, addr + off, len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	if (iter_is_iovec(i))
		might_fault();
	iterate_and_advance(i, bytes, base, len, off,
		copyin(addr + off, base, len),
		memcpy(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter);

size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		__copy_from_user_inatomic_nocache(addr + off, base, len),
		memcpy(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache. It is differentiated from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
 * all iterator types. _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		__copy_from_user_flushcache(addr + off, base, len),
		memcpy_flushcache(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif

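/*
 * Usage sketch (illustrative only, not compiled): the common pattern for
 * a read(2)-style helper draining a kernel buffer into whatever iterator
 * the caller supplied.  Per the notes above, only user-backed iterators
 * can legitimately copy short here.
 */
#if 0
static ssize_t example_read_helper(const void *src, size_t len,
				   struct iov_iter *to)
{
	size_t copied = copy_to_iter(src, len, to);

	if (copied < len && iov_iter_count(to))
		return copied ? copied : -EFAULT;	/* user page fault */
	return copied;
}
#endif
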
static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
	struct page *head;
	size_t v = n + offset;

	/*
	 * The general case needs to access the page order in order
	 * to compute the page size.
	 * However, we mostly deal with order-0 pages and thus can
	 * avoid a possible cache line miss for requests that fit all
	 * page orders.
	 */
	if (n <= v && v <= PAGE_SIZE)
		return true;

	head = compound_head(page);
	v += (page - head) << PAGE_SHIFT;

	if (likely(n <= v && v <= (page_size(head))))
		return true;
	WARN_ON(1);
	return false;
}

static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
				  struct iov_iter *i)
{
	if (likely(iter_is_iovec(i)))
		return copy_page_to_iter_iovec(page, offset, bytes, i);
	if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
		void *kaddr = kmap_atomic(page);
		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
		kunmap_atomic(kaddr);
		return wanted;
	}
	if (iov_iter_is_pipe(i))
		return copy_page_to_iter_pipe(page, offset, bytes, i);
	if (unlikely(iov_iter_is_discard(i))) {
		if (unlikely(i->count < bytes))
			bytes = i->count;
		i->count -= bytes;
		return bytes;
	}
	WARN_ON(1);
	return 0;
}

size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t res = 0;
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;
	page += offset / PAGE_SIZE; // first subpage
	offset %= PAGE_SIZE;
	while (1) {
		size_t n = __copy_page_to_iter(page, offset,
				min(bytes, (size_t)PAGE_SIZE - offset), i);
		res += n;
		bytes -= n;
		if (!bytes || !n)
			break;
		offset += n;
		if (offset == PAGE_SIZE) {
			page++;
			offset = 0;
		}
	}
	return res;
}
EXPORT_SYMBOL(copy_page_to_iter);

size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;
	if (likely(iter_is_iovec(i)))
		return copy_page_from_iter_iovec(page, offset, bytes, i);
	if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
		void *kaddr = kmap_atomic(page);
		size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
		kunmap_atomic(kaddr);
		return wanted;
	}
	WARN_ON(1);
	return 0;
}
EXPORT_SYMBOL(copy_page_from_iter);

static size_t pipe_zero(size_t bytes, struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &off);
	if (unlikely(!n))
		return 0;

	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
		i->head = i_head;
		i->iov_offset = off + chunk;
		n -= chunk;
		off = 0;
		i_head++;
	} while (n);
	i->count -= bytes;
	return bytes;
}

size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return pipe_zero(bytes, i);
	iterate_and_advance(i, bytes, base, len, count,
		clear_user(base, len),
		memset(base, 0, len)
	)

	return bytes;
}
EXPORT_SYMBOL(iov_iter_zero);

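/*
 * Usage sketch (illustrative only, not compiled): zero-filling the
 * remainder of a read, the way a filesystem might satisfy a hole or a
 * read past EOF.
 */
#if 0
static size_t example_read_hole(struct iov_iter *to, size_t hole_bytes)
{
	/* Advances the iterator just like a real copy would. */
	return iov_iter_zero(min(hole_bytes, iov_iter_count(to)), to);
}
#endif
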
size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
				  struct iov_iter *i)
{
	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
	if (unlikely(!page_copy_sane(page, offset, bytes))) {
		kunmap_atomic(kaddr);
		return 0;
	}
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		kunmap_atomic(kaddr);
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		copyin(p + off, base, len),
		memcpy(p + off, base, len)
	)
	kunmap_atomic(kaddr);
	return bytes;
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);

static inline void pipe_truncate(struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_tail = pipe->tail;
	unsigned int p_head = pipe->head;
	unsigned int p_mask = pipe->ring_size - 1;

	if (!pipe_empty(p_head, p_tail)) {
		struct pipe_buffer *buf;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset;

		if (off) {
			buf = &pipe->bufs[i_head & p_mask];
			buf->len = off - buf->offset;
			i_head++;
		}
		while (p_head != i_head) {
			p_head--;
			pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
		}

		pipe->head = p_head;
	}
}

static void pipe_advance(struct iov_iter *i, size_t size)
{
	struct pipe_inode_info *pipe = i->pipe;
	if (size) {
		struct pipe_buffer *buf;
		unsigned int p_mask = pipe->ring_size - 1;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset, left = size;

		if (off) /* make it relative to the beginning of buffer */
			left += off - pipe->bufs[i_head & p_mask].offset;
		while (1) {
			buf = &pipe->bufs[i_head & p_mask];
			if (left <= buf->len)
				break;
			left -= buf->len;
			i_head++;
		}
		i->head = i_head;
		i->iov_offset = buf->offset + left;
	}
	i->count -= size;
	/* ... and discard everything past that point */
	pipe_truncate(i);
}

static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{
	struct bvec_iter bi;

	bi.bi_size = i->count;
	bi.bi_bvec_done = i->iov_offset;
	bi.bi_idx = 0;
	bvec_iter_advance(i->bvec, &bi, size);

	i->bvec += bi.bi_idx;
	i->nr_segs -= bi.bi_idx;
	i->count = bi.bi_size;
	i->iov_offset = bi.bi_bvec_done;
}

static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
{
	const struct iovec *iov, *end;

	if (!i->count)
		return;
	i->count -= size;

	size += i->iov_offset; // from beginning of current segment
	for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
		if (likely(size < iov->iov_len))
			break;
		size -= iov->iov_len;
	}
	i->iov_offset = size;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
}

void iov_iter_advance(struct iov_iter *i, size_t size)
{
	if (unlikely(i->count < size))
		size = i->count;
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
		/* iovec and kvec have identical layouts */
		iov_iter_iovec_advance(i, size);
	} else if (iov_iter_is_bvec(i)) {
		iov_iter_bvec_advance(i, size);
	} else if (iov_iter_is_pipe(i)) {
		pipe_advance(i, size);
	} else if (unlikely(iov_iter_is_xarray(i))) {
		i->iov_offset += size;
		i->count -= size;
	} else if (iov_iter_is_discard(i)) {
		i->count -= size;
	}
}
EXPORT_SYMBOL(iov_iter_advance);

void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
	if (!unroll)
		return;
	if (WARN_ON(unroll > MAX_RW_COUNT))
		return;
	i->count += unroll;
	if (unlikely(iov_iter_is_pipe(i))) {
		struct pipe_inode_info *pipe = i->pipe;
		unsigned int p_mask = pipe->ring_size - 1;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset;
		while (1) {
			struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
			size_t n = off - b->offset;
			if (unroll < n) {
				off -= unroll;
				break;
			}
			unroll -= n;
			if (!unroll && i_head == i->start_head) {
				off = 0;
				break;
			}
			i_head--;
			b = &pipe->bufs[i_head & p_mask];
			off = b->offset + b->len;
		}
		i->iov_offset = off;
		i->head = i_head;
		pipe_truncate(i);
		return;
	}
	if (unlikely(iov_iter_is_discard(i)))
		return;
	if (unroll <= i->iov_offset) {
		i->iov_offset -= unroll;
		return;
	}
	unroll -= i->iov_offset;
	if (iov_iter_is_xarray(i)) {
		BUG(); /* We should never go beyond the start of the specified
			* range since we might then be straying into pages that
			* aren't pinned.
			*/
	} else if (iov_iter_is_bvec(i)) {
		const struct bio_vec *bvec = i->bvec;
		while (1) {
			size_t n = (--bvec)->bv_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->bvec = bvec;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	} else { /* same logic for iovec and kvec */
		const struct iovec *iov = i->iov;
		while (1) {
			size_t n = (--iov)->iov_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->iov = iov;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	}
}
EXPORT_SYMBOL(iov_iter_revert);

/*
 * Return the count of just the current iov_iter segment.
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
	if (i->nr_segs > 1) {
		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
			return min(i->count, i->iov->iov_len - i->iov_offset);
		if (iov_iter_is_bvec(i))
			return min(i->count, i->bvec->bv_len - i->iov_offset);
	}
	return i->count;
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
			const struct kvec *kvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_KVEC,
		.data_source = direction,
		.kvec = kvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_kvec);

void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
			const struct bio_vec *bvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_BVEC,
		.data_source = direction,
		.bvec = bvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_bvec);

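/*
 * Usage sketch (illustrative only, not compiled): advance/revert pairing.
 * A caller that consumed part of an iterator speculatively can wind it
 * back with iov_iter_revert(), as long as it never reverts past the point
 * where it started.
 */
#if 0
static ssize_t example_try_copy(void *dst, size_t len, struct iov_iter *from)
{
	size_t copied = copy_from_iter(dst, len, from);

	if (copied < len) {
		/* Pretend the partial transfer never happened. */
		iov_iter_revert(from, copied);
		return -EFAULT;
	}
	return copied;
}
#endif
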
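/*
 * Usage sketch (illustrative only, not compiled): wrapping a page
 * fragment in an ITER_BVEC iterator.  READ means the bvec is the
 * destination of the transfer; the bio_vec must stay alive for as long
 * as the iterator is in use.
 */
#if 0
static void example_bvec_iter(struct bio_vec *bv, struct page *page,
			      unsigned int offset, unsigned int len,
			      struct iov_iter *i)
{
	bv->bv_page = page;
	bv->bv_offset = offset;
	bv->bv_len = len;
	iov_iter_bvec(i, READ, bv, 1, len);
}
#endif
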
void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
			struct pipe_inode_info *pipe,
			size_t count)
{
	BUG_ON(direction != READ);
	WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
	*i = (struct iov_iter){
		.iter_type = ITER_PIPE,
		.data_source = false,
		.pipe = pipe,
		.head = pipe->head,
		.start_head = pipe->head,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_pipe);

/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The caller must prevent the
 * pages from evaporating, either by taking a ref on them or by locking them.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
		     struct xarray *xarray, loff_t start, size_t count)
{
	BUG_ON(direction & ~1);
	*i = (struct iov_iter) {
		.iter_type = ITER_XARRAY,
		.data_source = direction,
		.xarray = xarray,
		.xarray_start = start,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_xarray);

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that just discards everything that's written to it.
 * It's only available as a READ iterator.
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
	BUG_ON(direction != READ);
	*i = (struct iov_iter){
		.iter_type = ITER_DISCARD,
		.data_source = false,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_discard);

static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
	unsigned long res = 0;
	size_t size = i->count;
	size_t skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->iov[k].iov_len - skip;
		if (len) {
			res |= (unsigned long)i->iov[k].iov_base + skip;
			if (len > size)
				len = size;
			res |= len;
			size -= len;
			if (!size)
				break;
		}
	}
	return res;
}

static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
{
	unsigned res = 0;
	size_t size = i->count;
	unsigned skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->bvec[k].bv_len - skip;
		res |= (unsigned long)i->bvec[k].bv_offset + skip;
		if (len > size)
			len = size;
		res |= len;
		size -= len;
		if (!size)
			break;
	}
	return res;
}

unsigned long iov_iter_alignment(const struct iov_iter *i)
{
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_iter_alignment_iovec(i);

	if (iov_iter_is_bvec(i))
		return iov_iter_alignment_bvec(i);

	if (iov_iter_is_pipe(i)) {
		unsigned int p_mask = i->pipe->ring_size - 1;
		size_t size = i->count;

		if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
			return size | i->iov_offset;
		return size;
	}

	if (iov_iter_is_xarray(i))
		return (i->xarray_start + i->iov_offset) | i->count;

	return 0;
}
EXPORT_SYMBOL(iov_iter_alignment);

unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
	unsigned long res = 0;
	unsigned long v = 0;
	size_t size = i->count;
	unsigned k;

	if (WARN_ON(!iter_is_iovec(i)))
		return ~0U;

	for (k = 0; k < i->nr_segs; k++) {
		if (i->iov[k].iov_len) {
			unsigned long base = (unsigned long)i->iov[k].iov_base;
			if (v) // if not the first one
				res |= base | v; // this start | previous end
			v = base + i->iov[k].iov_len;
			if (size <= i->iov[k].iov_len)
				break;
			size -= i->iov[k].iov_len;
		}
	}
	return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

static inline ssize_t __pipe_get_pages(struct iov_iter *i,
				size_t maxsize,
				struct page **pages,
				int iter_head,
				size_t *start)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	ssize_t n = push_pipe(i, maxsize, &iter_head, start);
	if (!n)
		return -EFAULT;

	maxsize = n;
	n += *start;
	while (n > 0) {
		get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
		iter_head++;
		n -= PAGE_SIZE;
	}

	return maxsize;
}

static ssize_t pipe_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	unsigned int iter_head, npages;
	size_t capacity;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	capacity = min(npages, maxpages) * PAGE_SIZE - *start;

	return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
}

static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
					  pgoff_t index, unsigned int nr_pages)
{
	XA_STATE(xas, xa, index);
	struct page *page;
	unsigned int ret = 0;

	rcu_read_lock();
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		if (xas_retry(&xas, page))
			continue;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas))) {
			xas_reset(&xas);
			continue;
		}

		pages[ret] = find_subpage(page, xas.xa_index);
		get_page(pages[ret]);
		if (++ret == nr_pages)
			break;
	}
	rcu_read_unlock();
	return ret;
}

static ssize_t iter_xarray_get_pages(struct iov_iter *i,
				     struct page **pages, size_t maxsize,
				     unsigned maxpages, size_t *_start_offset)
{
	unsigned nr, offset;
	pgoff_t index, count;
	size_t size = maxsize, actual;
	loff_t pos;

	if (!size || !maxpages)
		return 0;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	count = 1;
	if (size > PAGE_SIZE - offset) {
		size -= PAGE_SIZE - offset;
		count += size >> PAGE_SHIFT;
		size &= ~PAGE_MASK;
		if (size)
			count++;
	}

	if (count > maxpages)
		count = maxpages;

	nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
	if (nr == 0)
		return 0;

	actual = PAGE_SIZE * nr;
	actual -= offset;
	if (nr == count && size > 0) {
		unsigned last_offset = (nr > 1) ? 0 : offset;
		actual -= PAGE_SIZE - (last_offset + size);
	}
	return actual;
}

/* must be done on non-empty ITER_IOVEC one */
static unsigned long first_iovec_segment(const struct iov_iter *i,
					 size_t *size, size_t *start,
					 size_t maxsize, unsigned maxpages)
{
	size_t skip;
	long k;

	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
		unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
		size_t len = i->iov[k].iov_len - skip;

		if (unlikely(!len))
			continue;
		if (len > maxsize)
			len = maxsize;
		len += (*start = addr % PAGE_SIZE);
		if (len > maxpages * PAGE_SIZE)
			len = maxpages * PAGE_SIZE;
		*size = len;
		return addr & PAGE_MASK;
	}
	BUG(); // if it had been empty, we wouldn't get called
}

/* must be done on non-empty ITER_BVEC one */
static struct page *first_bvec_segment(const struct iov_iter *i,
				       size_t *size, size_t *start,
				       size_t maxsize, unsigned maxpages)
{
	struct page *page;
	size_t skip = i->iov_offset, len;

	len = i->bvec->bv_len - skip;
	if (len > maxsize)
		len = maxsize;
	skip += i->bvec->bv_offset;
	page = i->bvec->bv_page + skip / PAGE_SIZE;
	len += (*start = skip % PAGE_SIZE);
	if (len > maxpages * PAGE_SIZE)
		len = maxpages * PAGE_SIZE;
	*size = len;
	return page;
}

ssize_t iov_iter_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	size_t len;
	int n, res;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;

	if (likely(iter_is_iovec(i))) {
		unsigned long addr;

		addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		res = get_user_pages_fast(addr, n,
				iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
				pages);
		if (unlikely(res < 0))
			return res;
		return (res == n ? len : res * PAGE_SIZE) - *start;
	}
	if (iov_iter_is_bvec(i)) {
		struct page *page;

		page = first_bvec_segment(i, &len, start, maxsize, maxpages);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		while (n--)
			get_page(*pages++ = page++);
		return len - *start;
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages(i, pages, maxsize, maxpages, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
	return -EFAULT;
}
EXPORT_SYMBOL(iov_iter_get_pages);

static struct page **get_pages_array(size_t n)
{
	return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
}

static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;
	unsigned int iter_head, npages;
	ssize_t n;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	n = npages * PAGE_SIZE - *start;
	if (maxsize > n)
		maxsize = n;
	else
		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
	p = get_pages_array(npages);
	if (!p)
		return -ENOMEM;
	n = __pipe_get_pages(i, maxsize, p, iter_head, start);
	if (n > 0)
		*pages = p;
	else
		kvfree(p);
	return n;
}

static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
					   struct page ***pages, size_t maxsize,
					   size_t *_start_offset)
{
	struct page **p;
	unsigned nr, offset;
	pgoff_t index, count;
	size_t size = maxsize, actual;
	loff_t pos;

	if (!size)
		return 0;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	count = 1;
	if (size > PAGE_SIZE - offset) {
		size -= PAGE_SIZE - offset;
		count += size >> PAGE_SHIFT;
		size &= ~PAGE_MASK;
		if (size)
			count++;
	}

	p = get_pages_array(count);
	if (!p)
		return -ENOMEM;
	*pages = p;

	nr = iter_xarray_populate_pages(p, i->xarray, index, count);
	if (nr == 0)
		return 0;

	actual = PAGE_SIZE * nr;
	actual -= offset;
	if (nr == count && size > 0) {
		unsigned last_offset = (nr > 1) ? 0 : offset;
		actual -= PAGE_SIZE - (last_offset + size);
	}
	return actual;
}

ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;
	size_t len;
	int n, res;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;

	if (likely(iter_is_iovec(i))) {
		unsigned long addr;

		addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		p = get_pages_array(n);
		if (!p)
			return -ENOMEM;
		res = get_user_pages_fast(addr, n,
				iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
		if (unlikely(res < 0)) {
			kvfree(p);
			return res;
		}
		*pages = p;
		return (res == n ? len : res * PAGE_SIZE) - *start;
	}
	if (iov_iter_is_bvec(i)) {
		struct page *page;

		page = first_bvec_segment(i, &len, start, maxsize, ~0U);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		*pages = p = get_pages_array(n);
		if (!p)
			return -ENOMEM;
		while (n--)
			get_page(*p++ = page++);
		return len - *start;
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages_alloc(i, pages, maxsize, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
	return -EFAULT;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc);

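/*
 * Usage sketch (illustrative only, not compiled): pinning the memory
 * behind an iterator for direct I/O.  The return value may cover fewer
 * bytes than asked for, *start is the offset into the first returned
 * page, and the iterator itself is not advanced; callers advance it by
 * however much they actually consume.
 */
#if 0
static ssize_t example_pin_pages(struct iov_iter *i)
{
	struct page *pages[16];
	size_t start;
	ssize_t bytes;
	int k, npages;

	bytes = iov_iter_get_pages(i, pages, SIZE_MAX,
				   ARRAY_SIZE(pages), &start);
	if (bytes <= 0)
		return bytes;
	npages = DIV_ROUND_UP(bytes + start, PAGE_SIZE);

	/* ... issue I/O against pages[0..npages-1] ... */

	for (k = 0; k < npages; k++)
		put_page(pages[k]);
	return bytes;
}
#endif
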
size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	__wsum sum, next;
	sum = *csum;
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off, ({
		next = csum_and_copy_from_user(base, addr + off, len);
		if (next)
			sum = csum_block_add(sum, next, off);
		next ? 0 : len;
	}), ({
		sum = csum_and_memcpy(addr + off, base, len, sum, off);
	})
	)
	*csum = sum;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_from_iter);

size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
			     struct iov_iter *i)
{
	struct csum_state *csstate = _csstate;
	__wsum sum, next;

	if (unlikely(iov_iter_is_pipe(i)))
		return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);

	sum = csum_shift(csstate->csum, csstate->off);
	if (unlikely(iov_iter_is_discard(i))) {
		WARN_ON(1);	/* for now */
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off, ({
		next = csum_and_copy_to_user(addr + off, base, len);
		if (next)
			sum = csum_block_add(sum, next, off);
		next ? 0 : len;
	}), ({
		sum = csum_and_memcpy(base, addr + off, len, sum, off);
	})
	)
	csstate->csum = csum_shift(sum, csstate->off);
	csstate->off += bytes;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_to_iter);

size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
			     struct iov_iter *i)
{
#ifdef CONFIG_CRYPTO_HASH
	struct ahash_request *hash = hashp;
	struct scatterlist sg;
	size_t copied;

	copied = copy_to_iter(addr, bytes, i);
	sg_init_one(&sg, addr, copied);
	ahash_request_set_crypt(hash, &sg, NULL, copied);
	crypto_ahash_update(hash);
	return copied;
#else
	return 0;
#endif
}
EXPORT_SYMBOL(hash_and_copy_to_iter);

static int iov_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct iovec *p;
	int npages = 0;

	for (p = i->iov; size; skip = 0, p++) {
		unsigned offs = offset_in_page(p->iov_base + skip);
		size_t len = min(p->iov_len - skip, size);

		if (len) {
			size -= len;
			npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
			if (unlikely(npages > maxpages))
				return maxpages;
		}
	}
	return npages;
}

static int bvec_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct bio_vec *p;
	int npages = 0;

	for (p = i->bvec; size; skip = 0, p++) {
		unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
		size_t len = min(p->bv_len - skip, size);

		size -= len;
		npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
		if (unlikely(npages > maxpages))
			return maxpages;
	}
	return npages;
}

int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
	if (unlikely(!i->count))
		return 0;
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_npages(i, maxpages);
	if (iov_iter_is_bvec(i))
		return bvec_npages(i, maxpages);
	if (iov_iter_is_pipe(i)) {
		unsigned int iter_head;
		int npages;
		size_t off;

		if (!sanity(i))
			return 0;

		data_start(i, &iter_head, &off);
		/* some of this one + all after this one */
		npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
		return min(npages, maxpages);
	}
	if (iov_iter_is_xarray(i)) {
		unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_npages);

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
	*new = *old;
	if (unlikely(iov_iter_is_pipe(new))) {
		WARN_ON(1);
		return NULL;
	}
	if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
		return NULL;
	if (iov_iter_is_bvec(new))
		return new->bvec = kmemdup(new->bvec,
				    new->nr_segs * sizeof(struct bio_vec),
				    flags);
	else
		/* iovec and kvec have identical layout */
		return new->iov = kmemdup(new->iov,
				   new->nr_segs * sizeof(struct iovec),
				   flags);
}
EXPORT_SYMBOL(dup_iter);

static int copy_compat_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	const struct compat_iovec __user *uiov =
		(const struct compat_iovec __user *)uvec;
	int ret = -EFAULT, i;

	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
		return -EFAULT;

	for (i = 0; i < nr_segs; i++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

		/* check for compat_size_t not fitting in compat_ssize_t .. */
		if (len < 0) {
			ret = -EINVAL;
			goto uaccess_end;
		}
		iov[i].iov_base = compat_ptr(buf);
		iov[i].iov_len = len;
	}

	ret = 0;
uaccess_end:
	user_access_end();
	return ret;
}

static int copy_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	unsigned long seg;

	if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
		return -EFAULT;
	for (seg = 0; seg < nr_segs; seg++) {
		if ((ssize_t)iov[seg].iov_len < 0)
			return -EINVAL;
	}

	return 0;
}

struct iovec *iovec_from_user(const struct iovec __user *uvec,
		unsigned long nr_segs, unsigned long fast_segs,
		struct iovec *fast_iov, bool compat)
{
	struct iovec *iov = fast_iov;
	int ret;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument was
	 * less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return iov;
	if (nr_segs > UIO_MAXIOV)
		return ERR_PTR(-EINVAL);
	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!iov)
			return ERR_PTR(-ENOMEM);
	}

	if (compat)
		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
	else
		ret = copy_iovec_from_user(iov, uvec, nr_segs);
	if (ret) {
		if (iov != fast_iov)
			kfree(iov);
		return ERR_PTR(ret);
	}

	return iov;
}

ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i, bool compat)
{
	ssize_t total_len = 0;
	unsigned long seg;
	struct iovec *iov;

	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
	if (IS_ERR(iov)) {
		*iovp = NULL;
		return PTR_ERR(iov);
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL if
	 * an element length is < 0 when cast to ssize_t or if the total length
	 * would overflow the ssize_t return value of the system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	for (seg = 0; seg < nr_segs; seg++) {
		ssize_t len = (ssize_t)iov[seg].iov_len;

		if (!access_ok(iov[seg].iov_base, len)) {
			if (iov != *iovp)
				kfree(iov);
			*iovp = NULL;
			return -EFAULT;
		}

		if (len > MAX_RW_COUNT - total_len) {
			len = MAX_RW_COUNT - total_len;
			iov[seg].iov_len = len;
		}
		total_len += len;
	}

	iov_iter_init(i, type, iov, nr_segs, total_len);
	if (iov == *iovp)
		*iovp = NULL;
	else
		*iovp = iov;
	return total_len;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in @iovp.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Return: Negative error code on error, bytes imported on success.
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs,
		 struct iovec **iovp, struct iov_iter *i)
{
	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
			      in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);

int import_single_range(int rw, void __user *buf, size_t len,
		 struct iovec *iov, struct iov_iter *i)
{
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	if (unlikely(!access_ok(buf, len)))
		return -EFAULT;

	iov->iov_base = buf;
	iov->iov_len = len;
	iov_iter_init(i, rw, iov, 1, len);
	return 0;
}
EXPORT_SYMBOL(import_single_range);

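/*
 * Usage sketch (illustrative only, not compiled): the typical
 * syscall-side flow -- validate and wrap a single user buffer, then use
 * the copy helpers above to fill it.
 */
#if 0
static ssize_t example_fill_user_buffer(void __user *buf, size_t len,
					const void *src, size_t avail)
{
	struct iovec iov;
	struct iov_iter iter;
	int ret;

	ret = import_single_range(READ, buf, len, &iov, &iter);
	if (ret)
		return ret;
	return copy_to_iter(src, min(avail, iov_iter_count(&iter)), &iter);
}
#endif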