// SPDX-License-Identifier: GPL-2.0-only
#include <crypto/hash.h>
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <net/checksum.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>

#define PIPE_PARANOIA /* for now */

/* covers iovec and kvec alike */
#define iterate_iovec(i, n, __v, __p, skip, STEP) {	\
	size_t left;					\
	size_t wanted = n;				\
	do {						\
		__v.iov_len = min(n, __p->iov_len - skip);	\
		if (likely(__v.iov_len)) {		\
			__v.iov_base = __p->iov_base + skip;	\
			left = (STEP);			\
			__v.iov_len -= left;		\
			skip += __v.iov_len;		\
			n -= __v.iov_len;		\
			if (skip < __p->iov_len)	\
				break;			\
		}					\
		__p++;					\
		skip = 0;				\
	} while (n);					\
	n = wanted - n;					\
}

#define iterate_bvec(i, n, __v, p, skip, STEP) {	\
	size_t wanted = n;				\
	while (n) {					\
		unsigned offset = p->bv_offset + skip;	\
		unsigned left;				\
		void *kaddr = kmap_local_page(p->bv_page +	\
					offset / PAGE_SIZE);	\
		__v.iov_base = kaddr + offset % PAGE_SIZE;	\
		__v.iov_len = min(min(n, p->bv_len - skip),	\
		     (size_t)(PAGE_SIZE - offset % PAGE_SIZE));	\
		left = (STEP);				\
		kunmap_local(kaddr);			\
		__v.iov_len -= left;			\
		skip += __v.iov_len;			\
		if (skip == p->bv_len) {		\
			skip = 0;			\
			p++;				\
		}					\
		n -= __v.iov_len;			\
		if (left)				\
			break;				\
	}						\
	n = wanted - n;					\
}

#define iterate_xarray(i, n, __v, skip, STEP) {		\
	__label__ __out;				\
	struct page *head = NULL;			\
	size_t wanted = n, seg, offset;			\
	loff_t start = i->xarray_start + skip;		\
	pgoff_t index = start >> PAGE_SHIFT;		\
	int j;						\
							\
	XA_STATE(xas, i->xarray, index);		\
							\
	rcu_read_lock();				\
	xas_for_each(&xas, head, ULONG_MAX) {		\
		unsigned left;				\
		if (xas_retry(&xas, head))		\
			continue;			\
		if (WARN_ON(xa_is_value(head)))		\
			break;				\
		if (WARN_ON(PageHuge(head)))		\
			break;				\
		for (j = (head->index < index) ? index - head->index : 0; \
		     j < thp_nr_pages(head); j++) {	\
			void *kaddr = kmap_local_page(head + j);	\
			offset = (i->xarray_start + skip) % PAGE_SIZE;	\
			__v.iov_base = kaddr + offset;	\
			seg = PAGE_SIZE - offset;	\
			__v.iov_len = min(n, seg);	\
			left = (STEP);			\
			kunmap_local(kaddr);		\
			__v.iov_len -= left;		\
			n -= __v.iov_len;		\
			skip += __v.iov_len;		\
			if (left || n == 0)		\
				goto __out;		\
		}					\
	}						\
__out:							\
	rcu_read_unlock();				\
	n = wanted - n;					\
}

#define __iterate_and_advance(i, n, v, I, K) {		\
	if (unlikely(i->count < n))			\
		n = i->count;				\
	if (likely(n)) {				\
		size_t skip = i->iov_offset;		\
		if (likely(iter_is_iovec(i))) {		\
			const struct iovec *iov = i->iov;	\
			struct iovec v;			\
			iterate_iovec(i, n, v, iov, skip, (I))	\
			i->nr_segs -= iov - i->iov;	\
			i->iov = iov;			\
		} else if (iov_iter_is_bvec(i)) {	\
			const struct bio_vec *bvec = i->bvec;	\
			struct kvec v;			\
			iterate_bvec(i, n, v, bvec, skip, (K))	\
			i->nr_segs -= bvec - i->bvec;	\
			i->bvec = bvec;			\
		} else if (iov_iter_is_kvec(i)) {	\
			const struct kvec *kvec = i->kvec;	\
			struct kvec v;			\
			iterate_iovec(i, n, v, kvec, skip, (K))	\
			i->nr_segs -= kvec - i->kvec;	\
			i->kvec = kvec;			\
		} else if (iov_iter_is_xarray(i)) {	\
			struct kvec v;			\
			iterate_xarray(i, n, v, skip, (K))	\
		}					\
		i->count -= n;				\
		i->iov_offset = skip;			\
	}						\
}
#define iterate_and_advance(i, n, v, I, K) \
	__iterate_and_advance(i, n, v, I, ((void)(K),0))

static int copyout(void __user *to, const void *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = raw_copy_to_user(to, from, n);
	}
	return n;
}

static int copyin(void *to, const void __user *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(from, n)) {
		instrument_copy_from_user(to, from, n);
		n = raw_copy_from_user(to, from, n);
	}
	return n;
}

static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t skip, copy, left, wanted;
	const struct iovec *iov;
	char __user *buf;
	void *kaddr, *from;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	might_fault();
	wanted = bytes;
	iov = i->iov;
	skip = i->iov_offset;
	buf = iov->iov_base + skip;
	copy = min(bytes, iov->iov_len - skip);

	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
		kaddr = kmap_atomic(page);
		from = kaddr + offset;

		/* first chunk, usually the only one */
		left = copyout(buf, from, copy);
		copy -= left;
		skip += copy;
		from += copy;
		bytes -= copy;

		while (unlikely(!left && bytes)) {
			iov++;
			buf = iov->iov_base;
			copy = min(bytes, iov->iov_len);
			left = copyout(buf, from, copy);
			copy -= left;
			skip = copy;
			from += copy;
			bytes -= copy;
		}
		if (likely(!bytes)) {
			kunmap_atomic(kaddr);
			goto done;
		}
		offset = from - kaddr;
		buf += copy;
		kunmap_atomic(kaddr);
		copy = min(bytes, iov->iov_len - skip);
	}
	/* Too bad - revert to non-atomic kmap */

	kaddr = kmap(page);
	from = kaddr + offset;
	left = copyout(buf, from, copy);
	copy -= left;
	skip += copy;
	from += copy;
	bytes -= copy;
	while (unlikely(!left && bytes)) {
		iov++;
		buf = iov->iov_base;
		copy = min(bytes, iov->iov_len);
		left = copyout(buf, from, copy);
		copy -= left;
		skip = copy;
		from += copy;
		bytes -= copy;
	}
	kunmap(page);

done:
	if (skip == iov->iov_len) {
		iov++;
		skip = 0;
	}
	i->count -= wanted - bytes;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
	i->iov_offset = skip;
	return wanted - bytes;
}

static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t skip, copy, left, wanted;
	const struct iovec *iov;
	char __user *buf;
	void *kaddr, *to;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	might_fault();
	wanted = bytes;
	iov = i->iov;
	skip = i->iov_offset;
	buf = iov->iov_base + skip;
	copy = min(bytes, iov->iov_len - skip);

	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
		kaddr = kmap_atomic(page);
		to = kaddr + offset;

		/* first chunk, usually the only one */
		left = copyin(to, buf, copy);
		copy -= left;
		skip += copy;
		to += copy;
		bytes -= copy;

		while (unlikely(!left && bytes)) {
			iov++;
			buf = iov->iov_base;
			copy = min(bytes, iov->iov_len);
			left = copyin(to, buf, copy);
			copy -= left;
			skip = copy;
			to += copy;
			bytes -= copy;
		}
		if (likely(!bytes)) {
			kunmap_atomic(kaddr);
			goto done;
		}
		offset = to - kaddr;
		buf += copy;
		kunmap_atomic(kaddr);
		copy = min(bytes, iov->iov_len - skip);
	}
	/* Too bad - revert to non-atomic kmap */

	kaddr = kmap(page);
	to = kaddr + offset;
	left = copyin(to, buf, copy);
	copy -= left;
	skip += copy;
	to += copy;
	bytes -= copy;
	while (unlikely(!left && bytes)) {
		iov++;
		buf = iov->iov_base;
		copy = min(bytes, iov->iov_len);
		left = copyin(to, buf, copy);
		copy -= left;
		skip = copy;
		to += copy;
		bytes -= copy;
	}
	kunmap(page);

done:
	if (skip == iov->iov_len) {
		iov++;
		skip = 0;
	}
	i->count -= wanted - bytes;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
	i->iov_offset = skip;
	return wanted - bytes;
}

#ifdef PIPE_PARANOIA
static bool sanity(const struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_head = pipe->head;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
	unsigned int i_head = i->head;
	unsigned int idx;

	if (i->iov_offset) {
		struct pipe_buffer *p;
		if (unlikely(p_occupancy == 0))
			goto Bad;	// pipe must be non-empty
		if (unlikely(i_head != p_head - 1))
			goto Bad;	// must be at the last buffer...

		p = &pipe->bufs[i_head & p_mask];
		if (unlikely(p->offset + p->len != i->iov_offset))
			goto Bad;	// ... at the end of segment
	} else {
		if (i_head != p_head)
			goto Bad;	// must be right after the last buffer
	}
	return true;
Bad:
	printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
	printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
			p_head, p_tail, pipe->ring_size);
	for (idx = 0; idx < pipe->ring_size; idx++)
		printk(KERN_ERR "[%p %p %d %d]\n",
			pipe->bufs[idx].ops,
			pipe->bufs[idx].page,
			pipe->bufs[idx].offset,
			pipe->bufs[idx].len);
	WARN_ON(1);
	return false;
}
#else
#define sanity(i) true
#endif

static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	struct pipe_buffer *buf;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head = i->head;
	size_t off;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	off = i->iov_offset;
	buf = &pipe->bufs[i_head & p_mask];
	if (off) {
		if (offset == off && buf->page == page) {
			/* merge with the last one */
			buf->len += bytes;
			i->iov_offset += bytes;
			goto out;
		}
		i_head++;
		buf = &pipe->bufs[i_head & p_mask];
	}
	if (pipe_full(i_head, p_tail, pipe->max_usage))
		return 0;

	buf->ops = &page_cache_pipe_buf_ops;
	get_page(page);
	buf->page = page;
	buf->offset = offset;
	buf->len = bytes;

	pipe->head = i_head + 1;
	i->iov_offset = offset + bytes;
	i->head = i_head;
out:
	i->count -= bytes;
	return bytes;
}

/*
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * bytes. For each iovec, fault in each page that constitutes the iovec.
 *
 * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
 * because it is an invalid address).
 */
int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
{
	if (iter_is_iovec(i)) {
		const struct iovec *p;
		size_t skip;

		if (bytes > i->count)
			bytes = i->count;
		for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
			size_t len = min(bytes, p->iov_len - skip);
			int err;

			if (unlikely(!len))
				continue;
			err = fault_in_pages_readable(p->iov_base + skip, len);
			if (unlikely(err))
				return err;
			bytes -= len;
		}
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_fault_in_readable);
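
/*
 * Editor's note: the sketch below is illustrative and not part of the
 * original file.  It shows the usual way a write path pairs
 * iov_iter_fault_in_readable() with an atomic copy: fault the user pages
 * in while sleeping is still allowed, so that the later atomic copy
 * (which cannot handle page faults) is unlikely to come up short.  The
 * function name and parameters are hypothetical.
 */
static size_t __maybe_unused example_fill_page_from_user(struct page *page,
		unsigned offset, size_t bytes, struct iov_iter *from)
{
	/* May sleep; non-zero means the buffer is not accessible. */
	if (unlikely(iov_iter_fault_in_readable(from, bytes)))
		return 0;
	/*
	 * copy_page_from_iter_atomic() (defined later in this file) kmaps
	 * the page and copies without sleeping; a short return means a
	 * page was unmapped between fault-in and copy, so callers loop.
	 */
	return copy_page_from_iter_atomic(page, offset, bytes, from);
}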

void iov_iter_init(struct iov_iter *i, unsigned int direction,
			const struct iovec *iov, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	WARN_ON_ONCE(uaccess_kernel());
	*i = (struct iov_iter) {
		.iter_type = ITER_IOVEC,
		.data_source = direction,
		.iov = iov,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_init);

static inline bool allocated(struct pipe_buffer *buf)
{
	return buf->ops == &default_pipe_buf_ops;
}

static inline void data_start(const struct iov_iter *i,
			      unsigned int *iter_headp, size_t *offp)
{
	unsigned int p_mask = i->pipe->ring_size - 1;
	unsigned int iter_head = i->head;
	size_t off = i->iov_offset;

	if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
		    off == PAGE_SIZE)) {
		iter_head++;
		off = 0;
	}
	*iter_headp = iter_head;
	*offp = off;
}

static size_t push_pipe(struct iov_iter *i, size_t size,
			int *iter_headp, size_t *offp)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int iter_head;
	size_t off;
	ssize_t left;

	if (unlikely(size > i->count))
		size = i->count;
	if (unlikely(!size))
		return 0;

	left = size;
	data_start(i, &iter_head, &off);
	*iter_headp = iter_head;
	*offp = off;
	if (off) {
		left -= PAGE_SIZE - off;
		if (left <= 0) {
			pipe->bufs[iter_head & p_mask].len += size;
			return size;
		}
		pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
		iter_head++;
	}
	while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
		struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
		struct page *page = alloc_page(GFP_USER);
		if (!page)
			break;

		buf->ops = &default_pipe_buf_ops;
		buf->page = page;
		buf->offset = 0;
		buf->len = min_t(ssize_t, left, PAGE_SIZE);
		left -= buf->len;
		iter_head++;
		pipe->head = iter_head;

		if (left == 0)
			return size;
	}
	return size - left;
}

static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
				struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &off);
	if (unlikely(!n))
		return 0;
	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
		i->head = i_head;
		i->iov_offset = off + chunk;
		n -= chunk;
		addr += chunk;
		off = 0;
		i_head++;
	} while (n);
	i->count -= bytes;
	return bytes;
}

static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
			      __wsum sum, size_t off)
{
	__wsum next = csum_partial_copy_nocheck(from, to, len);
	return csum_block_add(sum, next, off);
}

static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
					 struct csum_state *csstate,
					 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	__wsum sum = csstate->csum;
	size_t off = csstate->off;
	unsigned int i_head;
	size_t n, r;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &r);
	if (unlikely(!n))
		return 0;
	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
		char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
		sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
		kunmap_atomic(p);
		i->head = i_head;
		i->iov_offset = r + chunk;
		n -= chunk;
		off += chunk;
		addr += chunk;
		r = 0;
		i_head++;
	} while (n);
	i->count -= bytes;
	csstate->csum = sum;
	csstate->off = off;
	return bytes;
}

size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	const char *from = addr;
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_pipe_to_iter(addr, bytes, i);
	if (iter_is_iovec(i))
		might_fault();
	iterate_and_advance(i, bytes, v,
		copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
		memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_to_iter);

#ifdef CONFIG_ARCH_HAS_COPY_MC
static int copyout_mc(void __user *to, const void *from, size_t n)
{
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = copy_mc_to_user((__force void *) to, from, n);
	}
	return n;
}

static unsigned long copy_mc_to_page(struct page *page, size_t offset,
				     const char *from, size_t len)
{
	unsigned long ret;
	char *to;

	to = kmap_atomic(page);
	ret = copy_mc_to_kernel(to + offset, from, len);
	kunmap_atomic(to);

	return ret;
}

static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
				   struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off, xfer = 0;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &off);
	if (unlikely(!n))
		return 0;
	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		unsigned long rem;

		rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page,
				      off, addr, chunk);
		i->head = i_head;
		i->iov_offset = off + chunk - rem;
		xfer += chunk - rem;
		if (rem)
			break;
		n -= chunk;
		addr += chunk;
		off = 0;
		i_head++;
	} while (n);
	i->count -= xfer;
	return xfer;
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and typical _copy_to_iter() are:
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again. Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 *   a short copy.
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	const char *from = addr;

	if (unlikely(iov_iter_is_pipe(i)))
		return copy_mc_pipe_to_iter(addr, bytes, i);
	if (iter_is_iovec(i))
		might_fault();
	__iterate_and_advance(i, bytes, v,
		copyout_mc(v.iov_base, (from += v.iov_len) - v.iov_len,
			   v.iov_len),
		copy_mc_to_kernel(v.iov_base, (from += v.iov_len)
				  - v.iov_len, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
	char *to = addr;
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	if (iter_is_iovec(i))
		might_fault();
	iterate_and_advance(i, bytes, v,
		copyin((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter);

size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
	char *to = addr;
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v,
		__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
						  v.iov_base, v.iov_len),
		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache. It is differentiated from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed
 * for all iterator types. The _copy_from_iter_nocache() only attempts
 * to bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
	char *to = addr;
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v,
		__copy_from_user_flushcache((to += v.iov_len) - v.iov_len,
					    v.iov_base, v.iov_len),
		memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
				  v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif
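
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * Callers of the copy_{to,from}_iter() family must treat any return
 * value smaller than the requested size as a partial copy (typically a
 * faulting user address) and usually turn it into -EFAULT.  Names here
 * are hypothetical.
 */
static int __maybe_unused example_copy_exactly(void *dst, size_t len,
					       struct iov_iter *from)
{
	if (copy_from_iter(dst, len, from) != len)
		return -EFAULT;	/* short copy: a source page faulted */
	return 0;
}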

static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
	struct page *head;
	size_t v = n + offset;

	/*
	 * The general case needs to access the page order in order
	 * to compute the page size.
	 * However, we mostly deal with order-0 pages and thus can
	 * avoid a possible cache line miss for requests that fit all
	 * page orders.
	 */
	if (n <= v && v <= PAGE_SIZE)
		return true;

	head = compound_head(page);
	v += (page - head) << PAGE_SHIFT;

	if (likely(n <= v && v <= (page_size(head))))
		return true;
	WARN_ON(1);
	return false;
}

static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
				  struct iov_iter *i)
{
	if (likely(iter_is_iovec(i)))
		return copy_page_to_iter_iovec(page, offset, bytes, i);
	if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
		void *kaddr = kmap_atomic(page);
		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
		kunmap_atomic(kaddr);
		return wanted;
	}
	if (iov_iter_is_pipe(i))
		return copy_page_to_iter_pipe(page, offset, bytes, i);
	if (unlikely(iov_iter_is_discard(i))) {
		if (unlikely(i->count < bytes))
			bytes = i->count;
		i->count -= bytes;
		return bytes;
	}
	WARN_ON(1);
	return 0;
}

size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t res = 0;
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;
	page += offset / PAGE_SIZE; // first subpage
	offset %= PAGE_SIZE;
	while (1) {
		size_t n = __copy_page_to_iter(page, offset,
				min(bytes, (size_t)PAGE_SIZE - offset), i);
		res += n;
		bytes -= n;
		if (!bytes || !n)
			break;
		offset += n;
		if (offset == PAGE_SIZE) {
			page++;
			offset = 0;
		}
	}
	return res;
}
EXPORT_SYMBOL(copy_page_to_iter);

size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
			   struct iov_iter *i)
{
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;
	if (likely(iter_is_iovec(i)))
		return copy_page_from_iter_iovec(page, offset, bytes, i);
	if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
		void *kaddr = kmap_atomic(page);
		size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
		kunmap_atomic(kaddr);
		return wanted;
	}
	WARN_ON(1);
	return 0;
}
EXPORT_SYMBOL(copy_page_from_iter);

static size_t pipe_zero(size_t bytes, struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &off);
	if (unlikely(!n))
		return 0;

	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
		i->head = i_head;
		i->iov_offset = off + chunk;
		n -= chunk;
		off = 0;
		i_head++;
	} while (n);
	i->count -= bytes;
	return bytes;
}

size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return pipe_zero(bytes, i);
	iterate_and_advance(i, bytes, v,
		clear_user(v.iov_base, v.iov_len),
		memset(v.iov_base, 0, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL(iov_iter_zero);
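
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * A read path that hits a hole can satisfy the rest of the request with
 * iov_iter_zero() instead of copying from a zeroed buffer; like the copy
 * helpers, it can return less than asked for if the destination faults.
 */
static size_t __maybe_unused example_read_hole(size_t len, struct iov_iter *to)
{
	/* never zero more than the iterator has room for */
	return iov_iter_zero(min(len, iov_iter_count(to)), to);
}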

size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
				  struct iov_iter *i)
{
	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
	if (unlikely(!page_copy_sane(page, offset, bytes))) {
		kunmap_atomic(kaddr);
		return 0;
	}
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		kunmap_atomic(kaddr);
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v,
		copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
		memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
	)
	kunmap_atomic(kaddr);
	return bytes;
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);

static inline void pipe_truncate(struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_tail = pipe->tail;
	unsigned int p_head = pipe->head;
	unsigned int p_mask = pipe->ring_size - 1;

	if (!pipe_empty(p_head, p_tail)) {
		struct pipe_buffer *buf;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset;

		if (off) {
			buf = &pipe->bufs[i_head & p_mask];
			buf->len = off - buf->offset;
			i_head++;
		}
		while (p_head != i_head) {
			p_head--;
			pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
		}

		pipe->head = p_head;
	}
}

static void pipe_advance(struct iov_iter *i, size_t size)
{
	struct pipe_inode_info *pipe = i->pipe;
	if (size) {
		struct pipe_buffer *buf;
		unsigned int p_mask = pipe->ring_size - 1;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset, left = size;

		if (off) /* make it relative to the beginning of buffer */
			left += off - pipe->bufs[i_head & p_mask].offset;
		while (1) {
			buf = &pipe->bufs[i_head & p_mask];
			if (left <= buf->len)
				break;
			left -= buf->len;
			i_head++;
		}
		i->head = i_head;
		i->iov_offset = buf->offset + left;
	}
	i->count -= size;
	/* ... and discard everything past that point */
	pipe_truncate(i);
}

static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{
	struct bvec_iter bi;

	bi.bi_size = i->count;
	bi.bi_bvec_done = i->iov_offset;
	bi.bi_idx = 0;
	bvec_iter_advance(i->bvec, &bi, size);

	i->bvec += bi.bi_idx;
	i->nr_segs -= bi.bi_idx;
	i->count = bi.bi_size;
	i->iov_offset = bi.bi_bvec_done;
}

static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
{
	const struct iovec *iov, *end;

	if (!i->count)
		return;
	i->count -= size;

	size += i->iov_offset; // from beginning of current segment
	for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
		if (likely(size < iov->iov_len))
			break;
		size -= iov->iov_len;
	}
	i->iov_offset = size;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
}

void iov_iter_advance(struct iov_iter *i, size_t size)
{
	if (unlikely(i->count < size))
		size = i->count;
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
		/* iovec and kvec have identical layouts */
		iov_iter_iovec_advance(i, size);
	} else if (iov_iter_is_bvec(i)) {
		iov_iter_bvec_advance(i, size);
	} else if (iov_iter_is_pipe(i)) {
		pipe_advance(i, size);
	} else if (unlikely(iov_iter_is_xarray(i))) {
		i->iov_offset += size;
		i->count -= size;
	} else if (iov_iter_is_discard(i)) {
		i->count -= size;
	}
}
EXPORT_SYMBOL(iov_iter_advance);
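
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * Copy-then-verify, the shape the networking code uses when a checksum
 * is only validated after the data has already been copied out: on
 * failure, iov_iter_revert() (below) rewinds the iterator so the caller
 * can behave as if the copy never happened.  Names are hypothetical.
 */
static int __maybe_unused example_copy_verified(const void *buf, size_t len,
			struct iov_iter *to,
			bool (*verify)(const void *, size_t))
{
	size_t copied = copy_to_iter(buf, len, to);

	if (copied != len || !verify(buf, copied)) {
		iov_iter_revert(to, copied);	/* undo the advance */
		return -EFAULT;
	}
	return 0;
}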

void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
	if (!unroll)
		return;
	if (WARN_ON(unroll > MAX_RW_COUNT))
		return;
	i->count += unroll;
	if (unlikely(iov_iter_is_pipe(i))) {
		struct pipe_inode_info *pipe = i->pipe;
		unsigned int p_mask = pipe->ring_size - 1;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset;
		while (1) {
			struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
			size_t n = off - b->offset;
			if (unroll < n) {
				off -= unroll;
				break;
			}
			unroll -= n;
			if (!unroll && i_head == i->start_head) {
				off = 0;
				break;
			}
			i_head--;
			b = &pipe->bufs[i_head & p_mask];
			off = b->offset + b->len;
		}
		i->iov_offset = off;
		i->head = i_head;
		pipe_truncate(i);
		return;
	}
	if (unlikely(iov_iter_is_discard(i)))
		return;
	if (unroll <= i->iov_offset) {
		i->iov_offset -= unroll;
		return;
	}
	unroll -= i->iov_offset;
	if (iov_iter_is_xarray(i)) {
		BUG(); /* We should never go beyond the start of the specified
			* range since we might then be straying into pages that
			* aren't pinned.
			*/
	} else if (iov_iter_is_bvec(i)) {
		const struct bio_vec *bvec = i->bvec;
		while (1) {
			size_t n = (--bvec)->bv_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->bvec = bvec;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	} else { /* same logics for iovec and kvec */
		const struct iovec *iov = i->iov;
		while (1) {
			size_t n = (--iov)->iov_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->iov = iov;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	}
}
EXPORT_SYMBOL(iov_iter_revert);

/*
 * Return the count of just the current iov_iter segment.
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
	if (i->nr_segs > 1) {
		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
			return min(i->count, i->iov->iov_len - i->iov_offset);
		if (iov_iter_is_bvec(i))
			return min(i->count, i->bvec->bv_len - i->iov_offset);
	}
	return i->count;
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
			const struct kvec *kvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_KVEC,
		.data_source = direction,
		.kvec = kvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_kvec);

void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
			const struct bio_vec *bvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_BVEC,
		.data_source = direction,
		.bvec = bvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_bvec);

void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
			struct pipe_inode_info *pipe,
			size_t count)
{
	BUG_ON(direction != READ);
	WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
	*i = (struct iov_iter){
		.iter_type = ITER_PIPE,
		.data_source = false,
		.pipe = pipe,
		.head = pipe->head,
		.start_head = pipe->head,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_pipe);
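
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * Wrapping a kernel buffer in a single-segment ITER_KVEC iterator, as
 * callers of the constructors above typically do.  READ means the kvec
 * is the destination of the transfer.  The kvec must stay alive for as
 * long as the iterator uses it, which this function guarantees by
 * keeping both in the same frame.
 */
static size_t __maybe_unused example_copy_to_kernel_buf(void *buf, size_t len,
			const void *src, size_t n)
{
	struct kvec kv = { .iov_base = buf, .iov_len = len };
	struct iov_iter iter;

	iov_iter_kvec(&iter, READ, &kv, 1, len);
	return copy_to_iter(src, n, &iter);
}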

/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
		     struct xarray *xarray, loff_t start, size_t count)
{
	BUG_ON(direction & ~1);
	*i = (struct iov_iter) {
		.iter_type = ITER_XARRAY,
		.data_source = direction,
		.xarray = xarray,
		.xarray_start = start,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_xarray);

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that just discards everything that's written to it.
 * It's only available as a READ iterator.
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
	BUG_ON(direction != READ);
	*i = (struct iov_iter){
		.iter_type = ITER_DISCARD,
		.data_source = false,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_discard);

static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
	unsigned long res = 0;
	size_t size = i->count;
	size_t skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->iov[k].iov_len - skip;
		if (len) {
			res |= (unsigned long)i->iov[k].iov_base + skip;
			if (len > size)
				len = size;
			res |= len;
			size -= len;
			if (!size)
				break;
		}
	}
	return res;
}

static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
{
	unsigned res = 0;
	size_t size = i->count;
	unsigned skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->bvec[k].bv_len - skip;
		res |= (unsigned long)i->bvec[k].bv_offset + skip;
		if (len > size)
			len = size;
		res |= len;
		size -= len;
		if (!size)
			break;
	}
	return res;
}

unsigned long iov_iter_alignment(const struct iov_iter *i)
{
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_iter_alignment_iovec(i);

	if (iov_iter_is_bvec(i))
		return iov_iter_alignment_bvec(i);

	if (iov_iter_is_pipe(i)) {
		unsigned int p_mask = i->pipe->ring_size - 1;
		size_t size = i->count;

		if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
			return size | i->iov_offset;
		return size;
	}

	if (iov_iter_is_xarray(i))
		return (i->xarray_start + i->iov_offset) | i->count;

	return 0;
}
EXPORT_SYMBOL(iov_iter_alignment);
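
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * The usual direct-I/O-style use of iov_iter_alignment(): because the
 * result ORs together every segment's address and length, a single test
 * against a mask rejects any misaligned segment.  A power-of-two block
 * size is assumed; the name is hypothetical.
 */
static bool __maybe_unused example_dio_aligned(const struct iov_iter *iter,
					       unsigned int blocksize)
{
	return (iov_iter_alignment(iter) & (blocksize - 1)) == 0;
}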

unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
	unsigned long res = 0;
	unsigned long v = 0;
	size_t size = i->count;
	unsigned k;

	if (WARN_ON(!iter_is_iovec(i)))
		return ~0U;

	for (k = 0; k < i->nr_segs; k++) {
		if (i->iov[k].iov_len) {
			unsigned long base = (unsigned long)i->iov[k].iov_base;
			if (v) // if not the first one
				res |= base | v; // this start | previous end
			v = base + i->iov[k].iov_len;
			if (size <= i->iov[k].iov_len)
				break;
			size -= i->iov[k].iov_len;
		}
	}
	return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

static inline ssize_t __pipe_get_pages(struct iov_iter *i,
				size_t maxsize,
				struct page **pages,
				int iter_head,
				size_t *start)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	ssize_t n = push_pipe(i, maxsize, &iter_head, start);
	if (!n)
		return -EFAULT;

	maxsize = n;
	n += *start;
	while (n > 0) {
		get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
		iter_head++;
		n -= PAGE_SIZE;
	}

	return maxsize;
}

static ssize_t pipe_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	unsigned int iter_head, npages;
	size_t capacity;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	capacity = min(npages, maxpages) * PAGE_SIZE - *start;

	return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
}

static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
					  pgoff_t index, unsigned int nr_pages)
{
	XA_STATE(xas, xa, index);
	struct page *page;
	unsigned int ret = 0;

	rcu_read_lock();
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		if (xas_retry(&xas, page))
			continue;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas))) {
			xas_reset(&xas);
			continue;
		}

		pages[ret] = find_subpage(page, xas.xa_index);
		get_page(pages[ret]);
		if (++ret == nr_pages)
			break;
	}
	rcu_read_unlock();
	return ret;
}

static ssize_t iter_xarray_get_pages(struct iov_iter *i,
				     struct page **pages, size_t maxsize,
				     unsigned maxpages, size_t *_start_offset)
{
	unsigned nr, offset;
	pgoff_t index, count;
	size_t size = maxsize, actual;
	loff_t pos;

	if (!size || !maxpages)
		return 0;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	count = 1;
	if (size > PAGE_SIZE - offset) {
		size -= PAGE_SIZE - offset;
		count += size >> PAGE_SHIFT;
		size &= ~PAGE_MASK;
		if (size)
			count++;
	}

	if (count > maxpages)
		count = maxpages;

	nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
	if (nr == 0)
		return 0;

	actual = PAGE_SIZE * nr;
	actual -= offset;
	if (nr == count && size > 0) {
		unsigned last_offset = (nr > 1) ? 0 : offset;
		actual -= PAGE_SIZE - (last_offset + size);
	}
	return actual;
}

/* must be done on non-empty ITER_IOVEC one */
static unsigned long first_iovec_segment(const struct iov_iter *i,
					 size_t *size, size_t *start,
					 size_t maxsize, unsigned maxpages)
{
	size_t skip;
	long k;

	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
		unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
		size_t len = i->iov[k].iov_len - skip;

		if (unlikely(!len))
			continue;
		if (len > maxsize)
			len = maxsize;
		len += (*start = addr % PAGE_SIZE);
		if (len > maxpages * PAGE_SIZE)
			len = maxpages * PAGE_SIZE;
		*size = len;
		return addr & PAGE_MASK;
	}
	BUG(); // if it had been empty, we wouldn't get called
}

/* must be done on non-empty ITER_BVEC one */
static struct page *first_bvec_segment(const struct iov_iter *i,
				       size_t *size, size_t *start,
				       size_t maxsize, unsigned maxpages)
{
	struct page *page;
	size_t skip = i->iov_offset, len;

	len = i->bvec->bv_len - skip;
	if (len > maxsize)
		len = maxsize;
	skip += i->bvec->bv_offset;
	page = i->bvec->bv_page + skip / PAGE_SIZE;
	len += (*start = skip % PAGE_SIZE);
	if (len > maxpages * PAGE_SIZE)
		len = maxpages * PAGE_SIZE;
	*size = len;
	return page;
}

ssize_t iov_iter_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	size_t len;
	int n, res;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;

	if (likely(iter_is_iovec(i))) {
		unsigned long addr;

		addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		res = get_user_pages_fast(addr, n,
				iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
				pages);
		if (unlikely(res < 0))
			return res;
		return (res == n ? len : res * PAGE_SIZE) - *start;
	}
	if (iov_iter_is_bvec(i)) {
		struct page *page;

		page = first_bvec_segment(i, &len, start, maxsize, maxpages);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		while (n--)
			get_page(*pages++ = page++);
		return len - *start;
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages(i, pages, maxsize, maxpages, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
	return -EFAULT;
}
EXPORT_SYMBOL(iov_iter_get_pages);

static struct page **get_pages_array(size_t n)
{
	return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
}

static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;
	unsigned int iter_head, npages;
	ssize_t n;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	n = npages * PAGE_SIZE - *start;
	if (maxsize > n)
		maxsize = n;
	else
		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
	p = get_pages_array(npages);
	if (!p)
		return -ENOMEM;
	n = __pipe_get_pages(i, maxsize, p, iter_head, start);
	if (n > 0)
		*pages = p;
	else
		kvfree(p);
	return n;
}

static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
					   struct page ***pages, size_t maxsize,
					   size_t *_start_offset)
{
	struct page **p;
	unsigned nr, offset;
	pgoff_t index, count;
	size_t size = maxsize, actual;
	loff_t pos;

	if (!size)
		return 0;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	count = 1;
	if (size > PAGE_SIZE - offset) {
		size -= PAGE_SIZE - offset;
		count += size >> PAGE_SHIFT;
		size &= ~PAGE_MASK;
		if (size)
			count++;
	}

	p = get_pages_array(count);
	if (!p)
		return -ENOMEM;
	*pages = p;

	nr = iter_xarray_populate_pages(p, i->xarray, index, count);
	if (nr == 0)
		return 0;

	actual = PAGE_SIZE * nr;
	actual -= offset;
	if (nr == count && size > 0) {
		unsigned last_offset = (nr > 1) ? 0 : offset;
		actual -= PAGE_SIZE - (last_offset + size);
	}
	return actual;
}

ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;
	size_t len;
	int n, res;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;

	if (likely(iter_is_iovec(i))) {
		unsigned long addr;

		addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		p = get_pages_array(n);
		if (!p)
			return -ENOMEM;
		res = get_user_pages_fast(addr, n,
				iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0, p);
		if (unlikely(res < 0)) {
			kvfree(p);
			return res;
		}
		*pages = p;
		return (res == n ? len : res * PAGE_SIZE) - *start;
	}
	if (iov_iter_is_bvec(i)) {
		struct page *page;

		page = first_bvec_segment(i, &len, start, maxsize, ~0U);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		*pages = p = get_pages_array(n);
		if (!p)
			return -ENOMEM;
		while (n--)
			get_page(*p++ = page++);
		return len - *start;
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages_alloc(i, pages, maxsize, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
	return -EFAULT;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc);

size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	char *to = addr;
	__wsum sum, next;
	size_t off = 0;
	sum = *csum;
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v, ({
		next = csum_and_copy_from_user(v.iov_base,
					       (to += v.iov_len) - v.iov_len,
					       v.iov_len);
		if (next) {
			sum = csum_block_add(sum, next, off);
			off += v.iov_len;
		}
		next ? 0 : v.iov_len;
	}), ({
		sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
				      v.iov_base, v.iov_len,
				      sum, off);
		off += v.iov_len;
	})
	)
	*csum = sum;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_from_iter);

size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
			     struct iov_iter *i)
{
	struct csum_state *csstate = _csstate;
	const char *from = addr;
	__wsum sum, next;
	size_t off;

	if (unlikely(iov_iter_is_pipe(i)))
		return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);

	sum = csum_shift(csstate->csum, csstate->off);
	off = 0;
	if (unlikely(iov_iter_is_discard(i))) {
		WARN_ON(1);	/* for now */
		return 0;
	}
	iterate_and_advance(i, bytes, v, ({
		next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
					     v.iov_base,
					     v.iov_len);
		if (next) {
			sum = csum_block_add(sum, next, off);
			off += v.iov_len;
		}
		next ? 0 : v.iov_len;
	}), ({
		sum = csum_and_memcpy(v.iov_base,
				      (from += v.iov_len) - v.iov_len,
				      v.iov_len, sum, off);
		off += v.iov_len;
	})
	)
	csstate->csum = csum_shift(sum, csstate->off);
	csstate->off += bytes;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_to_iter);

size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
			     struct iov_iter *i)
{
#ifdef CONFIG_CRYPTO_HASH
	struct ahash_request *hash = hashp;
	struct scatterlist sg;
	size_t copied;

	copied = copy_to_iter(addr, bytes, i);
	sg_init_one(&sg, addr, copied);
	ahash_request_set_crypt(hash, &sg, NULL, copied);
	crypto_ahash_update(hash);
	return copied;
#else
	return 0;
#endif
}
EXPORT_SYMBOL(hash_and_copy_to_iter);

static int iov_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct iovec *p;
	int npages = 0;

	for (p = i->iov; size; skip = 0, p++) {
		unsigned offs = offset_in_page(p->iov_base + skip);
		size_t len = min(p->iov_len - skip, size);

		if (len) {
			size -= len;
			npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
			if (unlikely(npages > maxpages))
				return maxpages;
		}
	}
	return npages;
}

static int bvec_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct bio_vec *p;
	int npages = 0;

	for (p = i->bvec; size; skip = 0, p++) {
		unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
		size_t len = min(p->bv_len - skip, size);

		size -= len;
		npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
		if (unlikely(npages > maxpages))
			return maxpages;
	}
	return npages;
}

int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
	if (unlikely(!i->count))
		return 0;
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_npages(i, maxpages);
	if (iov_iter_is_bvec(i))
		return bvec_npages(i, maxpages);
	if (iov_iter_is_pipe(i)) {
		unsigned int iter_head;
		int npages;
		size_t off;

		if (!sanity(i))
			return 0;

		data_start(i, &iter_head, &off);
		/* some of this one + all after this one */
		npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
		return min(npages, maxpages);
	}
	if (iov_iter_is_xarray(i)) {
		unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_npages);

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
	*new = *old;
	if (unlikely(iov_iter_is_pipe(new))) {
		WARN_ON(1);
		return NULL;
	}
	if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
		return NULL;
	if (iov_iter_is_bvec(new))
		return new->bvec = kmemdup(new->bvec,
				    new->nr_segs * sizeof(struct bio_vec),
				    flags);
	else
		/* iovec and kvec have identical layout */
		return new->iov = kmemdup(new->iov,
				   new->nr_segs * sizeof(struct iovec),
				   flags);
}
EXPORT_SYMBOL(dup_iter);
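
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * Typical use of iov_iter_get_pages() (defined above) with a user-backed
 * iterator: the helper takes page references but does not advance the
 * iterator, so the caller advances by the bytes actually covered and
 * must put_page() each page when finished with it.
 */
static ssize_t __maybe_unused example_pin_pages(struct iov_iter *iter,
			struct page **pages, unsigned int maxpages)
{
	size_t offset;
	ssize_t bytes;

	bytes = iov_iter_get_pages(iter, pages, maxpages * PAGE_SIZE,
				   maxpages, &offset);
	if (bytes > 0)
		iov_iter_advance(iter, bytes);
	/* data begins at pages[0] + offset; put_page() each page later */
	return bytes;
}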

static int copy_compat_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	const struct compat_iovec __user *uiov =
		(const struct compat_iovec __user *)uvec;
	int ret = -EFAULT, i;

	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
		return -EFAULT;

	for (i = 0; i < nr_segs; i++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

		/* check for compat_size_t not fitting in compat_ssize_t .. */
		if (len < 0) {
			ret = -EINVAL;
			goto uaccess_end;
		}
		iov[i].iov_base = compat_ptr(buf);
		iov[i].iov_len = len;
	}

	ret = 0;
uaccess_end:
	user_access_end();
	return ret;
}

static int copy_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	unsigned long seg;

	if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
		return -EFAULT;
	for (seg = 0; seg < nr_segs; seg++) {
		if ((ssize_t)iov[seg].iov_len < 0)
			return -EINVAL;
	}

	return 0;
}

struct iovec *iovec_from_user(const struct iovec __user *uvec,
		unsigned long nr_segs, unsigned long fast_segs,
		struct iovec *fast_iov, bool compat)
{
	struct iovec *iov = fast_iov;
	int ret;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument was
	 * less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return iov;
	if (nr_segs > UIO_MAXIOV)
		return ERR_PTR(-EINVAL);
	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!iov)
			return ERR_PTR(-ENOMEM);
	}

	if (compat)
		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
	else
		ret = copy_iovec_from_user(iov, uvec, nr_segs);
	if (ret) {
		if (iov != fast_iov)
			kfree(iov);
		return ERR_PTR(ret);
	}

	return iov;
}

ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i, bool compat)
{
	ssize_t total_len = 0;
	unsigned long seg;
	struct iovec *iov;

	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
	if (IS_ERR(iov)) {
		*iovp = NULL;
		return PTR_ERR(iov);
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL if
	 * an element length is < 0 when cast to ssize_t or if the total length
	 * would overflow the ssize_t return value of the system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	for (seg = 0; seg < nr_segs; seg++) {
		ssize_t len = (ssize_t)iov[seg].iov_len;

		if (!access_ok(iov[seg].iov_base, len)) {
			if (iov != *iovp)
				kfree(iov);
			*iovp = NULL;
			return -EFAULT;
		}

		if (len > MAX_RW_COUNT - total_len) {
			len = MAX_RW_COUNT - total_len;
			iov[seg].iov_len = len;
		}
		total_len += len;
	}

	iov_iter_init(i, type, iov, nr_segs, total_len);
	if (iov == *iovp)
		*iovp = NULL;
	else
		*iovp = iov;
	return total_len;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in @iovp.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs,
		 struct iovec **iovp, struct iov_iter *i)
{
	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
			      in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);
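
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * The canonical import_iovec() calling convention for a readv()-style
 * path.  Per the comment above, kfree(iov) is always safe afterwards,
 * whether or not the on-stack array was used and whether or not the
 * import failed.  Names are hypothetical.
 */
static ssize_t __maybe_unused example_readv(const struct iovec __user *uvec,
			unsigned int nr_segs,
			ssize_t (*do_read)(struct iov_iter *))
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t ret;

	ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
			   &iov, &iter);
	if (ret < 0)
		return ret;

	ret = do_read(&iter);
	kfree(iov);	/* NULL (a no-op) if the on-stack array was used */
	return ret;
}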

int import_single_range(int rw, void __user *buf, size_t len,
		 struct iovec *iov, struct iov_iter *i)
{
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	if (unlikely(!access_ok(buf, len)))
		return -EFAULT;

	iov->iov_base = buf;
	iov->iov_len = len;
	iov_iter_init(i, rw, iov, 1, len);
	return 0;
}
EXPORT_SYMBOL(import_single_range);
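
/*
 * Editor's note: illustrative sketch, not part of the original file.
 * The single-buffer analogue of the import_iovec() pattern: note that
 * the struct iovec must live at least as long as the iterator built on
 * it, so it belongs in the caller's frame.  Names are hypothetical.
 */
static ssize_t __maybe_unused example_single_write(void __user *buf, size_t len,
			ssize_t (*do_write)(struct iov_iter *))
{
	struct iovec iov;
	struct iov_iter iter;
	int ret;

	ret = import_single_range(WRITE, buf, len, &iov, &iter);
	if (unlikely(ret))
		return ret;
	return do_write(&iter);
}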