// SPDX-License-Identifier: GPL-2.0-only
#include <crypto/hash.h>
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <net/checksum.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>

#define PIPE_PARANOIA /* for now */

/* covers iovec and kvec alike */
#define iterate_iovec(i, n, base, len, off, __p, STEP) {	\
	size_t off = 0;						\
	size_t skip = i->iov_offset;				\
	do {							\
		len = min(n, __p->iov_len - skip);		\
		if (likely(len)) {				\
			base = __p->iov_base + skip;		\
			len -= (STEP);				\
			off += len;				\
			skip += len;				\
			n -= len;				\
			if (skip < __p->iov_len)		\
				break;				\
		}						\
		__p++;						\
		skip = 0;					\
	} while (n);						\
	i->iov_offset = skip;					\
	n = off;						\
}

#define iterate_bvec(i, n, base, len, off, p, STEP) {		\
	size_t off = 0;						\
	unsigned skip = i->iov_offset;				\
	while (n) {						\
		unsigned offset = p->bv_offset + skip;		\
		unsigned left;					\
		void *kaddr = kmap_local_page(p->bv_page +	\
					offset / PAGE_SIZE);	\
		base = kaddr + offset % PAGE_SIZE;		\
		len = min(min(n, (size_t)(p->bv_len - skip)),	\
		     (size_t)(PAGE_SIZE - offset % PAGE_SIZE));	\
		left = (STEP);					\
		kunmap_local(kaddr);				\
		len -= left;					\
		off += len;					\
		skip += len;					\
		if (skip == p->bv_len) {			\
			skip = 0;				\
			p++;					\
		}						\
		n -= len;					\
		if (left)					\
			break;					\
	}							\
	i->iov_offset = skip;					\
	n = off;						\
}

#define iterate_xarray(i, n, base, len, __off, STEP) {		\
	__label__ __out;					\
	size_t __off = 0;					\
	struct page *head = NULL;				\
	loff_t start = i->xarray_start + i->iov_offset;		\
	unsigned offset = start % PAGE_SIZE;			\
	pgoff_t index = start / PAGE_SIZE;			\
	int j;							\
								\
	XA_STATE(xas, i->xarray, index);			\
								\
	rcu_read_lock();					\
	xas_for_each(&xas, head, ULONG_MAX) {			\
		unsigned left;					\
		if (xas_retry(&xas, head))			\
			continue;				\
		if (WARN_ON(xa_is_value(head)))			\
			break;					\
		if (WARN_ON(PageHuge(head)))			\
			break;					\
		for (j = (head->index < index) ?		\
			 index - head->index : 0;		\
		     j < thp_nr_pages(head); j++) {		\
			void *kaddr = kmap_local_page(head + j);\
			base = kaddr + offset;			\
			len = PAGE_SIZE - offset;		\
			len = min(n, len);			\
			left = (STEP);				\
			kunmap_local(kaddr);			\
			len -= left;				\
			__off += len;				\
			n -= len;				\
			if (left || n == 0)			\
				goto __out;			\
			offset = 0;				\
		}						\
	}							\
__out:								\
	rcu_read_unlock();					\
	i->iov_offset += __off;					\
	n = __off;						\
}

#define __iterate_and_advance(i, n, base, len, off, I, K) {	\
	if (unlikely(i->count < n))				\
		n = i->count;					\
	if (likely(n)) {					\
		if (likely(iter_is_iovec(i))) {			\
			const struct iovec *iov = i->iov;	\
			void __user *base;			\
			size_t len;				\
			iterate_iovec(i, n, base, len, off,	\
						iov, (I))	\
			i->nr_segs -= iov - i->iov;		\
			i->iov = iov;				\
		} else if (iov_iter_is_bvec(i)) {		\
			const struct bio_vec *bvec = i->bvec;	\
			void *base;				\
			size_t len;				\
			iterate_bvec(i, n, base, len, off,	\
						bvec, (K))	\
			i->nr_segs -= bvec - i->bvec;		\
			i->bvec = bvec;				\
		} else if (iov_iter_is_kvec(i)) {		\
			const struct kvec *kvec = i->kvec;	\
			void *base;				\
			size_t len;				\
			iterate_iovec(i, n, base, len, off,	\
						kvec, (K))	\
			i->nr_segs -= kvec - i->kvec;		\
			i->kvec = kvec;				\
		} else if (iov_iter_is_xarray(i)) {		\
			void *base;				\
			size_t len;				\
			iterate_xarray(i, n, base, len, off,	\
							(K))	\
		}						\
		i->count -= n;					\
	}							\
}
#define iterate_and_advance(i, n, base, len, off, I, K) \
	__iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))
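
/*
 * Note on the STEP arguments (I for user pointers, K for kernel ones): each
 * step expression consumes (base, len) and must evaluate to the number of
 * bytes it could *not* process, so 0 means "all len bytes handled".  The
 * macros subtract that residue from len, stop the walk early and leave the
 * iterator positioned at the failure point; see copyout()/copyin() below,
 * which return the uncopied byte count for exactly this purpose.
 */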

static int copyout(void __user *to, const void *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = raw_copy_to_user(to, from, n);
	}
	return n;
}

static int copyin(void *to, const void __user *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(from, n)) {
		instrument_copy_from_user(to, from, n);
		n = raw_copy_from_user(to, from, n);
	}
	return n;
}

static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t skip, copy, left, wanted;
	const struct iovec *iov;
	char __user *buf;
	void *kaddr, *from;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	might_fault();
	wanted = bytes;
	iov = i->iov;
	skip = i->iov_offset;
	buf = iov->iov_base + skip;
	copy = min(bytes, iov->iov_len - skip);

	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
		kaddr = kmap_atomic(page);
		from = kaddr + offset;

		/* first chunk, usually the only one */
		left = copyout(buf, from, copy);
		copy -= left;
		skip += copy;
		from += copy;
		bytes -= copy;

		while (unlikely(!left && bytes)) {
			iov++;
			buf = iov->iov_base;
			copy = min(bytes, iov->iov_len);
			left = copyout(buf, from, copy);
			copy -= left;
			skip = copy;
			from += copy;
			bytes -= copy;
		}
		if (likely(!bytes)) {
			kunmap_atomic(kaddr);
			goto done;
		}
		offset = from - kaddr;
		buf += copy;
		kunmap_atomic(kaddr);
		copy = min(bytes, iov->iov_len - skip);
	}
	/* Too bad - revert to non-atomic kmap */

	kaddr = kmap(page);
	from = kaddr + offset;
	left = copyout(buf, from, copy);
	copy -= left;
	skip += copy;
	from += copy;
	bytes -= copy;
	while (unlikely(!left && bytes)) {
		iov++;
		buf = iov->iov_base;
		copy = min(bytes, iov->iov_len);
		left = copyout(buf, from, copy);
		copy -= left;
		skip = copy;
		from += copy;
		bytes -= copy;
	}
	kunmap(page);

done:
	if (skip == iov->iov_len) {
		iov++;
		skip = 0;
	}
	i->count -= wanted - bytes;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
	i->iov_offset = skip;
	return wanted - bytes;
}

static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t skip, copy, left, wanted;
	const struct iovec *iov;
	char __user *buf;
	void *kaddr, *to;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	might_fault();
	wanted = bytes;
	iov = i->iov;
	skip = i->iov_offset;
	buf = iov->iov_base + skip;
	copy = min(bytes, iov->iov_len - skip);

	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {
		kaddr = kmap_atomic(page);
		to = kaddr + offset;

		/* first chunk, usually the only one */
		left = copyin(to, buf, copy);
		copy -= left;
		skip += copy;
		to += copy;
		bytes -= copy;

		while (unlikely(!left && bytes)) {
			iov++;
			buf = iov->iov_base;
			copy = min(bytes, iov->iov_len);
			left = copyin(to, buf, copy);
			copy -= left;
			skip = copy;
			to += copy;
			bytes -= copy;
		}
		if (likely(!bytes)) {
			kunmap_atomic(kaddr);
			goto done;
		}
		offset = to - kaddr;
		buf += copy;
		kunmap_atomic(kaddr);
		copy = min(bytes, iov->iov_len - skip);
	}
	/* Too bad - revert to non-atomic kmap */

	kaddr = kmap(page);
	to = kaddr + offset;
	left = copyin(to, buf, copy);
	copy -= left;
	skip += copy;
	to += copy;
	bytes -= copy;
	while (unlikely(!left && bytes)) {
		iov++;
		buf = iov->iov_base;
		copy = min(bytes, iov->iov_len);
		left = copyin(to, buf, copy);
		copy -= left;
		skip = copy;
		to += copy;
		bytes -= copy;
	}
	kunmap(page);

done:
	if (skip == iov->iov_len) {
		iov++;
		skip = 0;
	}
	i->count -= wanted - bytes;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
	i->iov_offset = skip;
	return wanted - bytes;
}

#ifdef PIPE_PARANOIA
static bool sanity(const struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_head = pipe->head;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
	unsigned int i_head = i->head;
	unsigned int idx;

	if (i->iov_offset) {
		struct pipe_buffer *p;
		if (unlikely(p_occupancy == 0))
			goto Bad;	// pipe must be non-empty
		if (unlikely(i_head != p_head - 1))
			goto Bad;	// must be at the last buffer...

		p = &pipe->bufs[i_head & p_mask];
		if (unlikely(p->offset + p->len != i->iov_offset))
			goto Bad;	// ... at the end of segment
	} else {
		if (i_head != p_head)
			goto Bad;	// must be right after the last buffer
	}
	return true;
Bad:
	printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
	printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
			p_head, p_tail, pipe->ring_size);
	for (idx = 0; idx < pipe->ring_size; idx++)
		printk(KERN_ERR "[%p %p %d %d]\n",
			pipe->bufs[idx].ops,
			pipe->bufs[idx].page,
			pipe->bufs[idx].offset,
			pipe->bufs[idx].len);
	WARN_ON(1);
	return false;
}
#else
#define sanity(i) true
#endif

static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	struct pipe_buffer *buf;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head = i->head;
	size_t off;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	off = i->iov_offset;
	buf = &pipe->bufs[i_head & p_mask];
	if (off) {
		if (offset == off && buf->page == page) {
			/* merge with the last one */
			buf->len += bytes;
			i->iov_offset += bytes;
			goto out;
		}
		i_head++;
		buf = &pipe->bufs[i_head & p_mask];
	}
	if (pipe_full(i_head, p_tail, pipe->max_usage))
		return 0;

	buf->ops = &page_cache_pipe_buf_ops;
	buf->flags = 0;
	get_page(page);
	buf->page = page;
	buf->offset = offset;
	buf->len = bytes;

	pipe->head = i_head + 1;
	i->iov_offset = offset + bytes;
	i->head = i_head;
out:
	i->count -= bytes;
	return bytes;
}

/*
 * fault_in_iov_iter_readable - fault in iov iterator for reading
 * @i: iterator
 * @size: maximum length
 *
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * @size.  For each iovec, fault in each page that constitutes the iovec.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 *
 * Always returns 0 for non-userspace iterators.
 */
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
{
	if (iter_is_iovec(i)) {
		size_t count = min(size, iov_iter_count(i));
		const struct iovec *p;
		size_t skip;

		size -= count;
		for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
			size_t len = min(count, p->iov_len - skip);
			size_t ret;

			if (unlikely(!len))
				continue;
			ret = fault_in_readable(p->iov_base + skip, len);
			count -= len - ret;
			if (ret)
				break;
		}
		return count + size;
	}
	return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_readable);
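
/*
 * Illustrative use (a sketch, not part of this file): callers that must not
 * fault while holding locks typically pre-fault the user pages, attempt an
 * atomic copy, and retry on a short copy, along the lines of
 * generic_perform_write():
 *
 *	if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes))
 *		return -EFAULT;		// nothing could be faulted in
 *	...
 *	copied = copy_page_from_iter_atomic(page, offset, bytes, i);
 *	// on a short copy: drop the lock, fault in again, retry
 */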

void iov_iter_init(struct iov_iter *i, unsigned int direction,
			const struct iovec *iov, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter) {
		.iter_type = ITER_IOVEC,
		.data_source = direction,
		.iov = iov,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_init);

static inline bool allocated(struct pipe_buffer *buf)
{
	return buf->ops == &default_pipe_buf_ops;
}

static inline void data_start(const struct iov_iter *i,
			      unsigned int *iter_headp, size_t *offp)
{
	unsigned int p_mask = i->pipe->ring_size - 1;
	unsigned int iter_head = i->head;
	size_t off = i->iov_offset;

	if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
		    off == PAGE_SIZE)) {
		iter_head++;
		off = 0;
	}
	*iter_headp = iter_head;
	*offp = off;
}

static size_t push_pipe(struct iov_iter *i, size_t size,
			int *iter_headp, size_t *offp)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int iter_head;
	size_t off;
	ssize_t left;

	if (unlikely(size > i->count))
		size = i->count;
	if (unlikely(!size))
		return 0;

	left = size;
	data_start(i, &iter_head, &off);
	*iter_headp = iter_head;
	*offp = off;
	if (off) {
		left -= PAGE_SIZE - off;
		if (left <= 0) {
			pipe->bufs[iter_head & p_mask].len += size;
			return size;
		}
		pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
		iter_head++;
	}
	while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
		struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
		struct page *page = alloc_page(GFP_USER);
		if (!page)
			break;

		buf->ops = &default_pipe_buf_ops;
		buf->flags = 0;
		buf->page = page;
		buf->offset = 0;
		buf->len = min_t(ssize_t, left, PAGE_SIZE);
		left -= buf->len;
		iter_head++;
		pipe->head = iter_head;

		if (left == 0)
			return size;
	}
	return size - left;
}

static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
				struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &off);
	if (unlikely(!n))
		return 0;
	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
		i->head = i_head;
		i->iov_offset = off + chunk;
		n -= chunk;
		addr += chunk;
		off = 0;
		i_head++;
	} while (n);
	i->count -= bytes;
	return bytes;
}

static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
			      __wsum sum, size_t off)
{
	__wsum next = csum_partial_copy_nocheck(from, to, len);
	return csum_block_add(sum, next, off);
}

static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
					 struct iov_iter *i, __wsum *sump)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	__wsum sum = *sump;
	size_t off = 0;
	unsigned int i_head;
	size_t r;

	if (!sanity(i))
		return 0;

	bytes = push_pipe(i, bytes, &i_head, &r);
	while (bytes) {
		size_t chunk = min_t(size_t, bytes, PAGE_SIZE - r);
		char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
		sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off);
		kunmap_local(p);
		i->head = i_head;
		i->iov_offset = r + chunk;
		bytes -= chunk;
		off += chunk;
		r = 0;
		i_head++;
	}
	*sump = sum;
	i->count -= off;
	return off;
}

size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_pipe_to_iter(addr, bytes, i);
	if (iter_is_iovec(i))
		might_fault();
	iterate_and_advance(i, bytes, base, len, off,
		copyout(base, addr + off, len),
		memcpy(base, addr + off, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_to_iter);
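
/*
 * Example (illustrative sketch; my_dev and its buffer are hypothetical):
 * read-like paths hand a destination iterator to the producer and accept
 * short copies when a user segment faults:
 *
 *	static ssize_t my_dev_read(struct my_dev *dev, struct iov_iter *to)
 *	{
 *		size_t n = copy_to_iter(dev->buf, dev->len, to);
 *
 *		return n ? n : -EFAULT;	// n < dev->len means a fault mid-copy
 *	}
 */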

#ifdef CONFIG_ARCH_HAS_COPY_MC
static int copyout_mc(void __user *to, const void *from, size_t n)
{
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = copy_mc_to_user((__force void *) to, from, n);
	}
	return n;
}

static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
				struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off, xfer = 0;

	if (!sanity(i))
		return 0;

	n = push_pipe(i, bytes, &i_head, &off);
	while (n) {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
		unsigned long rem;
		rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
		chunk -= rem;
		kunmap_local(p);
		i->head = i_head;
		i->iov_offset = off + chunk;
		xfer += chunk;
		if (rem)
			break;
		n -= chunk;
		off = 0;
		i_head++;
	}
	i->count -= xfer;
	return xfer;
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer).  Upon #MC, read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and typical _copy_to_iter() are:
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again.  Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 *   a short copy.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_mc_pipe_to_iter(addr, bytes, i);
	if (iter_is_iovec(i))
		might_fault();
	__iterate_and_advance(i, bytes, base, len, off,
		copyout_mc(base, addr + off, len),
		copy_mc_to_kernel(base, addr + off, len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	if (iter_is_iovec(i))
		might_fault();
	iterate_and_advance(i, bytes, base, len, off,
		copyin(addr + off, base, len),
		memcpy(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter);

size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		__copy_from_user_inatomic_nocache(addr + off, base, len),
		memcpy(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache.  It is differentiated from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
 * all iterator types.  The _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		__copy_from_user_flushcache(addr + off, base, len),
		memcpy_flushcache(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif

static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
	struct page *head;
	size_t v = n + offset;

	/*
	 * The general case needs to access the page order in order
	 * to compute the page size.
	 * However, we mostly deal with order-0 pages and thus can
	 * avoid a possible cache line miss for requests that fit all
	 * page orders.
	 */
	if (n <= v && v <= PAGE_SIZE)
		return true;

	head = compound_head(page);
	v += (page - head) << PAGE_SHIFT;

	if (likely(n <= v && v <= (page_size(head))))
		return true;
	WARN_ON(1);
	return false;
}

static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
				  struct iov_iter *i)
{
	if (likely(iter_is_iovec(i)))
		return copy_page_to_iter_iovec(page, offset, bytes, i);
	if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
		void *kaddr = kmap_local_page(page);
		size_t wanted = _copy_to_iter(kaddr + offset, bytes, i);
		kunmap_local(kaddr);
		return wanted;
	}
	if (iov_iter_is_pipe(i))
		return copy_page_to_iter_pipe(page, offset, bytes, i);
	if (unlikely(iov_iter_is_discard(i))) {
		if (unlikely(i->count < bytes))
			bytes = i->count;
		i->count -= bytes;
		return bytes;
	}
	WARN_ON(1);
	return 0;
}

size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t res = 0;
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;
	page += offset / PAGE_SIZE; // first subpage
	offset %= PAGE_SIZE;
	while (1) {
		size_t n = __copy_page_to_iter(page, offset,
				min(bytes, (size_t)PAGE_SIZE - offset), i);
		res += n;
		bytes -= n;
		if (!bytes || !n)
			break;
		offset += n;
		if (offset == PAGE_SIZE) {
			page++;
			offset = 0;
		}
	}
	return res;
}
EXPORT_SYMBOL(copy_page_to_iter);
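
/*
 * Illustrative caller (a sketch along the lines of filemap_read()): page
 * cache readers push whole pages into the iterator and treat a short copy
 * with bytes still wanted as a fault in the user buffer:
 *
 *	size_t done = copy_page_to_iter(page, offset, want, to);
 *
 *	if (done < want && iov_iter_count(to))
 *		return -EFAULT;
 */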

size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;
	if (likely(iter_is_iovec(i)))
		return copy_page_from_iter_iovec(page, offset, bytes, i);
	if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
		void *kaddr = kmap_local_page(page);
		size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
		kunmap_local(kaddr);
		return wanted;
	}
	WARN_ON(1);
	return 0;
}
EXPORT_SYMBOL(copy_page_from_iter);

static size_t pipe_zero(size_t bytes, struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &off);
	if (unlikely(!n))
		return 0;

	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
		memset(p + off, 0, chunk);
		kunmap_local(p);
		i->head = i_head;
		i->iov_offset = off + chunk;
		n -= chunk;
		off = 0;
		i_head++;
	} while (n);
	i->count -= bytes;
	return bytes;
}

size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return pipe_zero(bytes, i);
	iterate_and_advance(i, bytes, base, len, count,
		clear_user(base, len),
		memset(base, 0, len)
	)

	return bytes;
}
EXPORT_SYMBOL(iov_iter_zero);

size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
				  struct iov_iter *i)
{
	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
	if (unlikely(!page_copy_sane(page, offset, bytes))) {
		kunmap_atomic(kaddr);
		return 0;
	}
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		kunmap_atomic(kaddr);
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		copyin(p + off, base, len),
		memcpy(p + off, base, len)
	)
	kunmap_atomic(kaddr);
	return bytes;
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);

static inline void pipe_truncate(struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_tail = pipe->tail;
	unsigned int p_head = pipe->head;
	unsigned int p_mask = pipe->ring_size - 1;

	if (!pipe_empty(p_head, p_tail)) {
		struct pipe_buffer *buf;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset;

		if (off) {
			buf = &pipe->bufs[i_head & p_mask];
			buf->len = off - buf->offset;
			i_head++;
		}
		while (p_head != i_head) {
			p_head--;
			pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
		}

		pipe->head = p_head;
	}
}

static void pipe_advance(struct iov_iter *i, size_t size)
{
	struct pipe_inode_info *pipe = i->pipe;
	if (size) {
		struct pipe_buffer *buf;
		unsigned int p_mask = pipe->ring_size - 1;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset, left = size;

		if (off) /* make it relative to the beginning of buffer */
			left += off - pipe->bufs[i_head & p_mask].offset;
		while (1) {
			buf = &pipe->bufs[i_head & p_mask];
			if (left <= buf->len)
				break;
			left -= buf->len;
			i_head++;
		}
		i->head = i_head;
		i->iov_offset = buf->offset + left;
	}
	i->count -= size;
	/* ... and discard everything past that point */
	pipe_truncate(i);
}

static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{
	struct bvec_iter bi;

	bi.bi_size = i->count;
	bi.bi_bvec_done = i->iov_offset;
	bi.bi_idx = 0;
	bvec_iter_advance(i->bvec, &bi, size);

	i->bvec += bi.bi_idx;
	i->nr_segs -= bi.bi_idx;
	i->count = bi.bi_size;
	i->iov_offset = bi.bi_bvec_done;
}

static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
{
	const struct iovec *iov, *end;

	if (!i->count)
		return;
	i->count -= size;

	size += i->iov_offset; // from beginning of current segment
	for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
		if (likely(size < iov->iov_len))
			break;
		size -= iov->iov_len;
	}
	i->iov_offset = size;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
}

void iov_iter_advance(struct iov_iter *i, size_t size)
{
	if (unlikely(i->count < size))
		size = i->count;
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
		/* iovec and kvec have identical layouts */
		iov_iter_iovec_advance(i, size);
	} else if (iov_iter_is_bvec(i)) {
		iov_iter_bvec_advance(i, size);
	} else if (iov_iter_is_pipe(i)) {
		pipe_advance(i, size);
	} else if (unlikely(iov_iter_is_xarray(i))) {
		i->iov_offset += size;
		i->count -= size;
	} else if (iov_iter_is_discard(i)) {
		i->count -= size;
	}
}
EXPORT_SYMBOL(iov_iter_advance);

void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
	if (!unroll)
		return;
	if (WARN_ON(unroll > MAX_RW_COUNT))
		return;
	i->count += unroll;
	if (unlikely(iov_iter_is_pipe(i))) {
		struct pipe_inode_info *pipe = i->pipe;
		unsigned int p_mask = pipe->ring_size - 1;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset;
		while (1) {
			struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
			size_t n = off - b->offset;
			if (unroll < n) {
				off -= unroll;
				break;
			}
			unroll -= n;
			if (!unroll && i_head == i->start_head) {
				off = 0;
				break;
			}
			i_head--;
			b = &pipe->bufs[i_head & p_mask];
			off = b->offset + b->len;
		}
		i->iov_offset = off;
		i->head = i_head;
		pipe_truncate(i);
		return;
	}
	if (unlikely(iov_iter_is_discard(i)))
		return;
	if (unroll <= i->iov_offset) {
		i->iov_offset -= unroll;
		return;
	}
	unroll -= i->iov_offset;
	if (iov_iter_is_xarray(i)) {
		BUG(); /* We should never go beyond the start of the specified
			* range since we might then be straying into pages that
			* aren't pinned.
			*/
	} else if (iov_iter_is_bvec(i)) {
		const struct bio_vec *bvec = i->bvec;
		while (1) {
			size_t n = (--bvec)->bv_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->bvec = bvec;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	} else { /* same logic for iovec and kvec */
		const struct iovec *iov = i->iov;
		while (1) {
			size_t n = (--iov)->iov_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->iov = iov;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	}
}
EXPORT_SYMBOL(iov_iter_revert);
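
/*
 * Illustrative pairing (sketch): callers that consume an iterator
 * speculatively use iov_iter_revert() to put the bytes back on failure,
 * so a later retry sees the original data:
 *
 *	size_t copied = copy_from_iter(buf, len, i);	// advances i
 *
 *	if (send_failed) {				// hypothetical condition
 *		iov_iter_revert(i, copied);
 *		return -EAGAIN;
 *	}
 */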

/*
 * Return the count of just the current iov_iter segment.
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
	if (i->nr_segs > 1) {
		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
			return min(i->count, i->iov->iov_len - i->iov_offset);
		if (iov_iter_is_bvec(i))
			return min(i->count, i->bvec->bv_len - i->iov_offset);
	}
	return i->count;
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
			const struct kvec *kvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_KVEC,
		.data_source = direction,
		.kvec = kvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_kvec);

void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
			const struct bio_vec *bvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_BVEC,
		.data_source = direction,
		.bvec = bvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_bvec);
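
/*
 * Example (illustrative; my_send() is hypothetical): wrapping a kernel
 * buffer in a single-segment kvec iterator lets iov_iter-based consumers
 * take kernel memory as easily as user memory:
 *
 *	struct kvec kv = { .iov_base = kbuf, .iov_len = klen };
 *	struct iov_iter from;
 *
 *	iov_iter_kvec(&from, WRITE, &kv, 1, klen);
 *	ret = my_send(sock, &from);	// WRITE: the iterator is a data source
 */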

void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
			struct pipe_inode_info *pipe,
			size_t count)
{
	BUG_ON(direction != READ);
	WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
	*i = (struct iov_iter){
		.iter_type = ITER_PIPE,
		.data_source = false,
		.pipe = pipe,
		.head = pipe->head,
		.start_head = pipe->head,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_pipe);

/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
		     struct xarray *xarray, loff_t start, size_t count)
{
	BUG_ON(direction & ~1);
	*i = (struct iov_iter) {
		.iter_type = ITER_XARRAY,
		.data_source = direction,
		.xarray = xarray,
		.xarray_start = start,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_xarray);

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that just discards everything that's written to it.
 * It's only available as a READ iterator.
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
	BUG_ON(direction != READ);
	*i = (struct iov_iter){
		.iter_type = ITER_DISCARD,
		.data_source = false,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_discard);
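
/*
 * Example (illustrative; my_recv() is hypothetical): a discard iterator
 * drains bytes a transport must consume but the caller does not want,
 * e.g. skipping an unneeded payload:
 *
 *	struct iov_iter null;
 *
 *	iov_iter_discard(&null, READ, to_skip);
 *	ret = my_recv(sock, &null);	// the copied data goes nowhere
 */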

static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
	unsigned long res = 0;
	size_t size = i->count;
	size_t skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->iov[k].iov_len - skip;
		if (len) {
			res |= (unsigned long)i->iov[k].iov_base + skip;
			if (len > size)
				len = size;
			res |= len;
			size -= len;
			if (!size)
				break;
		}
	}
	return res;
}

static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
{
	unsigned res = 0;
	size_t size = i->count;
	unsigned skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->bvec[k].bv_len - skip;
		res |= (unsigned long)i->bvec[k].bv_offset + skip;
		if (len > size)
			len = size;
		res |= len;
		size -= len;
		if (!size)
			break;
	}
	return res;
}

unsigned long iov_iter_alignment(const struct iov_iter *i)
{
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_iter_alignment_iovec(i);

	if (iov_iter_is_bvec(i))
		return iov_iter_alignment_bvec(i);

	if (iov_iter_is_pipe(i)) {
		unsigned int p_mask = i->pipe->ring_size - 1;
		size_t size = i->count;

		if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
			return size | i->iov_offset;
		return size;
	}

	if (iov_iter_is_xarray(i))
		return (i->xarray_start + i->iov_offset) | i->count;

	return 0;
}
EXPORT_SYMBOL(iov_iter_alignment);

unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
	unsigned long res = 0;
	unsigned long v = 0;
	size_t size = i->count;
	unsigned k;

	if (WARN_ON(!iter_is_iovec(i)))
		return ~0U;

	for (k = 0; k < i->nr_segs; k++) {
		if (i->iov[k].iov_len) {
			unsigned long base = (unsigned long)i->iov[k].iov_base;
			if (v) // if not the first one
				res |= base | v; // this start | previous end
			v = base + i->iov[k].iov_len;
			if (size <= i->iov[k].iov_len)
				break;
			size -= i->iov[k].iov_len;
		}
	}
	return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

static inline ssize_t __pipe_get_pages(struct iov_iter *i,
				size_t maxsize,
				struct page **pages,
				int iter_head,
				size_t *start)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	ssize_t n = push_pipe(i, maxsize, &iter_head, start);
	if (!n)
		return -EFAULT;

	maxsize = n;
	n += *start;
	while (n > 0) {
		get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
		iter_head++;
		n -= PAGE_SIZE;
	}

	return maxsize;
}

static ssize_t pipe_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	unsigned int iter_head, npages;
	size_t capacity;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	capacity = min(npages, maxpages) * PAGE_SIZE - *start;

	return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
}

static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
					  pgoff_t index, unsigned int nr_pages)
{
	XA_STATE(xas, xa, index);
	struct page *page;
	unsigned int ret = 0;

	rcu_read_lock();
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		if (xas_retry(&xas, page))
			continue;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas))) {
			xas_reset(&xas);
			continue;
		}

		pages[ret] = find_subpage(page, xas.xa_index);
		get_page(pages[ret]);
		if (++ret == nr_pages)
			break;
	}
	rcu_read_unlock();
	return ret;
}

static ssize_t iter_xarray_get_pages(struct iov_iter *i,
				     struct page **pages, size_t maxsize,
				     unsigned maxpages, size_t *_start_offset)
{
	unsigned nr, offset;
	pgoff_t index, count;
	size_t size = maxsize, actual;
	loff_t pos;

	if (!size || !maxpages)
		return 0;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	count = 1;
	if (size > PAGE_SIZE - offset) {
		size -= PAGE_SIZE - offset;
		count += size >> PAGE_SHIFT;
		size &= ~PAGE_MASK;
		if (size)
			count++;
	}

	if (count > maxpages)
		count = maxpages;

	nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
	if (nr == 0)
		return 0;

	actual = PAGE_SIZE * nr;
	actual -= offset;
	if (nr == count && size > 0) {
		unsigned last_offset = (nr > 1) ? 0 : offset;
		actual -= PAGE_SIZE - (last_offset + size);
	}
	return actual;
}

/* must be done on non-empty ITER_IOVEC one */
static unsigned long first_iovec_segment(const struct iov_iter *i,
					 size_t *size, size_t *start,
					 size_t maxsize, unsigned maxpages)
{
	size_t skip;
	long k;

	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
		unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
		size_t len = i->iov[k].iov_len - skip;

		if (unlikely(!len))
			continue;
		if (len > maxsize)
			len = maxsize;
		len += (*start = addr % PAGE_SIZE);
		if (len > maxpages * PAGE_SIZE)
			len = maxpages * PAGE_SIZE;
		*size = len;
		return addr & PAGE_MASK;
	}
	BUG(); // if it had been empty, we wouldn't get called
}

/* must be done on non-empty ITER_BVEC one */
static struct page *first_bvec_segment(const struct iov_iter *i,
				       size_t *size, size_t *start,
				       size_t maxsize, unsigned maxpages)
{
	struct page *page;
	size_t skip = i->iov_offset, len;

	len = i->bvec->bv_len - skip;
	if (len > maxsize)
		len = maxsize;
	skip += i->bvec->bv_offset;
	page = i->bvec->bv_page + skip / PAGE_SIZE;
	len += (*start = skip % PAGE_SIZE);
	if (len > maxpages * PAGE_SIZE)
		len = maxpages * PAGE_SIZE;
	*size = len;
	return page;
}

ssize_t iov_iter_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	size_t len;
	int n, res;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;

	if (likely(iter_is_iovec(i))) {
		unsigned long addr;

		addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		res = get_user_pages_fast(addr, n,
				iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
				pages);
		if (unlikely(res <= 0))
			return res;
		return (res == n ? len : res * PAGE_SIZE) - *start;
	}
	if (iov_iter_is_bvec(i)) {
		struct page *page;

		page = first_bvec_segment(i, &len, start, maxsize, maxpages);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		while (n--)
			get_page(*pages++ = page++);
		return len - *start;
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages(i, pages, maxsize, maxpages, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
	return -EFAULT;
}
EXPORT_SYMBOL(iov_iter_get_pages);
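
/*
 * Illustrative use (sketch): direct-I/O style code pins the next run of
 * pages, advances past what it actually got, and drops the references when
 * the I/O completes:
 *
 *	struct page *pages[16];
 *	size_t off;
 *	ssize_t got = iov_iter_get_pages(i, pages, maxsize, 16, &off);
 *
 *	if (got <= 0)
 *		return got;
 *	iov_iter_advance(i, got);
 *	// data begins at offset 'off' within pages[0];
 *	// put_page() each page once the I/O is done
 */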

static struct page **get_pages_array(size_t n)
{
	return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
}

static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;
	unsigned int iter_head, npages;
	ssize_t n;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	n = npages * PAGE_SIZE - *start;
	if (maxsize > n)
		maxsize = n;
	else
		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
	p = get_pages_array(npages);
	if (!p)
		return -ENOMEM;
	n = __pipe_get_pages(i, maxsize, p, iter_head, start);
	if (n > 0)
		*pages = p;
	else
		kvfree(p);
	return n;
}

static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
					   struct page ***pages, size_t maxsize,
					   size_t *_start_offset)
{
	struct page **p;
	unsigned nr, offset;
	pgoff_t index, count;
	size_t size = maxsize, actual;
	loff_t pos;

	if (!size)
		return 0;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	count = 1;
	if (size > PAGE_SIZE - offset) {
		size -= PAGE_SIZE - offset;
		count += size >> PAGE_SHIFT;
		size &= ~PAGE_MASK;
		if (size)
			count++;
	}

	p = get_pages_array(count);
	if (!p)
		return -ENOMEM;
	*pages = p;

	nr = iter_xarray_populate_pages(p, i->xarray, index, count);
	if (nr == 0)
		return 0;

	actual = PAGE_SIZE * nr;
	actual -= offset;
	if (nr == count && size > 0) {
		unsigned last_offset = (nr > 1) ? 0 : offset;
		actual -= PAGE_SIZE - (last_offset + size);
	}
	return actual;
}

ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;
	size_t len;
	int n, res;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;

	if (likely(iter_is_iovec(i))) {
		unsigned long addr;

		addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		p = get_pages_array(n);
		if (!p)
			return -ENOMEM;
		res = get_user_pages_fast(addr, n,
				iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
		if (unlikely(res <= 0)) {
			kvfree(p);
			*pages = NULL;
			return res;
		}
		*pages = p;
		return (res == n ? len : res * PAGE_SIZE) - *start;
	}
	if (iov_iter_is_bvec(i)) {
		struct page *page;

		page = first_bvec_segment(i, &len, start, maxsize, ~0U);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		*pages = p = get_pages_array(n);
		if (!p)
			return -ENOMEM;
		while (n--)
			get_page(*p++ = page++);
		return len - *start;
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages_alloc(i, pages, maxsize, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
	return -EFAULT;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc);

size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	__wsum sum, next;
	sum = *csum;
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off, ({
		next = csum_and_copy_from_user(base, addr + off, len);
		sum = csum_block_add(sum, next, off);
		next ? 0 : len;
	}), ({
		sum = csum_and_memcpy(addr + off, base, len, sum, off);
	})
	)
	*csum = sum;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_from_iter);

size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
			     struct iov_iter *i)
{
	struct csum_state *csstate = _csstate;
	__wsum sum, next;

	if (unlikely(iov_iter_is_discard(i))) {
		WARN_ON(1);	/* for now */
		return 0;
	}

	sum = csum_shift(csstate->csum, csstate->off);
	if (unlikely(iov_iter_is_pipe(i)))
		bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum);
	else iterate_and_advance(i, bytes, base, len, off, ({
		next = csum_and_copy_to_user(addr + off, base, len);
		sum = csum_block_add(sum, next, off);
		next ? 0 : len;
	}), ({
		sum = csum_and_memcpy(base, addr + off, len, sum, off);
	})
	)
	csstate->csum = csum_shift(sum, csstate->off);
	csstate->off += bytes;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_to_iter);

size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
		struct iov_iter *i)
{
#ifdef CONFIG_CRYPTO_HASH
	struct ahash_request *hash = hashp;
	struct scatterlist sg;
	size_t copied;

	copied = copy_to_iter(addr, bytes, i);
	sg_init_one(&sg, addr, copied);
	ahash_request_set_crypt(hash, &sg, NULL, copied);
	crypto_ahash_update(hash);
	return copied;
#else
	return 0;
#endif
}
EXPORT_SYMBOL(hash_and_copy_to_iter);

static int iov_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct iovec *p;
	int npages = 0;

	for (p = i->iov; size; skip = 0, p++) {
		unsigned offs = offset_in_page(p->iov_base + skip);
		size_t len = min(p->iov_len - skip, size);

		if (len) {
			size -= len;
			npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
			if (unlikely(npages > maxpages))
				return maxpages;
		}
	}
	return npages;
}

static int bvec_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct bio_vec *p;
	int npages = 0;

	for (p = i->bvec; size; skip = 0, p++) {
		unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
		size_t len = min(p->bv_len - skip, size);

		size -= len;
		npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
		if (unlikely(npages > maxpages))
			return maxpages;
	}
	return npages;
}

int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
	if (unlikely(!i->count))
		return 0;
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_npages(i, maxpages);
	if (iov_iter_is_bvec(i))
		return bvec_npages(i, maxpages);
	if (iov_iter_is_pipe(i)) {
		unsigned int iter_head;
		int npages;
		size_t off;

		if (!sanity(i))
			return 0;

		data_start(i, &iter_head, &off);
		/* some of this one + all after this one */
		npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
		return min(npages, maxpages);
	}
	if (iov_iter_is_xarray(i)) {
		unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_npages);

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
	*new = *old;
	if (unlikely(iov_iter_is_pipe(new))) {
		WARN_ON(1);
		return NULL;
	}
	if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
		return NULL;
	if (iov_iter_is_bvec(new))
		return new->bvec = kmemdup(new->bvec,
				    new->nr_segs * sizeof(struct bio_vec),
				    flags);
	else
		/* iovec and kvec have identical layout */
		return new->iov = kmemdup(new->iov,
				   new->nr_segs * sizeof(struct iovec),
				   flags);
}
EXPORT_SYMBOL(dup_iter);

static int copy_compat_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	const struct compat_iovec __user *uiov =
		(const struct compat_iovec __user *)uvec;
	int ret = -EFAULT, i;

	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
		return -EFAULT;

	for (i = 0; i < nr_segs; i++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

		/* check for compat_size_t not fitting in compat_ssize_t .. */
		if (len < 0) {
			ret = -EINVAL;
			goto uaccess_end;
		}
		iov[i].iov_base = compat_ptr(buf);
		iov[i].iov_len = len;
	}

	ret = 0;
uaccess_end:
	user_access_end();
	return ret;
}

static int copy_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	unsigned long seg;

	if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
		return -EFAULT;
	for (seg = 0; seg < nr_segs; seg++) {
		if ((ssize_t)iov[seg].iov_len < 0)
			return -EINVAL;
	}

	return 0;
}

struct iovec *iovec_from_user(const struct iovec __user *uvec,
		unsigned long nr_segs, unsigned long fast_segs,
		struct iovec *fast_iov, bool compat)
{
	struct iovec *iov = fast_iov;
	int ret;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument was
	 * less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return iov;
	if (nr_segs > UIO_MAXIOV)
		return ERR_PTR(-EINVAL);
	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!iov)
			return ERR_PTR(-ENOMEM);
	}

	if (compat)
		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
	else
		ret = copy_iovec_from_user(iov, uvec, nr_segs);
	if (ret) {
		if (iov != fast_iov)
			kfree(iov);
		return ERR_PTR(ret);
	}

	return iov;
}

ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i, bool compat)
{
	ssize_t total_len = 0;
	unsigned long seg;
	struct iovec *iov;

	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
	if (IS_ERR(iov)) {
		*iovp = NULL;
		return PTR_ERR(iov);
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL if
	 * an element length is < 0 when cast to ssize_t or if the total length
	 * would overflow the ssize_t return value of the system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	for (seg = 0; seg < nr_segs; seg++) {
		ssize_t len = (ssize_t)iov[seg].iov_len;

		if (!access_ok(iov[seg].iov_base, len)) {
			if (iov != *iovp)
				kfree(iov);
			*iovp = NULL;
			return -EFAULT;
		}

		if (len > MAX_RW_COUNT - total_len) {
			len = MAX_RW_COUNT - total_len;
			iov[seg].iov_len = len;
		}
		total_len += len;
	}

	iov_iter_init(i, type, iov, nr_segs, total_len);
	if (iov == *iovp)
		*iovp = NULL;
	else
		*iovp = iov;
	return total_len;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
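
/*
 * Example (illustrative): the usual readv()-style calling sequence; the
 * consumer function is hypothetical:
 *
 *	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
 *	struct iov_iter iter;
 *	ssize_t ret;
 *
 *	ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
 *			   &iov, &iter);
 *	if (ret < 0)
 *		return ret;
 *	ret = do_my_read(file, &iter);	// hypothetical consumer
 *	kfree(iov);	// always safe: *iovp is NULL if the stack array was used
 */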
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in @fast_iov.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs,
		 struct iovec **iovp, struct iov_iter *i)
{
	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
			      in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);

int import_single_range(int rw, void __user *buf, size_t len,
		 struct iovec *iov, struct iov_iter *i)
{
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	if (unlikely(!access_ok(buf, len)))
		return -EFAULT;

	iov->iov_base = buf;
	iov->iov_len = len;
	iov_iter_init(i, rw, iov, 1, len);
	return 0;
}
EXPORT_SYMBOL(import_single_range);

/**
 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
 *     iov_iter_save_state() was called.
 *
 * @i: &struct iov_iter to restore
 * @state: state to restore from
 *
 * Used after iov_iter_save_state() to restore @i, if operations may have
 * advanced it.
 *
 * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
 */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
			 !iov_iter_is_kvec(i)))
		return;
	i->iov_offset = state->iov_offset;
	i->count = state->count;
	/*
	 * For the *vec iters, nr_segs + iov is constant - if we increment
	 * the vec, then we also decrement the nr_segs count. Hence we don't
	 * need to track both of these, just one is enough and we can deduce
	 * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
	 * size, so we can just increment the iov pointer as they are unionized.
	 * ITER_BVEC _may_ be the same size on some archs, but on others it is
	 * not. Be safe and handle it separately.
	 */
	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
	if (iov_iter_is_bvec(i))
		i->bvec -= state->nr_segs - i->nr_segs;
	else
		i->iov -= state->nr_segs - i->nr_segs;
	i->nr_segs = state->nr_segs;
}
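
/*
 * Example (illustrative; try_send() is hypothetical): snapshot the iterator
 * before an operation that may partially consume it, and roll back so a
 * retry starts from the original position:
 *
 *	struct iov_iter_state state;
 *
 *	iov_iter_save_state(iter, &state);
 *	ret = try_send(sock, iter);
 *	if (ret == -EAGAIN)
 *		iov_iter_restore(iter, &state);
 */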