// SPDX-License-Identifier: GPL-2.0-only
#include <crypto/hash.h>
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <net/checksum.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>

#define PIPE_PARANOIA /* for now */

/* covers ubuf and kbuf alike */
#define iterate_buf(i, n, base, len, off, __p, STEP) {		\
	size_t __maybe_unused off = 0;				\
	len = n;						\
	base = __p + i->iov_offset;				\
	len -= (STEP);						\
	i->iov_offset += len;					\
	n = len;						\
}

/* covers iovec and kvec alike */
#define iterate_iovec(i, n, base, len, off, __p, STEP) {	\
	size_t off = 0;						\
	size_t skip = i->iov_offset;				\
	do {							\
		len = min(n, __p->iov_len - skip);		\
		if (likely(len)) {				\
			base = __p->iov_base + skip;		\
			len -= (STEP);				\
			off += len;				\
			skip += len;				\
			n -= len;				\
			if (skip < __p->iov_len)		\
				break;				\
		}						\
		__p++;						\
		skip = 0;					\
	} while (n);						\
	i->iov_offset = skip;					\
	n = off;						\
}

#define iterate_bvec(i, n, base, len, off, p, STEP) {		\
	size_t off = 0;						\
	unsigned skip = i->iov_offset;				\
	while (n) {						\
		unsigned offset = p->bv_offset + skip;		\
		unsigned left;					\
		void *kaddr = kmap_local_page(p->bv_page +	\
					offset / PAGE_SIZE);	\
		base = kaddr + offset % PAGE_SIZE;		\
		len = min(min(n, (size_t)(p->bv_len - skip)),	\
		     (size_t)(PAGE_SIZE - offset % PAGE_SIZE));	\
		left = (STEP);					\
		kunmap_local(kaddr);				\
		len -= left;					\
		off += len;					\
		skip += len;					\
		if (skip == p->bv_len) {			\
			skip = 0;				\
			p++;					\
		}						\
		n -= len;					\
		if (left)					\
			break;					\
	}							\
	i->iov_offset = skip;					\
	n = off;						\
}

#define iterate_xarray(i, n, base, len, __off, STEP) {		\
	__label__ __out;					\
	size_t __off = 0;					\
	struct folio *folio;					\
	loff_t start = i->xarray_start + i->iov_offset;		\
	pgoff_t index = start / PAGE_SIZE;			\
	XA_STATE(xas, i->xarray, index);			\
								\
	len = PAGE_SIZE - offset_in_page(start);		\
	rcu_read_lock();					\
	xas_for_each(&xas, folio, ULONG_MAX) {			\
		unsigned left;					\
		size_t offset;					\
		if (xas_retry(&xas, folio))			\
			continue;				\
		if (WARN_ON(xa_is_value(folio)))		\
			break;					\
		if (WARN_ON(folio_test_hugetlb(folio)))		\
			break;					\
		offset = offset_in_folio(folio, start + __off);	\
		while (offset < folio_size(folio)) {		\
			base = kmap_local_folio(folio, offset);	\
			len = min(n, len);			\
			left = (STEP);				\
			kunmap_local(base);			\
			len -= left;				\
			__off += len;				\
			n -= len;				\
			if (left || n == 0)			\
				goto __out;			\
			offset += len;				\
			len = PAGE_SIZE;			\
		}						\
	}							\
__out:								\
	rcu_read_unlock();					\
	i->iov_offset += __off;					\
	n = __off;						\
}

#define __iterate_and_advance(i, n, base, len, off, I, K) {	\
	if (unlikely(i->count < n))				\
		n = i->count;					\
	if (likely(n)) {					\
		if (likely(iter_is_ubuf(i))) {			\
			void __user *base;			\
			size_t len;				\
			iterate_buf(i, n, base, len, off,	\
						i->ubuf, (I))	\
		} else if (likely(iter_is_iovec(i))) {		\
			const struct iovec *iov = i->iov;	\
			void __user *base;			\
			size_t len;				\
			iterate_iovec(i, n, base, len, off,	\
						iov, (I))	\
			i->nr_segs -= iov - i->iov;		\
			i->iov = iov;				\
		} else if (iov_iter_is_bvec(i)) {		\
			const struct bio_vec *bvec = i->bvec;	\
			void *base;				\
			size_t len;				\
			iterate_bvec(i, n, base, len, off,	\
						bvec, (K))	\
			i->nr_segs -= bvec - i->bvec;		\
			i->bvec = bvec;				\
		} else if (iov_iter_is_kvec(i)) {		\
			const struct kvec *kvec = i->kvec;	\
			void *base;				\
			size_t len;				\
			iterate_iovec(i, n, base, len, off,	\
						kvec, (K))	\
			i->nr_segs -= kvec - i->kvec;		\
			i->kvec = kvec;				\
		} else if (iov_iter_is_xarray(i)) {		\
			void *base;				\
			size_t len;				\
			iterate_xarray(i, n, base, len, off,	\
							(K))	\
		}						\
		i->count -= n;					\
	}							\
}
#define iterate_and_advance(i, n, base, len, off, I, K) \
	__iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))

static int copyout(void __user *to, const void *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = raw_copy_to_user(to, from, n);
	}
	return n;
}

static int copyin(void *to, const void __user *from, size_t n)
{
	size_t res = n;

	if (should_fail_usercopy())
		return n;
	if (access_ok(from, n)) {
		instrument_copy_from_user_before(to, from, n);
		res = raw_copy_from_user(to, from, n);
		instrument_copy_from_user_after(to, from, n, res);
	}
	return res;
}

static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe,
					   unsigned int slot)
{
	return &pipe->bufs[slot & (pipe->ring_size - 1)];
}

#ifdef PIPE_PARANOIA
static bool sanity(const struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_head = pipe->head;
	unsigned int p_tail = pipe->tail;
	unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
	unsigned int i_head = i->head;
	unsigned int idx;

	if (i->last_offset) {
		struct pipe_buffer *p;
		if (unlikely(p_occupancy == 0))
			goto Bad;	// pipe must be non-empty
		if (unlikely(i_head != p_head - 1))
			goto Bad;	// must be at the last buffer...

		p = pipe_buf(pipe, i_head);
		if (unlikely(p->offset + p->len != abs(i->last_offset)))
			goto Bad;	// ... at the end of segment
	} else {
		if (i_head != p_head)
			goto Bad;	// must be right after the last buffer
	}
	return true;
Bad:
	printk(KERN_ERR "idx = %d, offset = %d\n", i_head, i->last_offset);
	printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
			p_head, p_tail, pipe->ring_size);
	for (idx = 0; idx < pipe->ring_size; idx++)
		printk(KERN_ERR "[%p %p %d %d]\n",
			pipe->bufs[idx].ops,
			pipe->bufs[idx].page,
			pipe->bufs[idx].offset,
			pipe->bufs[idx].len);
	WARN_ON(1);
	return false;
}
#else
#define sanity(i) true
#endif

static struct page *push_anon(struct pipe_inode_info *pipe, unsigned size)
{
	struct page *page = alloc_page(GFP_USER);
	if (page) {
		struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++);
		*buf = (struct pipe_buffer) {
			.ops = &default_pipe_buf_ops,
			.page = page,
			.offset = 0,
			.len = size
		};
	}
	return page;
}

static void push_page(struct pipe_inode_info *pipe, struct page *page,
			unsigned int offset, unsigned int size)
{
	struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++);
	*buf = (struct pipe_buffer) {
		.ops = &page_cache_pipe_buf_ops,
		.page = page,
		.offset = offset,
		.len = size
	};
	get_page(page);
}

static inline int last_offset(const struct pipe_buffer *buf)
{
	if (buf->ops == &default_pipe_buf_ops)
		return buf->len;	// buf->offset is 0 for those
	else
		return -(buf->offset + buf->len);
}

static struct page *append_pipe(struct iov_iter *i, size_t size,
				unsigned int *off)
{
	struct pipe_inode_info *pipe = i->pipe;
	int offset = i->last_offset;
	struct pipe_buffer *buf;
	struct page *page;

	if (offset > 0 && offset < PAGE_SIZE) {
		// some space in the last buffer; add to it
		buf = pipe_buf(pipe, pipe->head - 1);
		size = min_t(size_t, size, PAGE_SIZE - offset);
		buf->len += size;
		i->last_offset += size;
		i->count -= size;
		*off = offset;
		return buf->page;
	}
	// OK, we need a new buffer
	*off = 0;
	size = min_t(size_t, size, PAGE_SIZE);
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		return NULL;
	page = push_anon(pipe, size);
	if (!page)
		return NULL;
	i->head = pipe->head - 1;
	i->last_offset = size;
	i->count -= size;
	return page;
}

static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int head = pipe->head;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	if (offset && i->last_offset == -offset) { // could we merge it?
		struct pipe_buffer *buf = pipe_buf(pipe, head - 1);
		if (buf->page == page) {
			buf->len += bytes;
			i->last_offset -= bytes;
			i->count -= bytes;
			return bytes;
		}
	}
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		return 0;

	push_page(pipe, page, offset, bytes);
	i->last_offset = -(offset + bytes);
	i->head = head;
	i->count -= bytes;
	return bytes;
}

/*
 * fault_in_iov_iter_readable - fault in iov iterator for reading
 * @i: iterator
 * @size: maximum length
 *
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * @size.  For each iovec, fault in each page that constitutes the iovec.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 *
 * Always returns 0 for non-userspace iterators.
 */
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
{
	if (iter_is_ubuf(i)) {
		size_t n = min(size, iov_iter_count(i));
		n -= fault_in_readable(i->ubuf + i->iov_offset, n);
		return size - n;
	} else if (iter_is_iovec(i)) {
		size_t count = min(size, iov_iter_count(i));
		const struct iovec *p;
		size_t skip;

		size -= count;
		for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
			size_t len = min(count, p->iov_len - skip);
			size_t ret;

			if (unlikely(!len))
				continue;
			ret = fault_in_readable(p->iov_base + skip, len);
			count -= len - ret;
			if (ret)
				break;
		}
		return count + size;
	}
	return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_readable);
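
/*
 * Example (illustrative sketch, not part of this file): a write path that
 * copies from a user-backed iterator typically pre-faults the source pages
 * before taking locks that would make handling a later page fault awkward:
 *
 *	if (unlikely(fault_in_iov_iter_readable(from, bytes) == bytes))
 *		return -EFAULT;		// nothing could be faulted in
 *
 * A short but non-zero return only means part of the range is accessible;
 * callers usually just retry the copy with the smaller amount.
 */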

/*
 * fault_in_iov_iter_writeable - fault in iov iterator for writing
 * @i: iterator
 * @size: maximum length
 *
 * Faults in the iterator using get_user_pages(), i.e., without triggering
 * hardware page faults.  This is primarily useful when we already know that
 * some or all of the pages in @i aren't in memory.
 *
 * Returns the number of bytes not faulted in, like copy_to_user() and
 * copy_from_user().
 *
 * Always returns 0 for non-user-space iterators.
 */
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
{
	if (iter_is_ubuf(i)) {
		size_t n = min(size, iov_iter_count(i));
		n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n);
		return size - n;
	} else if (iter_is_iovec(i)) {
		size_t count = min(size, iov_iter_count(i));
		const struct iovec *p;
		size_t skip;

		size -= count;
		for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
			size_t len = min(count, p->iov_len - skip);
			size_t ret;

			if (unlikely(!len))
				continue;
			ret = fault_in_safe_writeable(p->iov_base + skip, len);
			count -= len - ret;
			if (ret)
				break;
		}
		return count + size;
	}
	return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_writeable);

void iov_iter_init(struct iov_iter *i, unsigned int direction,
			const struct iovec *iov, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter) {
		.iter_type = ITER_IOVEC,
		.nofault = false,
		.user_backed = true,
		.data_source = direction,
		.iov = iov,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_init);
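
/*
 * Example (illustrative sketch): wrapping a kernel-resident iovec array that
 * describes user memory for a READ (data flows into the user buffers).  The
 * buffer pointers and lengths below are hypothetical:
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = ubuf0, .iov_len = len0 },
 *		{ .iov_base = ubuf1, .iov_len = len1 },
 *	};
 *	struct iov_iter iter;
 *
 *	iov_iter_init(&iter, READ, iov, 2, len0 + len1);
 *
 * The iovec array itself lives in kernel memory; only the iov_base pointers
 * are user addresses.  For an iovec array coming straight from a syscall,
 * import_iovec() below is the usual entry point.
 */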

// returns the offset in partial buffer (if any)
static inline unsigned int pipe_npages(const struct iov_iter *i, int *npages)
{
	struct pipe_inode_info *pipe = i->pipe;
	int used = pipe->head - pipe->tail;
	int off = i->last_offset;

	*npages = max((int)pipe->max_usage - used, 0);

	if (off > 0 && off < PAGE_SIZE) { // anon and not full
		(*npages)++;
		return off;
	}
	return 0;
}

static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
				struct iov_iter *i)
{
	unsigned int off, chunk;

	if (unlikely(bytes > i->count))
		bytes = i->count;
	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	for (size_t n = bytes; n; n -= chunk) {
		struct page *page = append_pipe(i, n, &off);
		chunk = min_t(size_t, n, PAGE_SIZE - off);
		if (!page)
			return bytes - n;
		memcpy_to_page(page, off, addr, chunk);
		addr += chunk;
	}
	return bytes;
}

static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
			      __wsum sum, size_t off)
{
	__wsum next = csum_partial_copy_nocheck(from, to, len);
	return csum_block_add(sum, next, off);
}

static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
					 struct iov_iter *i, __wsum *sump)
{
	__wsum sum = *sump;
	size_t off = 0;
	unsigned int chunk, r;

	if (unlikely(bytes > i->count))
		bytes = i->count;
	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	while (bytes) {
		struct page *page = append_pipe(i, bytes, &r);
		char *p;

		if (!page)
			break;
		chunk = min_t(size_t, bytes, PAGE_SIZE - r);
		p = kmap_local_page(page);
		sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off);
		kunmap_local(p);
		off += chunk;
		bytes -= chunk;
	}
	*sump = sum;
	return off;
}

size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_pipe_to_iter(addr, bytes, i);
	if (user_backed_iter(i))
		might_fault();
	iterate_and_advance(i, bytes, base, len, off,
		copyout(base, addr + off, len),
		memcpy(base, addr + off, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_to_iter);
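
/*
 * Example (illustrative sketch): pushing a kernel buffer to whatever the
 * iterator describes (user memory, bvec pages, a pipe, ...), via the
 * copy_to_iter() wrapper around this helper.  The return value is the number
 * of bytes actually copied, which may be short for user-backed iterators:
 *
 *	size_t copied = copy_to_iter(kbuf, len, iter);
 *	if (copied != len)
 *		return -EFAULT;		// or fault pages in and retry
 */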

#ifdef CONFIG_ARCH_HAS_COPY_MC
static int copyout_mc(void __user *to, const void *from, size_t n)
{
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = copy_mc_to_user((__force void *) to, from, n);
	}
	return n;
}

static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
				struct iov_iter *i)
{
	size_t xfer = 0;
	unsigned int off, chunk;

	if (unlikely(bytes > i->count))
		bytes = i->count;
	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	while (bytes) {
		struct page *page = append_pipe(i, bytes, &off);
		unsigned long rem;
		char *p;

		if (!page)
			break;
		chunk = min_t(size_t, bytes, PAGE_SIZE - off);
		p = kmap_local_page(page);
		rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
		chunk -= rem;
		kunmap_local(p);
		xfer += chunk;
		bytes -= chunk;
		if (rem) {
			iov_iter_revert(i, rem);
			break;
		}
	}
	return xfer;
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer).  Upon #MC read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and typical _copy_to_iter() are:
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again. Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 *   a short copy.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_mc_pipe_to_iter(addr, bytes, i);
	if (user_backed_iter(i))
		might_fault();
	__iterate_and_advance(i, bytes, base, len, off,
		copyout_mc(base, addr + off, len),
		copy_mc_to_kernel(base, addr + off, len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	if (user_backed_iter(i))
		might_fault();
	iterate_and_advance(i, bytes, base, len, off,
		copyin(addr + off, base, len),
		memcpy(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter);

size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		__copy_from_user_inatomic_nocache(addr + off, base, len),
		memcpy(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache.  It is differentiated from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
 * all iterator types.  The _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		__copy_from_user_flushcache(addr + off, base, len),
		memcpy_flushcache(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif

static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
	struct page *head;
	size_t v = n + offset;

	/*
	 * The general case needs to access the page order in order
	 * to compute the page size.
	 * However, we mostly deal with order-0 pages and thus can
	 * avoid a possible cache line miss for requests that fit all
	 * page orders.
	 */
	if (n <= v && v <= PAGE_SIZE)
		return true;

	head = compound_head(page);
	v += (page - head) << PAGE_SHIFT;

	if (WARN_ON(n > v || v > page_size(head)))
		return false;
	return true;
}

size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t res = 0;
	if (!page_copy_sane(page, offset, bytes))
		return 0;
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_page_to_iter_pipe(page, offset, bytes, i);
	page += offset / PAGE_SIZE; // first subpage
	offset %= PAGE_SIZE;
	while (1) {
		void *kaddr = kmap_local_page(page);
		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
		n = _copy_to_iter(kaddr + offset, n, i);
		kunmap_local(kaddr);
		res += n;
		bytes -= n;
		if (!bytes || !n)
			break;
		offset += n;
		if (offset == PAGE_SIZE) {
			page++;
			offset = 0;
		}
	}
	return res;
}
EXPORT_SYMBOL(copy_page_to_iter);
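
/*
 * Example (illustrative sketch): a read path that already has data in a page
 * cache page and wants to hand part of it to the caller's iterator:
 *
 *	size_t copied = copy_page_to_iter(page, offset_in_page(pos),
 *					  chunk, iter);
 *
 * Unlike copy_to_iter(), the caller does not need to hold a kernel mapping of
 * the page; the helper kmaps each subpage as it goes, and for ITER_PIPE
 * destinations it splices the page in without copying at all.
 */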

size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t res = 0;
	if (!page_copy_sane(page, offset, bytes))
		return 0;
	page += offset / PAGE_SIZE; // first subpage
	offset %= PAGE_SIZE;
	while (1) {
		void *kaddr = kmap_local_page(page);
		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
		n = _copy_from_iter(kaddr + offset, n, i);
		kunmap_local(kaddr);
		res += n;
		bytes -= n;
		if (!bytes || !n)
			break;
		offset += n;
		if (offset == PAGE_SIZE) {
			page++;
			offset = 0;
		}
	}
	return res;
}
EXPORT_SYMBOL(copy_page_from_iter);

static size_t pipe_zero(size_t bytes, struct iov_iter *i)
{
	unsigned int chunk, off;

	if (unlikely(bytes > i->count))
		bytes = i->count;
	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	for (size_t n = bytes; n; n -= chunk) {
		struct page *page = append_pipe(i, n, &off);
		char *p;

		if (!page)
			return bytes - n;
		chunk = min_t(size_t, n, PAGE_SIZE - off);
		p = kmap_local_page(page);
		memset(p + off, 0, chunk);
		kunmap_local(p);
	}
	return bytes;
}

size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return pipe_zero(bytes, i);
	iterate_and_advance(i, bytes, base, len, count,
		clear_user(base, len),
		memset(base, 0, len)
	)

	return bytes;
}
EXPORT_SYMBOL(iov_iter_zero);

size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
				  struct iov_iter *i)
{
	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
	if (!page_copy_sane(page, offset, bytes)) {
		kunmap_atomic(kaddr);
		return 0;
	}
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		kunmap_atomic(kaddr);
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		copyin(p + off, base, len),
		memcpy(p + off, base, len)
	)
	kunmap_atomic(kaddr);
	return bytes;
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);

static void pipe_advance(struct iov_iter *i, size_t size)
{
	struct pipe_inode_info *pipe = i->pipe;
	int off = i->last_offset;

	if (!off && !size) {
		pipe_discard_from(pipe, i->start_head); // discard everything
		return;
	}
	i->count -= size;
	while (1) {
		struct pipe_buffer *buf = pipe_buf(pipe, i->head);
		if (off) /* make it relative to the beginning of buffer */
			size += abs(off) - buf->offset;
		if (size <= buf->len) {
			buf->len = size;
			i->last_offset = last_offset(buf);
			break;
		}
		size -= buf->len;
		i->head++;
		off = 0;
	}
	pipe_discard_from(pipe, i->head + 1); // discard everything past this one
}

static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{
	const struct bio_vec *bvec, *end;

	if (!i->count)
		return;
	i->count -= size;

	size += i->iov_offset;

	for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) {
		if (likely(size < bvec->bv_len))
			break;
		size -= bvec->bv_len;
	}
	i->iov_offset = size;
	i->nr_segs -= bvec - i->bvec;
	i->bvec = bvec;
}

static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
{
	const struct iovec *iov, *end;

	if (!i->count)
		return;
	i->count -= size;

	size += i->iov_offset; // from beginning of current segment
	for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
		if (likely(size < iov->iov_len))
			break;
		size -= iov->iov_len;
	}
	i->iov_offset = size;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
}

void iov_iter_advance(struct iov_iter *i, size_t size)
{
	if (unlikely(i->count < size))
		size = i->count;
	if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
		i->iov_offset += size;
		i->count -= size;
	} else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
		/* iovec and kvec have identical layouts */
		iov_iter_iovec_advance(i, size);
	} else if (iov_iter_is_bvec(i)) {
		iov_iter_bvec_advance(i, size);
	} else if (iov_iter_is_pipe(i)) {
		pipe_advance(i, size);
	} else if (iov_iter_is_discard(i)) {
		i->count -= size;
	}
}
EXPORT_SYMBOL(iov_iter_advance);

void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
	if (!unroll)
		return;
	if (WARN_ON(unroll > MAX_RW_COUNT))
		return;
	i->count += unroll;
	if (unlikely(iov_iter_is_pipe(i))) {
		struct pipe_inode_info *pipe = i->pipe;
		unsigned int head = pipe->head;

		while (head > i->start_head) {
			struct pipe_buffer *b = pipe_buf(pipe, --head);
			if (unroll < b->len) {
				b->len -= unroll;
				i->last_offset = last_offset(b);
				i->head = head;
				return;
			}
			unroll -= b->len;
			pipe_buf_release(pipe, b);
			pipe->head--;
		}
		i->last_offset = 0;
		i->head = head;
		return;
	}
	if (unlikely(iov_iter_is_discard(i)))
		return;
	if (unroll <= i->iov_offset) {
		i->iov_offset -= unroll;
		return;
	}
	unroll -= i->iov_offset;
	if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
		BUG(); /* We should never go beyond the start of the specified
			* range since we might then be straying into pages that
			* aren't pinned.
			*/
	} else if (iov_iter_is_bvec(i)) {
		const struct bio_vec *bvec = i->bvec;
		while (1) {
			size_t n = (--bvec)->bv_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->bvec = bvec;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	} else { /* same logic for iovec and kvec */
		const struct iovec *iov = i->iov;
		while (1) {
			size_t n = (--iov)->iov_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->iov = iov;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	}
}
EXPORT_SYMBOL(iov_iter_revert);
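
/*
 * Example (illustrative sketch): advance and revert are commonly paired when
 * an operation consumed part of the iterator and then had to give up:
 *
 *	size_t copied = copy_from_iter(buf, len, iter);	// advances iter
 *	if (write_failed) {
 *		iov_iter_revert(iter, copied);		// put the bytes back
 *		return -EIO;
 *	}
 *
 * Reverting more than has been consumed since the iterator's starting point
 * is a bug (and BUG()s for ITER_UBUF/ITER_XARRAY above).
 */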

/*
 * Return the count of just the current iov_iter segment.
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
	if (i->nr_segs > 1) {
		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
			return min(i->count, i->iov->iov_len - i->iov_offset);
		if (iov_iter_is_bvec(i))
			return min(i->count, i->bvec->bv_len - i->iov_offset);
	}
	return i->count;
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
			const struct kvec *kvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_KVEC,
		.data_source = direction,
		.kvec = kvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_kvec);

void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
			const struct bio_vec *bvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_BVEC,
		.data_source = direction,
		.bvec = bvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_bvec);
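
/*
 * Example (illustrative sketch): kernel callers whose data already lives in
 * kernel memory or in pages wrap it the same way, e.g. a single kvec for a
 * WRITE (data flows out of the buffer):
 *
 *	struct kvec kv = { .iov_base = kbuf, .iov_len = len };
 *	struct iov_iter iter;
 *
 *	iov_iter_kvec(&iter, WRITE, &kv, 1, len);
 *
 * iov_iter_bvec() is analogous but takes struct bio_vec entries, i.e.
 * page/offset/length triples rather than kernel virtual addresses.
 */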

void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
			struct pipe_inode_info *pipe,
			size_t count)
{
	BUG_ON(direction != READ);
	WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
	*i = (struct iov_iter){
		.iter_type = ITER_PIPE,
		.data_source = false,
		.pipe = pipe,
		.head = pipe->head,
		.start_head = pipe->head,
		.last_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_pipe);

/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
		     struct xarray *xarray, loff_t start, size_t count)
{
	BUG_ON(direction & ~1);
	*i = (struct iov_iter) {
		.iter_type = ITER_XARRAY,
		.data_source = direction,
		.xarray = xarray,
		.xarray_start = start,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_xarray);

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that just discards everything that's written to it.
 * It's only available as a READ iterator.
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
	BUG_ON(direction != READ);
	*i = (struct iov_iter){
		.iter_type = ITER_DISCARD,
		.data_source = false,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_discard);

static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
				   unsigned len_mask)
{
	size_t size = i->count;
	size_t skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->iov[k].iov_len - skip;

		if (len > size)
			len = size;
		if (len & len_mask)
			return false;
		if ((unsigned long)(i->iov[k].iov_base + skip) & addr_mask)
			return false;

		size -= len;
		if (!size)
			break;
	}
	return true;
}

static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
				  unsigned len_mask)
{
	size_t size = i->count;
	unsigned skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->bvec[k].bv_len - skip;

		if (len > size)
			len = size;
		if (len & len_mask)
			return false;
		if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask)
			return false;

		size -= len;
		if (!size)
			break;
	}
	return true;
}

/**
 * iov_iter_is_aligned() - Check if the addresses and lengths of each segment
 *	are aligned to the parameters.
 *
 * @i: &struct iov_iter to check
 * @addr_mask: bit mask to check against the iov element's addresses
 * @len_mask: bit mask to check against the iov element's lengths
 *
 * Return: false if any addresses or lengths intersect with the provided masks
 */
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
			 unsigned len_mask)
{
	if (likely(iter_is_ubuf(i))) {
		if (i->count & len_mask)
			return false;
		if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask)
			return false;
		return true;
	}

	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_iter_aligned_iovec(i, addr_mask, len_mask);

	if (iov_iter_is_bvec(i))
		return iov_iter_aligned_bvec(i, addr_mask, len_mask);

	if (iov_iter_is_pipe(i)) {
		size_t size = i->count;

		if (size & len_mask)
			return false;
		if (size && i->last_offset > 0) {
			if (i->last_offset & addr_mask)
				return false;
		}

		return true;
	}

	if (iov_iter_is_xarray(i)) {
		if (i->count & len_mask)
			return false;
		if ((i->xarray_start + i->iov_offset) & addr_mask)
			return false;
	}

	return true;
}
EXPORT_SYMBOL_GPL(iov_iter_is_aligned);
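
/*
 * Example (illustrative sketch): a filesystem checking whether an iterator is
 * suitable for a direct-I/O fast path that needs, say, block-size alignment
 * of both addresses and lengths:
 *
 *	if (!iov_iter_is_aligned(iter, blocksize - 1, blocksize - 1))
 *		return -EINVAL;		// or fall back to buffered I/O
 *
 * The masks are "bits that must be zero", so blocksize has to be a power of
 * two for this pattern to make sense.
 */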

static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
	unsigned long res = 0;
	size_t size = i->count;
	size_t skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->iov[k].iov_len - skip;
		if (len) {
			res |= (unsigned long)i->iov[k].iov_base + skip;
			if (len > size)
				len = size;
			res |= len;
			size -= len;
			if (!size)
				break;
		}
	}
	return res;
}

static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
{
	unsigned res = 0;
	size_t size = i->count;
	unsigned skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->bvec[k].bv_len - skip;
		res |= (unsigned long)i->bvec[k].bv_offset + skip;
		if (len > size)
			len = size;
		res |= len;
		size -= len;
		if (!size)
			break;
	}
	return res;
}

unsigned long iov_iter_alignment(const struct iov_iter *i)
{
	if (likely(iter_is_ubuf(i))) {
		size_t size = i->count;
		if (size)
			return ((unsigned long)i->ubuf + i->iov_offset) | size;
		return 0;
	}

	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_iter_alignment_iovec(i);

	if (iov_iter_is_bvec(i))
		return iov_iter_alignment_bvec(i);

	if (iov_iter_is_pipe(i)) {
		size_t size = i->count;

		if (size && i->last_offset > 0)
			return size | i->last_offset;
		return size;
	}

	if (iov_iter_is_xarray(i))
		return (i->xarray_start + i->iov_offset) | i->count;

	return 0;
}
EXPORT_SYMBOL(iov_iter_alignment);

unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
	unsigned long res = 0;
	unsigned long v = 0;
	size_t size = i->count;
	unsigned k;

	if (iter_is_ubuf(i))
		return 0;

	if (WARN_ON(!iter_is_iovec(i)))
		return ~0U;

	for (k = 0; k < i->nr_segs; k++) {
		if (i->iov[k].iov_len) {
			unsigned long base = (unsigned long)i->iov[k].iov_base;
			if (v) // if not the first one
				res |= base | v; // this start | previous end
			v = base + i->iov[k].iov_len;
			if (size <= i->iov[k].iov_len)
				break;
			size -= i->iov[k].iov_len;
		}
	}
	return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

static int want_pages_array(struct page ***res, size_t size,
			    size_t start, unsigned int maxpages)
{
	unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE);

	if (count > maxpages)
		count = maxpages;
	WARN_ON(!count);	// caller should've prevented that
	if (!*res) {
		*res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
		if (!*res)
			return 0;
	}
	return count;
}

static ssize_t pipe_get_pages(struct iov_iter *i,
		   struct page ***pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	unsigned int npages, count, off, chunk;
	struct page **p;
	size_t left;

	if (!sanity(i))
		return -EFAULT;

	*start = off = pipe_npages(i, &npages);
	if (!npages)
		return -EFAULT;
	count = want_pages_array(pages, maxsize, off, min(npages, maxpages));
	if (!count)
		return -ENOMEM;
	p = *pages;
	for (npages = 0, left = maxsize ; npages < count; npages++, left -= chunk) {
		struct page *page = append_pipe(i, left, &off);
		if (!page)
			break;
		chunk = min_t(size_t, left, PAGE_SIZE - off);
		get_page(*p++ = page);
	}
	if (!npages)
		return -EFAULT;
	return maxsize - left;
}

static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
					  pgoff_t index, unsigned int nr_pages)
{
	XA_STATE(xas, xa, index);
	struct page *page;
	unsigned int ret = 0;

	rcu_read_lock();
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		if (xas_retry(&xas, page))
			continue;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas))) {
			xas_reset(&xas);
			continue;
		}

		pages[ret] = find_subpage(page, xas.xa_index);
		get_page(pages[ret]);
		if (++ret == nr_pages)
			break;
	}
	rcu_read_unlock();
	return ret;
}

static ssize_t iter_xarray_get_pages(struct iov_iter *i,
				     struct page ***pages, size_t maxsize,
				     unsigned maxpages, size_t *_start_offset)
{
	unsigned nr, offset, count;
	pgoff_t index;
	loff_t pos;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	count = want_pages_array(pages, maxsize, offset, maxpages);
	if (!count)
		return -ENOMEM;
	nr = iter_xarray_populate_pages(*pages, i->xarray, index, count);
	if (nr == 0)
		return 0;

	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
	i->iov_offset += maxsize;
	i->count -= maxsize;
	return maxsize;
}

/* must be done on non-empty ITER_UBUF or ITER_IOVEC one */
static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
{
	size_t skip;
	long k;

	if (iter_is_ubuf(i))
		return (unsigned long)i->ubuf + i->iov_offset;

	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->iov[k].iov_len - skip;

		if (unlikely(!len))
			continue;
		if (*size > len)
			*size = len;
		return (unsigned long)i->iov[k].iov_base + skip;
	}
	BUG(); // if it had been empty, we wouldn't get called
}

/* must be done on non-empty ITER_BVEC one */
static struct page *first_bvec_segment(const struct iov_iter *i,
				       size_t *size, size_t *start)
{
	struct page *page;
	size_t skip = i->iov_offset, len;

	len = i->bvec->bv_len - skip;
	if (*size > len)
		*size = len;
	skip += i->bvec->bv_offset;
	page = i->bvec->bv_page + skip / PAGE_SIZE;
	*start = skip % PAGE_SIZE;
	return page;
}

static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   unsigned int maxpages, size_t *start)
{
	unsigned int n;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;
	if (maxsize > MAX_RW_COUNT)
		maxsize = MAX_RW_COUNT;

	if (likely(user_backed_iter(i))) {
		unsigned int gup_flags = 0;
		unsigned long addr;
		int res;

		if (iov_iter_rw(i) != WRITE)
			gup_flags |= FOLL_WRITE;
		if (i->nofault)
			gup_flags |= FOLL_NOFAULT;

		addr = first_iovec_segment(i, &maxsize);
		*start = addr % PAGE_SIZE;
		addr &= PAGE_MASK;
		n = want_pages_array(pages, maxsize, *start, maxpages);
		if (!n)
			return -ENOMEM;
		res = get_user_pages_fast(addr, n, gup_flags, *pages);
		if (unlikely(res <= 0))
			return res;
		maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start);
		iov_iter_advance(i, maxsize);
		return maxsize;
	}
	if (iov_iter_is_bvec(i)) {
		struct page **p;
		struct page *page;

		page = first_bvec_segment(i, &maxsize, start);
		n = want_pages_array(pages, maxsize, *start, maxpages);
		if (!n)
			return -ENOMEM;
		p = *pages;
		for (int k = 0; k < n; k++)
			get_page(p[k] = page + k);
		maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
		i->count -= maxsize;
		i->iov_offset += maxsize;
		if (i->iov_offset == i->bvec->bv_len) {
			i->iov_offset = 0;
			i->bvec++;
			i->nr_segs--;
		}
		return maxsize;
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages(i, pages, maxsize, maxpages, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
	return -EFAULT;
}

ssize_t iov_iter_get_pages2(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	if (!maxpages)
		return 0;
	BUG_ON(!pages);

	return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start);
}
EXPORT_SYMBOL(iov_iter_get_pages2);

ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	ssize_t len;

	*pages = NULL;

	len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start);
	if (len <= 0) {
		kvfree(*pages);
		*pages = NULL;
	}
	return len;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
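
/*
 * Example (illustrative sketch): pinning the pages behind an iterator, e.g.
 * to build a request for zero-copy direct I/O:
 *
 *	struct page **pages = NULL;
 *	size_t off;
 *	ssize_t got;
 *
 *	got = iov_iter_get_pages_alloc2(iter, &pages, maxsize, &off);
 *	if (got <= 0)
 *		return got ? got : -EFAULT;
 *	// ... use pages[], starting at offset 'off' in the first page ...
 *	// drop the page references and kvfree(pages) when done
 *
 * Note that these "2" variants advance the iterator by the number of bytes
 * returned, so a caller that later fails must iov_iter_revert() to retry.
 */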

size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	__wsum sum, next;
	sum = *csum;
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off, ({
		next = csum_and_copy_from_user(base, addr + off, len);
		sum = csum_block_add(sum, next, off);
		next ? 0 : len;
	}), ({
		sum = csum_and_memcpy(addr + off, base, len, sum, off);
	})
	)
	*csum = sum;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_from_iter);

size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
			     struct iov_iter *i)
{
	struct csum_state *csstate = _csstate;
	__wsum sum, next;

	if (unlikely(iov_iter_is_discard(i))) {
		// can't use csum_memcpy() for that one - data is not copied
		csstate->csum = csum_block_add(csstate->csum,
					       csum_partial(addr, bytes, 0),
					       csstate->off);
		csstate->off += bytes;
		return bytes;
	}

	sum = csum_shift(csstate->csum, csstate->off);
	if (unlikely(iov_iter_is_pipe(i)))
		bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum);
	else iterate_and_advance(i, bytes, base, len, off, ({
		next = csum_and_copy_to_user(addr + off, base, len);
		sum = csum_block_add(sum, next, off);
		next ? 0 : len;
	}), ({
		sum = csum_and_memcpy(base, addr + off, len, sum, off);
	})
	)
	csstate->csum = csum_shift(sum, csstate->off);
	csstate->off += bytes;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_to_iter);

size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
		struct iov_iter *i)
{
#ifdef CONFIG_CRYPTO_HASH
	struct ahash_request *hash = hashp;
	struct scatterlist sg;
	size_t copied;

	copied = copy_to_iter(addr, bytes, i);
	sg_init_one(&sg, addr, copied);
	ahash_request_set_crypt(hash, &sg, NULL, copied);
	crypto_ahash_update(hash);
	return copied;
#else
	return 0;
#endif
}
EXPORT_SYMBOL(hash_and_copy_to_iter);

static int iov_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct iovec *p;
	int npages = 0;

	for (p = i->iov; size; skip = 0, p++) {
		unsigned offs = offset_in_page(p->iov_base + skip);
		size_t len = min(p->iov_len - skip, size);

		if (len) {
			size -= len;
			npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
			if (unlikely(npages > maxpages))
				return maxpages;
		}
	}
	return npages;
}

static int bvec_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct bio_vec *p;
	int npages = 0;

	for (p = i->bvec; size; skip = 0, p++) {
		unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
		size_t len = min(p->bv_len - skip, size);

		size -= len;
		npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
		if (unlikely(npages > maxpages))
			return maxpages;
	}
	return npages;
}

int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
	if (unlikely(!i->count))
		return 0;
	if (likely(iter_is_ubuf(i))) {
		unsigned offs = offset_in_page(i->ubuf + i->iov_offset);
		int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_npages(i, maxpages);
	if (iov_iter_is_bvec(i))
		return bvec_npages(i, maxpages);
	if (iov_iter_is_pipe(i)) {
		int npages;

		if (!sanity(i))
			return 0;

		pipe_npages(i, &npages);
		return min(npages, maxpages);
	}
	if (iov_iter_is_xarray(i)) {
		unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_npages);
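
/*
 * Example (illustrative sketch): sizing an allocation before mapping the
 * iterator's pages, e.g. when building a bio for block I/O:
 *
 *	int nr_pages = iov_iter_npages(iter, BIO_MAX_VECS);
 *	struct bio *bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL);
 *
 * ('bdev' and 'opf' are whatever the caller already has in hand.)  The second
 * argument caps the answer, so a caller never learns more than the number of
 * vectors it is prepared to handle.
 */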

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
	*new = *old;
	if (unlikely(iov_iter_is_pipe(new))) {
		WARN_ON(1);
		return NULL;
	}
	if (iov_iter_is_bvec(new))
		return new->bvec = kmemdup(new->bvec,
				    new->nr_segs * sizeof(struct bio_vec),
				    flags);
	else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
		/* iovec and kvec have identical layout */
		return new->iov = kmemdup(new->iov,
				   new->nr_segs * sizeof(struct iovec),
				   flags);
	return NULL;
}
EXPORT_SYMBOL(dup_iter);

static int copy_compat_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	const struct compat_iovec __user *uiov =
		(const struct compat_iovec __user *)uvec;
	int ret = -EFAULT, i;

	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
		return -EFAULT;

	for (i = 0; i < nr_segs; i++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

		/* check for compat_size_t not fitting in compat_ssize_t .. */
		if (len < 0) {
			ret = -EINVAL;
			goto uaccess_end;
		}
		iov[i].iov_base = compat_ptr(buf);
		iov[i].iov_len = len;
	}

	ret = 0;
uaccess_end:
	user_access_end();
	return ret;
}

static int copy_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	unsigned long seg;

	if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
		return -EFAULT;
	for (seg = 0; seg < nr_segs; seg++) {
		if ((ssize_t)iov[seg].iov_len < 0)
			return -EINVAL;
	}

	return 0;
}

struct iovec *iovec_from_user(const struct iovec __user *uvec,
		unsigned long nr_segs, unsigned long fast_segs,
		struct iovec *fast_iov, bool compat)
{
	struct iovec *iov = fast_iov;
	int ret;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument was
	 * less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return iov;
	if (nr_segs > UIO_MAXIOV)
		return ERR_PTR(-EINVAL);
	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!iov)
			return ERR_PTR(-ENOMEM);
	}

	if (compat)
		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
	else
		ret = copy_iovec_from_user(iov, uvec, nr_segs);
	if (ret) {
		if (iov != fast_iov)
			kfree(iov);
		return ERR_PTR(ret);
	}

	return iov;
}

ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i, bool compat)
{
	ssize_t total_len = 0;
	unsigned long seg;
	struct iovec *iov;

	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
	if (IS_ERR(iov)) {
		*iovp = NULL;
		return PTR_ERR(iov);
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL if
	 * an element length is < 0 when cast to ssize_t or if the total length
	 * would overflow the ssize_t return value of the system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	for (seg = 0; seg < nr_segs; seg++) {
		ssize_t len = (ssize_t)iov[seg].iov_len;

		if (!access_ok(iov[seg].iov_base, len)) {
			if (iov != *iovp)
				kfree(iov);
			*iovp = NULL;
			return -EFAULT;
		}

		if (len > MAX_RW_COUNT - total_len) {
			len = MAX_RW_COUNT - total_len;
			iov[seg].iov_len = len;
		}
		total_len += len;
	}

	iov_iter_init(i, type, iov, nr_segs, total_len);
	if (iov == *iovp)
		*iovp = NULL;
	else
		*iovp = iov;
	return total_len;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in the array *@iovp points to.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs,
		 struct iovec **iovp, struct iov_iter *i)
{
	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
			      in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);
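
/*
 * Example (illustrative sketch): the usual pattern in a readv()/writev()
 * style syscall, where a small on-stack array covers the common case:
 *
 *	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
 *	struct iov_iter iter;
 *	ssize_t ret;
 *
 *	ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
 *			   &iov, &iter);
 *	if (ret < 0)
 *		return ret;
 *	// ... do the I/O through &iter ...
 *	kfree(iov);	// safe whether or not the stack array was used
 */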

int import_single_range(int rw, void __user *buf, size_t len,
		 struct iovec *iov, struct iov_iter *i)
{
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	if (unlikely(!access_ok(buf, len)))
		return -EFAULT;

	iov->iov_base = buf;
	iov->iov_len = len;
	iov_iter_init(i, rw, iov, 1, len);
	return 0;
}
EXPORT_SYMBOL(import_single_range);

/**
 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
 *     iov_iter_save_state() was called.
 *
 * @i: &struct iov_iter to restore
 * @state: state to restore from
 *
 * Used after iov_iter_save_state() to restore @i, if operations may
 * have advanced it.
 *
 * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
 */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
			 !iov_iter_is_kvec(i) && !iter_is_ubuf(i))
		return;
	i->iov_offset = state->iov_offset;
	i->count = state->count;
	if (iter_is_ubuf(i))
		return;
	/*
	 * For the *vec iters, nr_segs + iov is constant - if we increment
	 * the vec, then we also decrement the nr_segs count. Hence we don't
	 * need to track both of these, just one is enough and we can deduct
	 * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
	 * size, so we can just increment the iov pointer as they are unionized.
	 * ITER_BVEC _may_ be the same size on some archs, but on others it is
	 * not. Be safe and handle it separately.
	 */
	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
	if (iov_iter_is_bvec(i))
		i->bvec -= state->nr_segs - i->nr_segs;
	else
		i->iov -= state->nr_segs - i->nr_segs;
	i->nr_segs = state->nr_segs;
}
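
/*
 * Example (illustrative sketch): pairing with iov_iter_save_state() around an
 * operation that may have to be retried from the same starting point:
 *
 *	struct iov_iter_state state;
 *
 *	iov_iter_save_state(iter, &state);
 *	ret = do_the_io(iter);			// may partially advance iter
 *	if (ret == -EAGAIN) {
 *		iov_iter_restore(iter, &state);	// rewind for the retry
 *		ret = do_the_io_blocking(iter);
 *	}
 *
 * do_the_io*() are placeholders; io_uring's read/write retry paths use this
 * save/restore pattern.
 */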