// SPDX-License-Identifier: GPL-2.0-only
#include <crypto/hash.h>
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <net/checksum.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>

#define PIPE_PARANOIA /* for now */

/* covers ubuf and kbuf alike */
#define iterate_buf(i, n, base, len, off, __p, STEP) {	\
	size_t __maybe_unused off = 0;	\
	len = n;	\
	base = __p + i->iov_offset;	\
	len -= (STEP);	\
	i->iov_offset += len;	\
	n = len;	\
}

/* covers iovec and kvec alike */
#define iterate_iovec(i, n, base, len, off, __p, STEP) {	\
	size_t off = 0;	\
	size_t skip = i->iov_offset;	\
	do {	\
		len = min(n, __p->iov_len - skip);	\
		if (likely(len)) {	\
			base = __p->iov_base + skip;	\
			len -= (STEP);	\
			off += len;	\
			skip += len;	\
			n -= len;	\
			if (skip < __p->iov_len)	\
				break;	\
		}	\
		__p++;	\
		skip = 0;	\
	} while (n);	\
	i->iov_offset = skip;	\
	n = off;	\
}

#define iterate_bvec(i, n, base, len, off, p, STEP) {	\
	size_t off = 0;	\
	unsigned skip = i->iov_offset;	\
	while (n) {	\
		unsigned offset = p->bv_offset + skip;	\
		unsigned left;	\
		void *kaddr = kmap_local_page(p->bv_page +	\
					offset / PAGE_SIZE);	\
		base = kaddr + offset % PAGE_SIZE;	\
		len = min(min(n, (size_t)(p->bv_len - skip)),	\
		     (size_t)(PAGE_SIZE - offset % PAGE_SIZE));	\
		left = (STEP);	\
		kunmap_local(kaddr);	\
		len -= left;	\
		off += len;	\
		skip += len;	\
		if (skip == p->bv_len) {	\
			skip = 0;	\
			p++;	\
		}	\
		n -= len;	\
		if (left)	\
			break;	\
	}	\
	i->iov_offset = skip;	\
	n = off;	\
}

#define iterate_xarray(i, n, base, len, __off, STEP) {	\
	__label__ __out;	\
	size_t __off = 0;	\
	struct folio *folio;	\
	loff_t start = i->xarray_start + i->iov_offset;	\
	pgoff_t index = start / PAGE_SIZE;	\
	XA_STATE(xas, i->xarray, index);	\
	\
	len = PAGE_SIZE - offset_in_page(start);	\
	rcu_read_lock();	\
	xas_for_each(&xas, folio, ULONG_MAX) {	\
		unsigned left;	\
		size_t offset;	\
		if (xas_retry(&xas, folio))	\
			continue;	\
		if (WARN_ON(xa_is_value(folio)))	\
			break;	\
		if (WARN_ON(folio_test_hugetlb(folio)))	\
			break;	\
		offset = offset_in_folio(folio, start + __off);	\
		while (offset < folio_size(folio)) {	\
			base = kmap_local_folio(folio, offset);	\
			len = min(n, len);	\
			left = (STEP);	\
			kunmap_local(base);	\
			len -= left;	\
			__off += len;	\
			n -= len;	\
			if (left || n == 0)	\
				goto __out;	\
			offset += len;	\
			len = PAGE_SIZE;	\
		}	\
	}	\
__out:	\
	rcu_read_unlock();	\
	i->iov_offset += __off;	\
	n = __off;	\
}

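/*
 * Illustrative note (not part of the original file): the iterate_* helpers
 * above are dispatched by iterate_and_advance() below.  A caller such as
 * _copy_to_iter() supplies two statement expressions - one for user-backed
 * memory and one for kernel-backed memory.  The user-side step must evaluate
 * to the number of bytes it failed to process, while iterate_and_advance()
 * treats the kernel-side step as infallible (its result is forced to 0):
 *
 *	iterate_and_advance(i, bytes, base, len, off,
 *		copyout(base, addr + off, len),
 *		memcpy(base, addr + off, len)
 *	)
 */
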
#define __iterate_and_advance(i, n, base, len, off, I, K) {	\
	if (unlikely(i->count < n))	\
		n = i->count;	\
	if (likely(n)) {	\
		if (likely(iter_is_ubuf(i))) {	\
			void __user *base;	\
			size_t len;	\
			iterate_buf(i, n, base, len, off,	\
						i->ubuf, (I))	\
		} else if (likely(iter_is_iovec(i))) {	\
			const struct iovec *iov = i->iov;	\
			void __user *base;	\
			size_t len;	\
			iterate_iovec(i, n, base, len, off,	\
						iov, (I))	\
			i->nr_segs -= iov - i->iov;	\
			i->iov = iov;	\
		} else if (iov_iter_is_bvec(i)) {	\
			const struct bio_vec *bvec = i->bvec;	\
			void *base;	\
			size_t len;	\
			iterate_bvec(i, n, base, len, off,	\
						bvec, (K))	\
			i->nr_segs -= bvec - i->bvec;	\
			i->bvec = bvec;	\
		} else if (iov_iter_is_kvec(i)) {	\
			const struct kvec *kvec = i->kvec;	\
			void *base;	\
			size_t len;	\
			iterate_iovec(i, n, base, len, off,	\
						kvec, (K))	\
			i->nr_segs -= kvec - i->kvec;	\
			i->kvec = kvec;	\
		} else if (iov_iter_is_xarray(i)) {	\
			void *base;	\
			size_t len;	\
			iterate_xarray(i, n, base, len, off,	\
							(K))	\
		}	\
		i->count -= n;	\
	}	\
}
#define iterate_and_advance(i, n, base, len, off, I, K) \
	__iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))

static int copyout(void __user *to, const void *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = raw_copy_to_user(to, from, n);
	}
	return n;
}

static int copyin(void *to, const void __user *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(from, n)) {
		instrument_copy_from_user(to, from, n);
		n = raw_copy_from_user(to, from, n);
	}
	return n;
}

static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe,
					   unsigned int slot)
{
	return &pipe->bufs[slot & (pipe->ring_size - 1)];
}

#ifdef PIPE_PARANOIA
static bool sanity(const struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_head = pipe->head;
	unsigned int p_tail = pipe->tail;
	unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
	unsigned int i_head = i->head;
	unsigned int idx;

	if (i->iov_offset) {
		struct pipe_buffer *p;
		if (unlikely(p_occupancy == 0))
			goto Bad;	// pipe must be non-empty
		if (unlikely(i_head != p_head - 1))
			goto Bad;	// must be at the last buffer...

		p = pipe_buf(pipe, i_head);
		if (unlikely(p->offset + p->len != i->iov_offset))
			goto Bad;	// ... at the end of segment
	} else {
		if (i_head != p_head)
			goto Bad;	// must be right after the last buffer
	}
	return true;
Bad:
	printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
	printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
			p_head, p_tail, pipe->ring_size);
	for (idx = 0; idx < pipe->ring_size; idx++)
		printk(KERN_ERR "[%p %p %d %d]\n",
			pipe->bufs[idx].ops,
			pipe->bufs[idx].page,
			pipe->bufs[idx].offset,
			pipe->bufs[idx].len);
	WARN_ON(1);
	return false;
}
#else
#define sanity(i) true
#endif

static struct page *push_anon(struct pipe_inode_info *pipe, unsigned size)
{
	struct page *page = alloc_page(GFP_USER);
	if (page) {
		struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++);
		*buf = (struct pipe_buffer) {
			.ops = &default_pipe_buf_ops,
			.page = page,
			.offset = 0,
			.len = size
		};
	}
	return page;
}

static void push_page(struct pipe_inode_info *pipe, struct page *page,
			unsigned int offset, unsigned int size)
{
	struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++);
	*buf = (struct pipe_buffer) {
		.ops = &page_cache_pipe_buf_ops,
		.page = page,
		.offset = offset,
		.len = size
	};
	get_page(page);
}

static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int head = pipe->head;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	if (offset && i->iov_offset == offset) { // could we merge it?
		struct pipe_buffer *buf = pipe_buf(pipe, head - 1);
		if (buf->page == page) {
			buf->len += bytes;
			i->iov_offset += bytes;
			i->count -= bytes;
			return bytes;
		}
	}
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		return 0;

	push_page(pipe, page, offset, bytes);
	i->iov_offset = offset + bytes;
	i->head = head;
	i->count -= bytes;
	return bytes;
}

/*
 * fault_in_iov_iter_readable - fault in iov iterator for reading
 * @i: iterator
 * @size: maximum length
 *
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * @size.  For each iovec, fault in each page that constitutes the iovec.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 *
 * Always returns 0 for non-userspace iterators.
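 *
 * Example (illustrative sketch, not part of the original kernel-doc): a
 * buffered-write style caller typically prefaults the source before copying
 * under a page lock, and treats "nothing could be faulted in" as -EFAULT:
 *
 *	if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes))
 *		return -EFAULT;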
 */
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
{
	if (iter_is_ubuf(i)) {
		size_t n = min(size, iov_iter_count(i));
		n -= fault_in_readable(i->ubuf + i->iov_offset, n);
		return size - n;
	} else if (iter_is_iovec(i)) {
		size_t count = min(size, iov_iter_count(i));
		const struct iovec *p;
		size_t skip;

		size -= count;
		for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
			size_t len = min(count, p->iov_len - skip);
			size_t ret;

			if (unlikely(!len))
				continue;
			ret = fault_in_readable(p->iov_base + skip, len);
			count -= len - ret;
			if (ret)
				break;
		}
		return count + size;
	}
	return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_readable);

/*
 * fault_in_iov_iter_writeable - fault in iov iterator for writing
 * @i: iterator
 * @size: maximum length
 *
 * Faults in the iterator using get_user_pages(), i.e., without triggering
 * hardware page faults.  This is primarily useful when we already know that
 * some or all of the pages in @i aren't in memory.
 *
 * Returns the number of bytes not faulted in, like copy_to_user() and
 * copy_from_user().
 *
 * Always returns 0 for non-user-space iterators.
 */
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
{
	if (iter_is_ubuf(i)) {
		size_t n = min(size, iov_iter_count(i));
		n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n);
		return size - n;
	} else if (iter_is_iovec(i)) {
		size_t count = min(size, iov_iter_count(i));
		const struct iovec *p;
		size_t skip;

		size -= count;
		for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
			size_t len = min(count, p->iov_len - skip);
			size_t ret;

			if (unlikely(!len))
				continue;
			ret = fault_in_safe_writeable(p->iov_base + skip, len);
			count -= len - ret;
			if (ret)
				break;
		}
		return count + size;
	}
	return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_writeable);

void iov_iter_init(struct iov_iter *i, unsigned int direction,
			const struct iovec *iov, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter) {
		.iter_type = ITER_IOVEC,
		.nofault = false,
		.user_backed = true,
		.data_source = direction,
		.iov = iov,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_init);

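/*
 * Example (illustrative sketch, not part of the original file): a read-side
 * caller initialises an ITER_IOVEC over a user-supplied iovec array and then
 * copies kernel data into it; "uvec", "nr_segs", "total_len", "kbuf" and
 * "kbuf_len" are hypothetical caller-provided values:
 *
 *	struct iov_iter iter;
 *	size_t copied;
 *
 *	iov_iter_init(&iter, READ, uvec, nr_segs, total_len);
 *	copied = copy_to_iter(kbuf, kbuf_len, &iter);
 */
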
static inline bool allocated(struct pipe_buffer *buf)
{
	return buf->ops == &default_pipe_buf_ops;
}

static inline void data_start(const struct iov_iter *i,
			      unsigned int *iter_headp, size_t *offp)
{
	unsigned int iter_head = i->head;
	size_t off = i->iov_offset;

	if (off && (!allocated(pipe_buf(i->pipe, iter_head)) ||
		    off == PAGE_SIZE)) {
		iter_head++;
		off = 0;
	}
	*iter_headp = iter_head;
	*offp = off;
}

static size_t push_pipe(struct iov_iter *i, size_t size,
			int *iter_headp, size_t *offp)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int iter_head;
	size_t off;
	ssize_t left;

	if (unlikely(size > i->count))
		size = i->count;
	if (unlikely(!size))
		return 0;

	left = size;
	data_start(i, &iter_head, &off);
	*iter_headp = iter_head;
	*offp = off;
	if (off) {
		struct pipe_buffer *buf = pipe_buf(pipe, iter_head);

		left -= PAGE_SIZE - off;
		if (left <= 0) {
			buf->len += size;
			return size;
		}
		buf->len = PAGE_SIZE;
	}
	while (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
		struct page *page = push_anon(pipe,
					      min_t(ssize_t, left, PAGE_SIZE));
		if (!page)
			break;

		left -= PAGE_SIZE;
		if (left <= 0)
			return size;
	}
	return size - left;
}

static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
				struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &off);
	if (unlikely(!n))
		return 0;
	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
		i->head = i_head;
		i->iov_offset = off + chunk;
		n -= chunk;
		addr += chunk;
		off = 0;
		i_head++;
	} while (n);
	i->count -= bytes;
	return bytes;
}

static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
			      __wsum sum, size_t off)
{
	__wsum next = csum_partial_copy_nocheck(from, to, len);
	return csum_block_add(sum, next, off);
}

static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
					 struct iov_iter *i, __wsum *sump)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	__wsum sum = *sump;
	size_t off = 0;
	unsigned int i_head;
	size_t r;

	if (!sanity(i))
		return 0;

	bytes = push_pipe(i, bytes, &i_head, &r);
	while (bytes) {
		size_t chunk = min_t(size_t, bytes, PAGE_SIZE - r);
		char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
		sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off);
		kunmap_local(p);
		i->head = i_head;
		i->iov_offset = r + chunk;
		bytes -= chunk;
		off += chunk;
		r = 0;
		i_head++;
	}
	*sump = sum;
	i->count -= off;
	return off;
}

size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_pipe_to_iter(addr, bytes, i);
	if (user_backed_iter(i))
		might_fault();
	iterate_and_advance(i, bytes, base, len, off,
		copyout(base, addr + off, len),
		memcpy(base, addr + off, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_to_iter);

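/*
 * Illustrative note (not part of the original file): most callers reach
 * _copy_to_iter() through the copy_to_iter() inline wrapper in
 * <linux/uio.h>, which applies the usual hardened-usercopy size check
 * first, e.g.:
 *
 *	if (copy_to_iter(&resp, sizeof(resp), iter) != sizeof(resp))
 *		return -EFAULT;
 */
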
#ifdef CONFIG_ARCH_HAS_COPY_MC
static int copyout_mc(void __user *to, const void *from, size_t n)
{
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = copy_mc_to_user((__force void *) to, from, n);
	}
	return n;
}

static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
				struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	unsigned int valid = pipe->head;
	size_t n, off, xfer = 0;

	if (!sanity(i))
		return 0;

	n = push_pipe(i, bytes, &i_head, &off);
	while (n) {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
		unsigned long rem;
		rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
		chunk -= rem;
		kunmap_local(p);
		if (chunk) {
			i->head = i_head;
			i->iov_offset = off + chunk;
			xfer += chunk;
			valid = i_head + 1;
		}
		if (rem) {
			pipe->bufs[i_head & p_mask].len -= rem;
			pipe_discard_from(pipe, valid);
			break;
		}
		n -= chunk;
		off = 0;
		i_head++;
	}
	i->count -= xfer;
	return xfer;
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer).  Upon #MC, read(2) aborts and returns EIO or the number
 * of bytes successfully copied.
 *
 * The main differences between this and typical _copy_to_iter() are:
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again.  Re-triggering machine
 *   checks is potentially fatal, so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 *   a short copy.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_mc_pipe_to_iter(addr, bytes, i);
	if (user_backed_iter(i))
		might_fault();
	__iterate_and_advance(i, bytes, base, len, off,
		copyout_mc(base, addr + off, len),
		copy_mc_to_kernel(base, addr + off, len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	if (user_backed_iter(i))
		might_fault();
	iterate_and_advance(i, bytes, base, len, off,
		copyin(addr + off, base, len),
		memcpy(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter);

size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		__copy_from_user_inatomic_nocache(addr + off, base, len),
		memcpy(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache.  It is differentiated from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
 * all iterator types.  The _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
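 *
 * Example (illustrative sketch, not part of the original kernel-doc): a
 * pmem/dax style ->copy_from_iter() handler could forward to the checked
 * copy_from_iter_flushcache() wrapper:
 *
 *	return copy_from_iter_flushcache(pmem_addr, bytes, i);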
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		__copy_from_user_flushcache(addr + off, base, len),
		memcpy_flushcache(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif

static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
	struct page *head;
	size_t v = n + offset;

	/*
	 * The general case needs to access the page order in order
	 * to compute the page size.
	 * However, we mostly deal with order-0 pages and thus can
	 * avoid a possible cache line miss for requests that fit all
	 * page orders.
	 */
	if (n <= v && v <= PAGE_SIZE)
		return true;

	head = compound_head(page);
	v += (page - head) << PAGE_SHIFT;

	if (likely(n <= v && v <= (page_size(head))))
		return true;
	WARN_ON(1);
	return false;
}

static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		return copy_page_to_iter_pipe(page, offset, bytes, i);
	} else {
		void *kaddr = kmap_local_page(page);
		size_t wanted = _copy_to_iter(kaddr + offset, bytes, i);
		kunmap_local(kaddr);
		return wanted;
	}
}

size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t res = 0;
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;
	page += offset / PAGE_SIZE; // first subpage
	offset %= PAGE_SIZE;
	while (1) {
		size_t n = __copy_page_to_iter(page, offset,
				min(bytes, (size_t)PAGE_SIZE - offset), i);
		res += n;
		bytes -= n;
		if (!bytes || !n)
			break;
		offset += n;
		if (offset == PAGE_SIZE) {
			page++;
			offset = 0;
		}
	}
	return res;
}
EXPORT_SYMBOL(copy_page_to_iter);

size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	if (page_copy_sane(page, offset, bytes)) {
		void *kaddr = kmap_local_page(page);
		size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
		kunmap_local(kaddr);
		return wanted;
	}
	return 0;
}
EXPORT_SYMBOL(copy_page_from_iter);

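/*
 * Example (illustrative sketch, not part of the original file): a filemap
 * style read path copies each page into the destination iterator and treats
 * a short copy as a fault; "page", "offset" and "bytes" are hypothetical:
 *
 *	copied = copy_page_to_iter(page, offset, bytes, iter);
 *	if (copied < bytes)
 *		return -EFAULT;
 */
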
static size_t pipe_zero(size_t bytes, struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &off);
	if (unlikely(!n))
		return 0;

	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		char *p = kmap_local_page(pipe->bufs[i_head & p_mask].page);
		memset(p + off, 0, chunk);
		kunmap_local(p);
		i->head = i_head;
		i->iov_offset = off + chunk;
		n -= chunk;
		off = 0;
		i_head++;
	} while (n);
	i->count -= bytes;
	return bytes;
}

size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return pipe_zero(bytes, i);
	iterate_and_advance(i, bytes, base, len, count,
		clear_user(base, len),
		memset(base, 0, len)
	)

	return bytes;
}
EXPORT_SYMBOL(iov_iter_zero);

size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
				  struct iov_iter *i)
{
	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
	if (unlikely(!page_copy_sane(page, offset, bytes))) {
		kunmap_atomic(kaddr);
		return 0;
	}
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		kunmap_atomic(kaddr);
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		copyin(p + off, base, len),
		memcpy(p + off, base, len)
	)
	kunmap_atomic(kaddr);
	return bytes;
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);

static inline void pipe_truncate(struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_tail = pipe->tail;
	unsigned int p_head = pipe->head;
	unsigned int p_mask = pipe->ring_size - 1;

	if (!pipe_empty(p_head, p_tail)) {
		struct pipe_buffer *buf;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset;

		if (off) {
			buf = &pipe->bufs[i_head & p_mask];
			buf->len = off - buf->offset;
			i_head++;
		}
		while (p_head != i_head) {
			p_head--;
			pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
		}

		pipe->head = p_head;
	}
}

static void pipe_advance(struct iov_iter *i, size_t size)
{
	struct pipe_inode_info *pipe = i->pipe;
	if (size) {
		struct pipe_buffer *buf;
		unsigned int p_mask = pipe->ring_size - 1;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset, left = size;

		if (off) /* make it relative to the beginning of buffer */
			left += off - pipe->bufs[i_head & p_mask].offset;
		while (1) {
			buf = &pipe->bufs[i_head & p_mask];
			if (left <= buf->len)
				break;
			left -= buf->len;
			i_head++;
		}
		i->head = i_head;
		i->iov_offset = buf->offset + left;
	}
	i->count -= size;
	/* ... and discard everything past that point */
	pipe_truncate(i);
}

static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{
	const struct bio_vec *bvec, *end;

	if (!i->count)
		return;
	i->count -= size;

	size += i->iov_offset;

	for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) {
		if (likely(size < bvec->bv_len))
			break;
		size -= bvec->bv_len;
	}
	i->iov_offset = size;
	i->nr_segs -= bvec - i->bvec;
	i->bvec = bvec;
}

static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
{
	const struct iovec *iov, *end;

	if (!i->count)
		return;
	i->count -= size;

	size += i->iov_offset; // from beginning of current segment
	for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
		if (likely(size < iov->iov_len))
			break;
		size -= iov->iov_len;
	}
	i->iov_offset = size;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
}

void iov_iter_advance(struct iov_iter *i, size_t size)
{
	if (unlikely(i->count < size))
		size = i->count;
	if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
		i->iov_offset += size;
		i->count -= size;
	} else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
		/* iovec and kvec have identical layouts */
		iov_iter_iovec_advance(i, size);
	} else if (iov_iter_is_bvec(i)) {
		iov_iter_bvec_advance(i, size);
	} else if (iov_iter_is_pipe(i)) {
		pipe_advance(i, size);
	} else if (iov_iter_is_discard(i)) {
		i->count -= size;
	}
}
EXPORT_SYMBOL(iov_iter_advance);

void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
	if (!unroll)
		return;
	if (WARN_ON(unroll > MAX_RW_COUNT))
		return;
	i->count += unroll;
	if (unlikely(iov_iter_is_pipe(i))) {
		struct pipe_inode_info *pipe = i->pipe;
		unsigned int p_mask = pipe->ring_size - 1;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset;
		while (1) {
			struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
			size_t n = off - b->offset;
			if (unroll < n) {
				off -= unroll;
				break;
			}
			unroll -= n;
			if (!unroll && i_head == i->start_head) {
				off = 0;
				break;
			}
			i_head--;
			b = &pipe->bufs[i_head & p_mask];
			off = b->offset + b->len;
		}
		i->iov_offset = off;
		i->head = i_head;
		pipe_truncate(i);
		return;
	}
	if (unlikely(iov_iter_is_discard(i)))
		return;
	if (unroll <= i->iov_offset) {
		i->iov_offset -= unroll;
		return;
	}
	unroll -= i->iov_offset;
	if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
		BUG(); /* We should never go beyond the start of the specified
			* range since we might then be straying into pages that
			* aren't pinned.
			*/
	} else if (iov_iter_is_bvec(i)) {
		const struct bio_vec *bvec = i->bvec;
		while (1) {
			size_t n = (--bvec)->bv_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->bvec = bvec;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	} else { /* same logics for iovec and kvec */
		const struct iovec *iov = i->iov;
		while (1) {
			size_t n = (--iov)->iov_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->iov = iov;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	}
}
EXPORT_SYMBOL(iov_iter_revert);

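/*
 * Example (illustrative sketch, not part of the original file): the common
 * pattern is that a copy helper advances the iterator and the caller then
 * reverts whatever the lower layer did not actually consume:
 *
 *	copied = copy_from_iter(buf, len, iter);
 *	written = submit_buffer(buf, copied);	// hypothetical consumer
 *	if (written < copied)
 *		iov_iter_revert(iter, copied - written);
 */
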
/*
 * Return the count of just the current iov_iter segment.
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
	if (i->nr_segs > 1) {
		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
			return min(i->count, i->iov->iov_len - i->iov_offset);
		if (iov_iter_is_bvec(i))
			return min(i->count, i->bvec->bv_len - i->iov_offset);
	}
	return i->count;
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
			const struct kvec *kvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_KVEC,
		.data_source = direction,
		.kvec = kvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_kvec);

void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
			const struct bio_vec *bvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_BVEC,
		.data_source = direction,
		.bvec = bvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_bvec);

void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
			struct pipe_inode_info *pipe,
			size_t count)
{
	BUG_ON(direction != READ);
	WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
	*i = (struct iov_iter){
		.iter_type = ITER_PIPE,
		.data_source = false,
		.pipe = pipe,
		.head = pipe->head,
		.start_head = pipe->head,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_pipe);

/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
		     struct xarray *xarray, loff_t start, size_t count)
{
	BUG_ON(direction & ~1);
	*i = (struct iov_iter) {
		.iter_type = ITER_XARRAY,
		.data_source = direction,
		.xarray = xarray,
		.xarray_start = start,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_xarray);

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that just discards everything that's written to it.
 * It's only available as a READ iterator.
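 *
 * Example (illustrative sketch, not part of the original kernel-doc): a
 * caller can drain and throw away @count bytes from a ->read_iter() style
 * producer by handing it a discard iterator:
 *
 *	iov_iter_discard(&iter, READ, count);
 *	ret = call_read_iter(file, &kiocb, &iter);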
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
	BUG_ON(direction != READ);
	*i = (struct iov_iter){
		.iter_type = ITER_DISCARD,
		.data_source = false,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_discard);

static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
				   unsigned len_mask)
{
	size_t size = i->count;
	size_t skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->iov[k].iov_len - skip;

		if (len > size)
			len = size;
		if (len & len_mask)
			return false;
		if ((unsigned long)(i->iov[k].iov_base + skip) & addr_mask)
			return false;

		size -= len;
		if (!size)
			break;
	}
	return true;
}

static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
				  unsigned len_mask)
{
	size_t size = i->count;
	unsigned skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->bvec[k].bv_len - skip;

		if (len > size)
			len = size;
		if (len & len_mask)
			return false;
		if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask)
			return false;

		size -= len;
		if (!size)
			break;
	}
	return true;
}

/**
 * iov_iter_is_aligned() - Check if the addresses and lengths of each segment
 *	are aligned to the parameters.
 *
 * @i: &struct iov_iter to check
 * @addr_mask: bit mask to check against the iov element's addresses
 * @len_mask: bit mask to check against the iov element's lengths
 *
 * Return: false if any addresses or lengths intersect with the provided masks
 */
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
			 unsigned len_mask)
{
	if (likely(iter_is_ubuf(i))) {
		if (i->count & len_mask)
			return false;
		if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask)
			return false;
		return true;
	}

	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_iter_aligned_iovec(i, addr_mask, len_mask);

	if (iov_iter_is_bvec(i))
		return iov_iter_aligned_bvec(i, addr_mask, len_mask);

	if (iov_iter_is_pipe(i)) {
		unsigned int p_mask = i->pipe->ring_size - 1;
		size_t size = i->count;

		if (size & len_mask)
			return false;
		if (size && allocated(&i->pipe->bufs[i->head & p_mask])) {
			if (i->iov_offset & addr_mask)
				return false;
		}

		return true;
	}

	if (iov_iter_is_xarray(i)) {
		if (i->count & len_mask)
			return false;
		if ((i->xarray_start + i->iov_offset) & addr_mask)
			return false;
	}

	return true;
}
EXPORT_SYMBOL_GPL(iov_iter_is_aligned);

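/*
 * Example (illustrative sketch, not part of the original file): a direct-I/O
 * path might use iov_iter_is_aligned() to reject requests whose segments do
 * not satisfy the device's address and length alignment masks:
 *
 *	if (!iov_iter_is_aligned(iter, bdev_dma_alignment(bdev),
 *				 bdev_logical_block_size(bdev) - 1))
 *		return -EINVAL;
 */
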
static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
	unsigned long res = 0;
	size_t size = i->count;
	size_t skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->iov[k].iov_len - skip;
		if (len) {
			res |= (unsigned long)i->iov[k].iov_base + skip;
			if (len > size)
				len = size;
			res |= len;
			size -= len;
			if (!size)
				break;
		}
	}
	return res;
}

static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
{
	unsigned res = 0;
	size_t size = i->count;
	unsigned skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->bvec[k].bv_len - skip;
		res |= (unsigned long)i->bvec[k].bv_offset + skip;
		if (len > size)
			len = size;
		res |= len;
		size -= len;
		if (!size)
			break;
	}
	return res;
}

unsigned long iov_iter_alignment(const struct iov_iter *i)
{
	if (likely(iter_is_ubuf(i))) {
		size_t size = i->count;
		if (size)
			return ((unsigned long)i->ubuf + i->iov_offset) | size;
		return 0;
	}

	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_iter_alignment_iovec(i);

	if (iov_iter_is_bvec(i))
		return iov_iter_alignment_bvec(i);

	if (iov_iter_is_pipe(i)) {
		size_t size = i->count;

		if (size && i->iov_offset && allocated(pipe_buf(i->pipe, i->head)))
			return size | i->iov_offset;
		return size;
	}

	if (iov_iter_is_xarray(i))
		return (i->xarray_start + i->iov_offset) | i->count;

	return 0;
}
EXPORT_SYMBOL(iov_iter_alignment);

unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
	unsigned long res = 0;
	unsigned long v = 0;
	size_t size = i->count;
	unsigned k;

	if (iter_is_ubuf(i))
		return 0;

	if (WARN_ON(!iter_is_iovec(i)))
		return ~0U;

	for (k = 0; k < i->nr_segs; k++) {
		if (i->iov[k].iov_len) {
			unsigned long base = (unsigned long)i->iov[k].iov_base;
			if (v) // if not the first one
				res |= base | v; // this start | previous end
			v = base + i->iov[k].iov_len;
			if (size <= i->iov[k].iov_len)
				break;
			size -= i->iov[k].iov_len;
		}
	}
	return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

static inline ssize_t __pipe_get_pages(struct iov_iter *i,
				size_t maxsize,
				struct page **pages,
				int iter_head,
				size_t *start)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	ssize_t n = push_pipe(i, maxsize, &iter_head, start);
	if (!n)
		return -EFAULT;

	maxsize = n;
	n += *start;
	while (n > 0) {
		get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
		iter_head++;
		n -= PAGE_SIZE;
	}

	return maxsize;
}

static ssize_t pipe_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	unsigned int iter_head, npages;
	size_t capacity;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	capacity = min(npages, maxpages) * PAGE_SIZE - *start;

	return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
}

static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
					  pgoff_t index, unsigned int nr_pages)
{
	XA_STATE(xas, xa, index);
	struct page *page;
	unsigned int ret = 0;

	rcu_read_lock();
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		if (xas_retry(&xas, page))
			continue;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas))) {
			xas_reset(&xas);
			continue;
		}

		pages[ret] = find_subpage(page, xas.xa_index);
		get_page(pages[ret]);
		if (++ret == nr_pages)
			break;
	}
	rcu_read_unlock();
	return ret;
}

static ssize_t iter_xarray_get_pages(struct iov_iter *i,
				     struct page **pages, size_t maxsize,
				     unsigned maxpages, size_t *_start_offset)
{
	unsigned nr, offset;
	pgoff_t index, count;
	size_t size = maxsize;
	loff_t pos;

	if (!size || !maxpages)
		return 0;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	count = 1;
	if (size > PAGE_SIZE - offset) {
		size -= PAGE_SIZE - offset;
		count += size >> PAGE_SHIFT;
		size &= ~PAGE_MASK;
		if (size)
			count++;
	}

	if (count > maxpages)
		count = maxpages;

	nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
	if (nr == 0)
		return 0;

	return min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
}

/* must be done on non-empty ITER_UBUF or ITER_IOVEC one */
static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
{
	size_t skip;
	long k;

	if (iter_is_ubuf(i))
		return (unsigned long)i->ubuf + i->iov_offset;

	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->iov[k].iov_len - skip;

		if (unlikely(!len))
			continue;
		if (*size > len)
			*size = len;
		return (unsigned long)i->iov[k].iov_base + skip;
	}
	BUG(); // if it had been empty, we wouldn't get called
}

/* must be done on non-empty ITER_BVEC one */
static struct page *first_bvec_segment(const struct iov_iter *i,
				       size_t *size, size_t *start)
{
	struct page *page;
	size_t skip = i->iov_offset, len;

	len = i->bvec->bv_len - skip;
	if (*size > len)
		*size = len;
	skip += i->bvec->bv_offset;
	page = i->bvec->bv_page + skip / PAGE_SIZE;
	*start = skip % PAGE_SIZE;
	return page;
}

ssize_t iov_iter_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	int n, res;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;
	if (maxsize > MAX_RW_COUNT)
		maxsize = MAX_RW_COUNT;

	if (likely(user_backed_iter(i))) {
		unsigned int gup_flags = 0;
		unsigned long addr;

		if (iov_iter_rw(i) != WRITE)
			gup_flags |= FOLL_WRITE;
		if (i->nofault)
			gup_flags |= FOLL_NOFAULT;

		addr = first_iovec_segment(i, &maxsize);
		*start = addr % PAGE_SIZE;
		addr &= PAGE_MASK;
		n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
		if (n > maxpages)
			n = maxpages;
		res = get_user_pages_fast(addr, n, gup_flags, pages);
		if (unlikely(res <= 0))
			return res;
		return min_t(size_t, maxsize, res * PAGE_SIZE - *start);
	}
	if (iov_iter_is_bvec(i)) {
		struct page *page;

		page = first_bvec_segment(i, &maxsize, start);
		n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
		if (n > maxpages)
			n = maxpages;
		for (int k = 0; k < n; k++)
			get_page(*pages++ = page++);
		return min_t(size_t, maxsize, n * PAGE_SIZE - *start);
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages(i, pages, maxsize, maxpages, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
	return -EFAULT;
}
EXPORT_SYMBOL(iov_iter_get_pages);

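/*
 * Example (illustrative sketch, not part of the original file): zero-copy
 * callers grab references on the backing pages a chunk at a time, advance
 * the iterator by the number of bytes actually mapped, and later drop the
 * page references with put_page():
 *
 *	ssize_t got = iov_iter_get_pages(iter, pages, maxsize,
 *					 ARRAY_SIZE(pages), &off);
 *	if (got <= 0)
 *		return got;
 *	iov_iter_advance(iter, got);
 */
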
static struct page **get_pages_array(size_t n)
{
	return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
}

static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;
	unsigned int iter_head, npages;
	ssize_t n;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	n = npages * PAGE_SIZE - *start;
	if (maxsize > n)
		maxsize = n;
	else
		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
	p = get_pages_array(npages);
	if (!p)
		return -ENOMEM;
	n = __pipe_get_pages(i, maxsize, p, iter_head, start);
	if (n > 0)
		*pages = p;
	else
		kvfree(p);
	return n;
}

static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
					   struct page ***pages, size_t maxsize,
					   size_t *_start_offset)
{
	struct page **p;
	unsigned nr, offset;
	pgoff_t index, count;
	size_t size = maxsize;
	loff_t pos;

	if (!size)
		return 0;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	count = 1;
	if (size > PAGE_SIZE - offset) {
		size -= PAGE_SIZE - offset;
		count += size >> PAGE_SHIFT;
		size &= ~PAGE_MASK;
		if (size)
			count++;
	}

	p = get_pages_array(count);
	if (!p)
		return -ENOMEM;
	*pages = p;

	nr = iter_xarray_populate_pages(p, i->xarray, index, count);
	if (nr == 0)
		return 0;

	return min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
}

ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;
	int n, res;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;
	if (maxsize > MAX_RW_COUNT)
		maxsize = MAX_RW_COUNT;

	if (likely(user_backed_iter(i))) {
		unsigned int gup_flags = 0;
		unsigned long addr;

		if (iov_iter_rw(i) != WRITE)
			gup_flags |= FOLL_WRITE;
		if (i->nofault)
			gup_flags |= FOLL_NOFAULT;

		addr = first_iovec_segment(i, &maxsize);
		*start = addr % PAGE_SIZE;
		addr &= PAGE_MASK;
		n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
		p = get_pages_array(n);
		if (!p)
			return -ENOMEM;
		res = get_user_pages_fast(addr, n, gup_flags, p);
		if (unlikely(res <= 0)) {
			kvfree(p);
			*pages = NULL;
			return res;
		}
		*pages = p;
		return min_t(size_t, maxsize, res * PAGE_SIZE - *start);
	}
	if (iov_iter_is_bvec(i)) {
		struct page *page;

		page = first_bvec_segment(i, &maxsize, start);
		n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
		*pages = p = get_pages_array(n);
		if (!p)
			return -ENOMEM;
		for (int k = 0; k < n; k++)
			get_page(*p++ = page++);
		return min_t(size_t, maxsize, n * PAGE_SIZE - *start);
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages_alloc(i, pages, maxsize, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
	return -EFAULT;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc);

size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	__wsum sum, next;
	sum = *csum;
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off, ({
		next = csum_and_copy_from_user(base, addr + off, len);
		sum = csum_block_add(sum, next, off);
		next ? 0 : len;
	}), ({
		sum = csum_and_memcpy(addr + off, base, len, sum, off);
	})
	)
	*csum = sum;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_from_iter);

size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
			     struct iov_iter *i)
{
	struct csum_state *csstate = _csstate;
	__wsum sum, next;

	if (unlikely(iov_iter_is_discard(i))) {
		WARN_ON(1);	/* for now */
		return 0;
	}

	sum = csum_shift(csstate->csum, csstate->off);
	if (unlikely(iov_iter_is_pipe(i)))
		bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum);
	else iterate_and_advance(i, bytes, base, len, off, ({
		next = csum_and_copy_to_user(addr + off, base, len);
		sum = csum_block_add(sum, next, off);
		next ? 0 : len;
	}), ({
		sum = csum_and_memcpy(base, addr + off, len, sum, off);
	})
	)
	csstate->csum = csum_shift(sum, csstate->off);
	csstate->off += bytes;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_to_iter);

size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
		struct iov_iter *i)
{
#ifdef CONFIG_CRYPTO_HASH
	struct ahash_request *hash = hashp;
	struct scatterlist sg;
	size_t copied;

	copied = copy_to_iter(addr, bytes, i);
	sg_init_one(&sg, addr, copied);
	ahash_request_set_crypt(hash, &sg, NULL, copied);
	crypto_ahash_update(hash);
	return copied;
#else
	return 0;
#endif
}
EXPORT_SYMBOL(hash_and_copy_to_iter);

static int iov_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct iovec *p;
	int npages = 0;

	for (p = i->iov; size; skip = 0, p++) {
		unsigned offs = offset_in_page(p->iov_base + skip);
		size_t len = min(p->iov_len - skip, size);

		if (len) {
			size -= len;
			npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
			if (unlikely(npages > maxpages))
				return maxpages;
		}
	}
	return npages;
}

static int bvec_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct bio_vec *p;
	int npages = 0;

	for (p = i->bvec; size; skip = 0, p++) {
		unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
		size_t len = min(p->bv_len - skip, size);

		size -= len;
		npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
		if (unlikely(npages > maxpages))
			return maxpages;
	}
	return npages;
}

int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
	if (unlikely(!i->count))
		return 0;
	if (likely(iter_is_ubuf(i))) {
		unsigned offs = offset_in_page(i->ubuf + i->iov_offset);
		int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_npages(i, maxpages);
	if (iov_iter_is_bvec(i))
		return bvec_npages(i, maxpages);
	if (iov_iter_is_pipe(i)) {
		unsigned int iter_head;
		int npages;
		size_t off;

		if (!sanity(i))
			return 0;

		data_start(i, &iter_head, &off);
		/* some of this one + all after this one */
		npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
		return min(npages, maxpages);
	}
	if (iov_iter_is_xarray(i)) {
		unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_npages);

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
	*new = *old;
	if (unlikely(iov_iter_is_pipe(new))) {
		WARN_ON(1);
		return NULL;
	}
	if (iov_iter_is_bvec(new))
		return new->bvec = kmemdup(new->bvec,
				    new->nr_segs * sizeof(struct bio_vec),
				    flags);
	else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
		/* iovec and kvec have identical layout */
		return new->iov = kmemdup(new->iov,
				   new->nr_segs * sizeof(struct iovec),
				   flags);
	return NULL;
}
EXPORT_SYMBOL(dup_iter);

static int copy_compat_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	const struct compat_iovec __user *uiov =
		(const struct compat_iovec __user *)uvec;
	int ret = -EFAULT, i;

	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
		return -EFAULT;

	for (i = 0; i < nr_segs; i++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

		/* check for compat_size_t not fitting in compat_ssize_t .. */
		if (len < 0) {
			ret = -EINVAL;
			goto uaccess_end;
		}
		iov[i].iov_base = compat_ptr(buf);
		iov[i].iov_len = len;
	}

	ret = 0;
uaccess_end:
	user_access_end();
	return ret;
}

static int copy_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	unsigned long seg;

	if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
		return -EFAULT;
	for (seg = 0; seg < nr_segs; seg++) {
		if ((ssize_t)iov[seg].iov_len < 0)
			return -EINVAL;
	}

	return 0;
}

struct iovec *iovec_from_user(const struct iovec __user *uvec,
		unsigned long nr_segs, unsigned long fast_segs,
		struct iovec *fast_iov, bool compat)
{
	struct iovec *iov = fast_iov;
	int ret;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument was
	 * less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return iov;
	if (nr_segs > UIO_MAXIOV)
		return ERR_PTR(-EINVAL);
	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!iov)
			return ERR_PTR(-ENOMEM);
	}

	if (compat)
		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
	else
		ret = copy_iovec_from_user(iov, uvec, nr_segs);
	if (ret) {
		if (iov != fast_iov)
			kfree(iov);
		return ERR_PTR(ret);
	}

	return iov;
}

ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i, bool compat)
{
	ssize_t total_len = 0;
	unsigned long seg;
	struct iovec *iov;

	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
	if (IS_ERR(iov)) {
		*iovp = NULL;
		return PTR_ERR(iov);
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL if
	 * an element length is < 0 when cast to ssize_t or if the total length
	 * would overflow the ssize_t return value of the system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	for (seg = 0; seg < nr_segs; seg++) {
		ssize_t len = (ssize_t)iov[seg].iov_len;

		if (!access_ok(iov[seg].iov_base, len)) {
			if (iov != *iovp)
				kfree(iov);
			*iovp = NULL;
			return -EFAULT;
		}

		if (len > MAX_RW_COUNT - total_len) {
			len = MAX_RW_COUNT - total_len;
			iov[seg].iov_len = len;
		}
		total_len += len;
	}

	iov_iter_init(i, type, iov, nr_segs, total_len);
	if (iov == *iovp)
		*iovp = NULL;
	else
		*iovp = iov;
	return total_len;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in *@iovp.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
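 *
 * Example (illustrative sketch, not part of the original kernel-doc): the
 * common syscall pattern is an on-stack fast array plus an unconditional
 * kfree() once the iterator has been consumed:
 *
 *	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
 *	struct iov_iter iter;
 *	ssize_t ret;
 *
 *	ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
 *			   &iov, &iter);
 *	if (ret < 0)
 *		return ret;
 *	// ... use &iter, then ...
 *	kfree(iov);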
1970 * 1971 * Return: Negative error code on error, bytes imported on success 1972 */ 1973 ssize_t import_iovec(int type, const struct iovec __user *uvec, 1974 unsigned nr_segs, unsigned fast_segs, 1975 struct iovec **iovp, struct iov_iter *i) 1976 { 1977 return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, 1978 in_compat_syscall()); 1979 } 1980 EXPORT_SYMBOL(import_iovec); 1981 1982 int import_single_range(int rw, void __user *buf, size_t len, 1983 struct iovec *iov, struct iov_iter *i) 1984 { 1985 if (len > MAX_RW_COUNT) 1986 len = MAX_RW_COUNT; 1987 if (unlikely(!access_ok(buf, len))) 1988 return -EFAULT; 1989 1990 iov->iov_base = buf; 1991 iov->iov_len = len; 1992 iov_iter_init(i, rw, iov, 1, len); 1993 return 0; 1994 } 1995 EXPORT_SYMBOL(import_single_range); 1996 1997 /** 1998 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when 1999 * iov_iter_save_state() was called. 2000 * 2001 * @i: &struct iov_iter to restore 2002 * @state: state to restore from 2003 * 2004 * Used after iov_iter_save_state() to bring restore @i, if operations may 2005 * have advanced it. 2006 * 2007 * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC 2008 */ 2009 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) 2010 { 2011 if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) && 2012 !iov_iter_is_kvec(i) && !iter_is_ubuf(i)) 2013 return; 2014 i->iov_offset = state->iov_offset; 2015 i->count = state->count; 2016 if (iter_is_ubuf(i)) 2017 return; 2018 /* 2019 * For the *vec iters, nr_segs + iov is constant - if we increment 2020 * the vec, then we also decrement the nr_segs count. Hence we don't 2021 * need to track both of these, just one is enough and we can deduct 2022 * the other from that. ITER_KVEC and ITER_IOVEC are the same struct 2023 * size, so we can just increment the iov pointer as they are unionzed. 2024 * ITER_BVEC _may_ be the same size on some archs, but on others it is 2025 * not. Be safe and handle it separately. 2026 */ 2027 BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); 2028 if (iov_iter_is_bvec(i)) 2029 i->bvec -= state->nr_segs - i->nr_segs; 2030 else 2031 i->iov -= state->nr_segs - i->nr_segs; 2032 i->nr_segs = state->nr_segs; 2033 } 2034