1 // SPDX-License-Identifier: GPL-2.0-only 2 #include <crypto/hash.h> 3 #include <linux/export.h> 4 #include <linux/bvec.h> 5 #include <linux/fault-inject-usercopy.h> 6 #include <linux/uio.h> 7 #include <linux/pagemap.h> 8 #include <linux/highmem.h> 9 #include <linux/slab.h> 10 #include <linux/vmalloc.h> 11 #include <linux/splice.h> 12 #include <linux/compat.h> 13 #include <net/checksum.h> 14 #include <linux/scatterlist.h> 15 #include <linux/instrumented.h> 16 17 #define PIPE_PARANOIA /* for now */ 18 19 /* covers ubuf and kbuf alike */ 20 #define iterate_buf(i, n, base, len, off, __p, STEP) { \ 21 size_t __maybe_unused off = 0; \ 22 len = n; \ 23 base = __p + i->iov_offset; \ 24 len -= (STEP); \ 25 i->iov_offset += len; \ 26 n = len; \ 27 } 28 29 /* covers iovec and kvec alike */ 30 #define iterate_iovec(i, n, base, len, off, __p, STEP) { \ 31 size_t off = 0; \ 32 size_t skip = i->iov_offset; \ 33 do { \ 34 len = min(n, __p->iov_len - skip); \ 35 if (likely(len)) { \ 36 base = __p->iov_base + skip; \ 37 len -= (STEP); \ 38 off += len; \ 39 skip += len; \ 40 n -= len; \ 41 if (skip < __p->iov_len) \ 42 break; \ 43 } \ 44 __p++; \ 45 skip = 0; \ 46 } while (n); \ 47 i->iov_offset = skip; \ 48 n = off; \ 49 } 50 51 #define iterate_bvec(i, n, base, len, off, p, STEP) { \ 52 size_t off = 0; \ 53 unsigned skip = i->iov_offset; \ 54 while (n) { \ 55 unsigned offset = p->bv_offset + skip; \ 56 unsigned left; \ 57 void *kaddr = kmap_local_page(p->bv_page + \ 58 offset / PAGE_SIZE); \ 59 base = kaddr + offset % PAGE_SIZE; \ 60 len = min(min(n, (size_t)(p->bv_len - skip)), \ 61 (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \ 62 left = (STEP); \ 63 kunmap_local(kaddr); \ 64 len -= left; \ 65 off += len; \ 66 skip += len; \ 67 if (skip == p->bv_len) { \ 68 skip = 0; \ 69 p++; \ 70 } \ 71 n -= len; \ 72 if (left) \ 73 break; \ 74 } \ 75 i->iov_offset = skip; \ 76 n = off; \ 77 } 78 79 #define iterate_xarray(i, n, base, len, __off, STEP) { \ 80 __label__ __out; \ 81 size_t __off = 0; \ 82 struct folio *folio; \ 83 loff_t start = i->xarray_start + i->iov_offset; \ 84 pgoff_t index = start / PAGE_SIZE; \ 85 XA_STATE(xas, i->xarray, index); \ 86 \ 87 len = PAGE_SIZE - offset_in_page(start); \ 88 rcu_read_lock(); \ 89 xas_for_each(&xas, folio, ULONG_MAX) { \ 90 unsigned left; \ 91 size_t offset; \ 92 if (xas_retry(&xas, folio)) \ 93 continue; \ 94 if (WARN_ON(xa_is_value(folio))) \ 95 break; \ 96 if (WARN_ON(folio_test_hugetlb(folio))) \ 97 break; \ 98 offset = offset_in_folio(folio, start + __off); \ 99 while (offset < folio_size(folio)) { \ 100 base = kmap_local_folio(folio, offset); \ 101 len = min(n, len); \ 102 left = (STEP); \ 103 kunmap_local(base); \ 104 len -= left; \ 105 __off += len; \ 106 n -= len; \ 107 if (left || n == 0) \ 108 goto __out; \ 109 offset += len; \ 110 len = PAGE_SIZE; \ 111 } \ 112 } \ 113 __out: \ 114 rcu_read_unlock(); \ 115 i->iov_offset += __off; \ 116 n = __off; \ 117 } 118 119 #define __iterate_and_advance(i, n, base, len, off, I, K) { \ 120 if (unlikely(i->count < n)) \ 121 n = i->count; \ 122 if (likely(n)) { \ 123 if (likely(iter_is_ubuf(i))) { \ 124 void __user *base; \ 125 size_t len; \ 126 iterate_buf(i, n, base, len, off, \ 127 i->ubuf, (I)) \ 128 } else if (likely(iter_is_iovec(i))) { \ 129 const struct iovec *iov = i->iov; \ 130 void __user *base; \ 131 size_t len; \ 132 iterate_iovec(i, n, base, len, off, \ 133 iov, (I)) \ 134 i->nr_segs -= iov - i->iov; \ 135 i->iov = iov; \ 136 } else if (iov_iter_is_bvec(i)) { \ 137 const struct bio_vec *bvec = i->bvec; \ 138 
void *base; \ 139 size_t len; \ 140 iterate_bvec(i, n, base, len, off, \ 141 bvec, (K)) \ 142 i->nr_segs -= bvec - i->bvec; \ 143 i->bvec = bvec; \ 144 } else if (iov_iter_is_kvec(i)) { \ 145 const struct kvec *kvec = i->kvec; \ 146 void *base; \ 147 size_t len; \ 148 iterate_iovec(i, n, base, len, off, \ 149 kvec, (K)) \ 150 i->nr_segs -= kvec - i->kvec; \ 151 i->kvec = kvec; \ 152 } else if (iov_iter_is_xarray(i)) { \ 153 void *base; \ 154 size_t len; \ 155 iterate_xarray(i, n, base, len, off, \ 156 (K)) \ 157 } \ 158 i->count -= n; \ 159 } \ 160 } 161 #define iterate_and_advance(i, n, base, len, off, I, K) \ 162 __iterate_and_advance(i, n, base, len, off, I, ((void)(K),0)) 163 164 static int copyout(void __user *to, const void *from, size_t n) 165 { 166 if (should_fail_usercopy()) 167 return n; 168 if (access_ok(to, n)) { 169 instrument_copy_to_user(to, from, n); 170 n = raw_copy_to_user(to, from, n); 171 } 172 return n; 173 } 174 175 static int copyout_nofault(void __user *to, const void *from, size_t n) 176 { 177 long res; 178 179 if (should_fail_usercopy()) 180 return n; 181 182 res = copy_to_user_nofault(to, from, n); 183 184 return res < 0 ? n : res; 185 } 186 187 static int copyin(void *to, const void __user *from, size_t n) 188 { 189 size_t res = n; 190 191 if (should_fail_usercopy()) 192 return n; 193 if (access_ok(from, n)) { 194 instrument_copy_from_user_before(to, from, n); 195 res = raw_copy_from_user(to, from, n); 196 instrument_copy_from_user_after(to, from, n, res); 197 } 198 return res; 199 } 200 201 #ifdef PIPE_PARANOIA 202 static bool sanity(const struct iov_iter *i) 203 { 204 struct pipe_inode_info *pipe = i->pipe; 205 unsigned int p_head = pipe->head; 206 unsigned int p_tail = pipe->tail; 207 unsigned int p_occupancy = pipe_occupancy(p_head, p_tail); 208 unsigned int i_head = i->head; 209 unsigned int idx; 210 211 if (i->last_offset) { 212 struct pipe_buffer *p; 213 if (unlikely(p_occupancy == 0)) 214 goto Bad; // pipe must be non-empty 215 if (unlikely(i_head != p_head - 1)) 216 goto Bad; // must be at the last buffer... 217 218 p = pipe_buf(pipe, i_head); 219 if (unlikely(p->offset + p->len != abs(i->last_offset))) 220 goto Bad; // ... 
at the end of segment 221 } else { 222 if (i_head != p_head) 223 goto Bad; // must be right after the last buffer 224 } 225 return true; 226 Bad: 227 printk(KERN_ERR "idx = %d, offset = %d\n", i_head, i->last_offset); 228 printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n", 229 p_head, p_tail, pipe->ring_size); 230 for (idx = 0; idx < pipe->ring_size; idx++) 231 printk(KERN_ERR "[%p %p %d %d]\n", 232 pipe->bufs[idx].ops, 233 pipe->bufs[idx].page, 234 pipe->bufs[idx].offset, 235 pipe->bufs[idx].len); 236 WARN_ON(1); 237 return false; 238 } 239 #else 240 #define sanity(i) true 241 #endif 242 243 static struct page *push_anon(struct pipe_inode_info *pipe, unsigned size) 244 { 245 struct page *page = alloc_page(GFP_USER); 246 if (page) { 247 struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++); 248 *buf = (struct pipe_buffer) { 249 .ops = &default_pipe_buf_ops, 250 .page = page, 251 .offset = 0, 252 .len = size 253 }; 254 } 255 return page; 256 } 257 258 static void push_page(struct pipe_inode_info *pipe, struct page *page, 259 unsigned int offset, unsigned int size) 260 { 261 struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++); 262 *buf = (struct pipe_buffer) { 263 .ops = &page_cache_pipe_buf_ops, 264 .page = page, 265 .offset = offset, 266 .len = size 267 }; 268 get_page(page); 269 } 270 271 static inline int last_offset(const struct pipe_buffer *buf) 272 { 273 if (buf->ops == &default_pipe_buf_ops) 274 return buf->len; // buf->offset is 0 for those 275 else 276 return -(buf->offset + buf->len); 277 } 278 279 static struct page *append_pipe(struct iov_iter *i, size_t size, 280 unsigned int *off) 281 { 282 struct pipe_inode_info *pipe = i->pipe; 283 int offset = i->last_offset; 284 struct pipe_buffer *buf; 285 struct page *page; 286 287 if (offset > 0 && offset < PAGE_SIZE) { 288 // some space in the last buffer; add to it 289 buf = pipe_buf(pipe, pipe->head - 1); 290 size = min_t(size_t, size, PAGE_SIZE - offset); 291 buf->len += size; 292 i->last_offset += size; 293 i->count -= size; 294 *off = offset; 295 return buf->page; 296 } 297 // OK, we need a new buffer 298 *off = 0; 299 size = min_t(size_t, size, PAGE_SIZE); 300 if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 301 return NULL; 302 page = push_anon(pipe, size); 303 if (!page) 304 return NULL; 305 i->head = pipe->head - 1; 306 i->last_offset = size; 307 i->count -= size; 308 return page; 309 } 310 311 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes, 312 struct iov_iter *i) 313 { 314 struct pipe_inode_info *pipe = i->pipe; 315 unsigned int head = pipe->head; 316 317 if (unlikely(bytes > i->count)) 318 bytes = i->count; 319 320 if (unlikely(!bytes)) 321 return 0; 322 323 if (!sanity(i)) 324 return 0; 325 326 if (offset && i->last_offset == -offset) { // could we merge it? 327 struct pipe_buffer *buf = pipe_buf(pipe, head - 1); 328 if (buf->page == page) { 329 buf->len += bytes; 330 i->last_offset -= bytes; 331 i->count -= bytes; 332 return bytes; 333 } 334 } 335 if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 336 return 0; 337 338 push_page(pipe, page, offset, bytes); 339 i->last_offset = -(offset + bytes); 340 i->head = head; 341 i->count -= bytes; 342 return bytes; 343 } 344 345 /* 346 * fault_in_iov_iter_readable - fault in iov iterator for reading 347 * @i: iterator 348 * @size: maximum length 349 * 350 * Fault in one or more iovecs of the given iov_iter, to a maximum length of 351 * @size. For each iovec, fault in each page that constitutes the iovec. 
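 *
 * A common caller pattern (an illustrative sketch, not lifted from a
 * specific in-tree user) is to give up only when nothing at all could be
 * faulted in:
 *
 *	if (fault_in_iov_iter_readable(i, bytes) == bytes)
 *		return -EFAULT;		// nothing was accessible at all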
352 * 353 * Returns the number of bytes not faulted in (like copy_to_user() and 354 * copy_from_user()). 355 * 356 * Always returns 0 for non-userspace iterators. 357 */ 358 size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) 359 { 360 if (iter_is_ubuf(i)) { 361 size_t n = min(size, iov_iter_count(i)); 362 n -= fault_in_readable(i->ubuf + i->iov_offset, n); 363 return size - n; 364 } else if (iter_is_iovec(i)) { 365 size_t count = min(size, iov_iter_count(i)); 366 const struct iovec *p; 367 size_t skip; 368 369 size -= count; 370 for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { 371 size_t len = min(count, p->iov_len - skip); 372 size_t ret; 373 374 if (unlikely(!len)) 375 continue; 376 ret = fault_in_readable(p->iov_base + skip, len); 377 count -= len - ret; 378 if (ret) 379 break; 380 } 381 return count + size; 382 } 383 return 0; 384 } 385 EXPORT_SYMBOL(fault_in_iov_iter_readable); 386 387 /* 388 * fault_in_iov_iter_writeable - fault in iov iterator for writing 389 * @i: iterator 390 * @size: maximum length 391 * 392 * Faults in the iterator using get_user_pages(), i.e., without triggering 393 * hardware page faults. This is primarily useful when we already know that 394 * some or all of the pages in @i aren't in memory. 395 * 396 * Returns the number of bytes not faulted in, like copy_to_user() and 397 * copy_from_user(). 398 * 399 * Always returns 0 for non-user-space iterators. 400 */ 401 size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) 402 { 403 if (iter_is_ubuf(i)) { 404 size_t n = min(size, iov_iter_count(i)); 405 n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n); 406 return size - n; 407 } else if (iter_is_iovec(i)) { 408 size_t count = min(size, iov_iter_count(i)); 409 const struct iovec *p; 410 size_t skip; 411 412 size -= count; 413 for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { 414 size_t len = min(count, p->iov_len - skip); 415 size_t ret; 416 417 if (unlikely(!len)) 418 continue; 419 ret = fault_in_safe_writeable(p->iov_base + skip, len); 420 count -= len - ret; 421 if (ret) 422 break; 423 } 424 return count + size; 425 } 426 return 0; 427 } 428 EXPORT_SYMBOL(fault_in_iov_iter_writeable); 429 430 void iov_iter_init(struct iov_iter *i, unsigned int direction, 431 const struct iovec *iov, unsigned long nr_segs, 432 size_t count) 433 { 434 WARN_ON(direction & ~(READ | WRITE)); 435 *i = (struct iov_iter) { 436 .iter_type = ITER_IOVEC, 437 .nofault = false, 438 .user_backed = true, 439 .data_source = direction, 440 .iov = iov, 441 .nr_segs = nr_segs, 442 .iov_offset = 0, 443 .count = count 444 }; 445 } 446 EXPORT_SYMBOL(iov_iter_init); 447 448 // returns the offset in partial buffer (if any) 449 static inline unsigned int pipe_npages(const struct iov_iter *i, int *npages) 450 { 451 struct pipe_inode_info *pipe = i->pipe; 452 int used = pipe->head - pipe->tail; 453 int off = i->last_offset; 454 455 *npages = max((int)pipe->max_usage - used, 0); 456 457 if (off > 0 && off < PAGE_SIZE) { // anon and not full 458 (*npages)++; 459 return off; 460 } 461 return 0; 462 } 463 464 static size_t copy_pipe_to_iter(const void *addr, size_t bytes, 465 struct iov_iter *i) 466 { 467 unsigned int off, chunk; 468 469 if (unlikely(bytes > i->count)) 470 bytes = i->count; 471 if (unlikely(!bytes)) 472 return 0; 473 474 if (!sanity(i)) 475 return 0; 476 477 for (size_t n = bytes; n; n -= chunk) { 478 struct page *page = append_pipe(i, n, &off); 479 chunk = min_t(size_t, n, PAGE_SIZE - off); 480 if (!page) 481 return 
bytes - n; 482 memcpy_to_page(page, off, addr, chunk); 483 addr += chunk; 484 } 485 return bytes; 486 } 487 488 static __wsum csum_and_memcpy(void *to, const void *from, size_t len, 489 __wsum sum, size_t off) 490 { 491 __wsum next = csum_partial_copy_nocheck(from, to, len); 492 return csum_block_add(sum, next, off); 493 } 494 495 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes, 496 struct iov_iter *i, __wsum *sump) 497 { 498 __wsum sum = *sump; 499 size_t off = 0; 500 unsigned int chunk, r; 501 502 if (unlikely(bytes > i->count)) 503 bytes = i->count; 504 if (unlikely(!bytes)) 505 return 0; 506 507 if (!sanity(i)) 508 return 0; 509 510 while (bytes) { 511 struct page *page = append_pipe(i, bytes, &r); 512 char *p; 513 514 if (!page) 515 break; 516 chunk = min_t(size_t, bytes, PAGE_SIZE - r); 517 p = kmap_local_page(page); 518 sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off); 519 kunmap_local(p); 520 off += chunk; 521 bytes -= chunk; 522 } 523 *sump = sum; 524 return off; 525 } 526 527 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) 528 { 529 if (WARN_ON_ONCE(i->data_source)) 530 return 0; 531 if (unlikely(iov_iter_is_pipe(i))) 532 return copy_pipe_to_iter(addr, bytes, i); 533 if (user_backed_iter(i)) 534 might_fault(); 535 iterate_and_advance(i, bytes, base, len, off, 536 copyout(base, addr + off, len), 537 memcpy(base, addr + off, len) 538 ) 539 540 return bytes; 541 } 542 EXPORT_SYMBOL(_copy_to_iter); 543 544 #ifdef CONFIG_ARCH_HAS_COPY_MC 545 static int copyout_mc(void __user *to, const void *from, size_t n) 546 { 547 if (access_ok(to, n)) { 548 instrument_copy_to_user(to, from, n); 549 n = copy_mc_to_user((__force void *) to, from, n); 550 } 551 return n; 552 } 553 554 static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes, 555 struct iov_iter *i) 556 { 557 size_t xfer = 0; 558 unsigned int off, chunk; 559 560 if (unlikely(bytes > i->count)) 561 bytes = i->count; 562 if (unlikely(!bytes)) 563 return 0; 564 565 if (!sanity(i)) 566 return 0; 567 568 while (bytes) { 569 struct page *page = append_pipe(i, bytes, &off); 570 unsigned long rem; 571 char *p; 572 573 if (!page) 574 break; 575 chunk = min_t(size_t, bytes, PAGE_SIZE - off); 576 p = kmap_local_page(page); 577 rem = copy_mc_to_kernel(p + off, addr + xfer, chunk); 578 chunk -= rem; 579 kunmap_local(p); 580 xfer += chunk; 581 bytes -= chunk; 582 if (rem) { 583 iov_iter_revert(i, rem); 584 break; 585 } 586 } 587 return xfer; 588 } 589 590 /** 591 * _copy_mc_to_iter - copy to iter with source memory error exception handling 592 * @addr: source kernel address 593 * @bytes: total transfer length 594 * @i: destination iterator 595 * 596 * The pmem driver deploys this for the dax operation 597 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the 598 * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes 599 * successfully copied. 600 * 601 * The main differences between this and typical _copy_to_iter(). 602 * 603 * * Typical tail/residue handling after a fault retries the copy 604 * byte-by-byte until the fault happens again. Re-triggering machine 605 * checks is potentially fatal so the implementation uses source 606 * alignment and poison alignment assumptions to avoid re-triggering 607 * hardware exceptions. 608 * 609 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies. 610 * Compare to copy_to_iter() where only ITER_IOVEC attempts might return 611 * a short copy. 
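 *
 * Because of that, callers generally have to check for a short copy; one
 * way to handle it (a hedged sketch, not a specific in-tree user) is:
 *
 *	if (_copy_mc_to_iter(src, len, i) != len)
 *		return -EIO;		// poison consumed part of the source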
612 * 613 * Return: number of bytes copied (may be %0) 614 */ 615 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) 616 { 617 if (WARN_ON_ONCE(i->data_source)) 618 return 0; 619 if (unlikely(iov_iter_is_pipe(i))) 620 return copy_mc_pipe_to_iter(addr, bytes, i); 621 if (user_backed_iter(i)) 622 might_fault(); 623 __iterate_and_advance(i, bytes, base, len, off, 624 copyout_mc(base, addr + off, len), 625 copy_mc_to_kernel(base, addr + off, len) 626 ) 627 628 return bytes; 629 } 630 EXPORT_SYMBOL_GPL(_copy_mc_to_iter); 631 #endif /* CONFIG_ARCH_HAS_COPY_MC */ 632 633 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) 634 { 635 if (WARN_ON_ONCE(!i->data_source)) 636 return 0; 637 638 if (user_backed_iter(i)) 639 might_fault(); 640 iterate_and_advance(i, bytes, base, len, off, 641 copyin(addr + off, base, len), 642 memcpy(addr + off, base, len) 643 ) 644 645 return bytes; 646 } 647 EXPORT_SYMBOL(_copy_from_iter); 648 649 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) 650 { 651 if (WARN_ON_ONCE(!i->data_source)) 652 return 0; 653 654 iterate_and_advance(i, bytes, base, len, off, 655 __copy_from_user_inatomic_nocache(addr + off, base, len), 656 memcpy(addr + off, base, len) 657 ) 658 659 return bytes; 660 } 661 EXPORT_SYMBOL(_copy_from_iter_nocache); 662 663 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE 664 /** 665 * _copy_from_iter_flushcache - write destination through cpu cache 666 * @addr: destination kernel address 667 * @bytes: total transfer length 668 * @i: source iterator 669 * 670 * The pmem driver arranges for filesystem-dax to use this facility via 671 * dax_copy_from_iter() for ensuring that writes to persistent memory 672 * are flushed through the CPU cache. It is differentiated from 673 * _copy_from_iter_nocache() in that guarantees all data is flushed for 674 * all iterator types. The _copy_from_iter_nocache() only attempts to 675 * bypass the cache for the ITER_IOVEC case, and on some archs may use 676 * instructions that strand dirty-data in the cache. 677 * 678 * Return: number of bytes copied (may be %0) 679 */ 680 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) 681 { 682 if (WARN_ON_ONCE(!i->data_source)) 683 return 0; 684 685 iterate_and_advance(i, bytes, base, len, off, 686 __copy_from_user_flushcache(addr + off, base, len), 687 memcpy_flushcache(addr + off, base, len) 688 ) 689 690 return bytes; 691 } 692 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); 693 #endif 694 695 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) 696 { 697 struct page *head; 698 size_t v = n + offset; 699 700 /* 701 * The general case needs to access the page order in order 702 * to compute the page size. 703 * However, we mostly deal with order-0 pages and thus can 704 * avoid a possible cache line miss for requests that fit all 705 * page orders. 
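 *
 * With v == n + offset, "n <= v" rejects overflow of that addition and
 * "v <= PAGE_SIZE" means the request stays within the single (sub)page
 * passed in, which is valid regardless of the compound page order, so the
 * fast path below is safe without looking the order up.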
706 */ 707 if (n <= v && v <= PAGE_SIZE) 708 return true; 709 710 head = compound_head(page); 711 v += (page - head) << PAGE_SHIFT; 712 713 if (WARN_ON(n > v || v > page_size(head))) 714 return false; 715 return true; 716 } 717 718 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, 719 struct iov_iter *i) 720 { 721 size_t res = 0; 722 if (!page_copy_sane(page, offset, bytes)) 723 return 0; 724 if (WARN_ON_ONCE(i->data_source)) 725 return 0; 726 if (unlikely(iov_iter_is_pipe(i))) 727 return copy_page_to_iter_pipe(page, offset, bytes, i); 728 page += offset / PAGE_SIZE; // first subpage 729 offset %= PAGE_SIZE; 730 while (1) { 731 void *kaddr = kmap_local_page(page); 732 size_t n = min(bytes, (size_t)PAGE_SIZE - offset); 733 n = _copy_to_iter(kaddr + offset, n, i); 734 kunmap_local(kaddr); 735 res += n; 736 bytes -= n; 737 if (!bytes || !n) 738 break; 739 offset += n; 740 if (offset == PAGE_SIZE) { 741 page++; 742 offset = 0; 743 } 744 } 745 return res; 746 } 747 EXPORT_SYMBOL(copy_page_to_iter); 748 749 size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, 750 struct iov_iter *i) 751 { 752 size_t res = 0; 753 754 if (!page_copy_sane(page, offset, bytes)) 755 return 0; 756 if (WARN_ON_ONCE(i->data_source)) 757 return 0; 758 if (unlikely(iov_iter_is_pipe(i))) 759 return copy_page_to_iter_pipe(page, offset, bytes, i); 760 page += offset / PAGE_SIZE; // first subpage 761 offset %= PAGE_SIZE; 762 while (1) { 763 void *kaddr = kmap_local_page(page); 764 size_t n = min(bytes, (size_t)PAGE_SIZE - offset); 765 766 iterate_and_advance(i, n, base, len, off, 767 copyout_nofault(base, kaddr + offset + off, len), 768 memcpy(base, kaddr + offset + off, len) 769 ) 770 kunmap_local(kaddr); 771 res += n; 772 bytes -= n; 773 if (!bytes || !n) 774 break; 775 offset += n; 776 if (offset == PAGE_SIZE) { 777 page++; 778 offset = 0; 779 } 780 } 781 return res; 782 } 783 EXPORT_SYMBOL(copy_page_to_iter_nofault); 784 785 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, 786 struct iov_iter *i) 787 { 788 size_t res = 0; 789 if (!page_copy_sane(page, offset, bytes)) 790 return 0; 791 page += offset / PAGE_SIZE; // first subpage 792 offset %= PAGE_SIZE; 793 while (1) { 794 void *kaddr = kmap_local_page(page); 795 size_t n = min(bytes, (size_t)PAGE_SIZE - offset); 796 n = _copy_from_iter(kaddr + offset, n, i); 797 kunmap_local(kaddr); 798 res += n; 799 bytes -= n; 800 if (!bytes || !n) 801 break; 802 offset += n; 803 if (offset == PAGE_SIZE) { 804 page++; 805 offset = 0; 806 } 807 } 808 return res; 809 } 810 EXPORT_SYMBOL(copy_page_from_iter); 811 812 static size_t pipe_zero(size_t bytes, struct iov_iter *i) 813 { 814 unsigned int chunk, off; 815 816 if (unlikely(bytes > i->count)) 817 bytes = i->count; 818 if (unlikely(!bytes)) 819 return 0; 820 821 if (!sanity(i)) 822 return 0; 823 824 for (size_t n = bytes; n; n -= chunk) { 825 struct page *page = append_pipe(i, n, &off); 826 char *p; 827 828 if (!page) 829 return bytes - n; 830 chunk = min_t(size_t, n, PAGE_SIZE - off); 831 p = kmap_local_page(page); 832 memset(p + off, 0, chunk); 833 kunmap_local(p); 834 } 835 return bytes; 836 } 837 838 size_t iov_iter_zero(size_t bytes, struct iov_iter *i) 839 { 840 if (unlikely(iov_iter_is_pipe(i))) 841 return pipe_zero(bytes, i); 842 iterate_and_advance(i, bytes, base, len, count, 843 clear_user(base, len), 844 memset(base, 0, len) 845 ) 846 847 return bytes; 848 } 849 EXPORT_SYMBOL(iov_iter_zero); 850 851 size_t copy_page_from_iter_atomic(struct page 
*page, unsigned offset, size_t bytes, 852 struct iov_iter *i) 853 { 854 char *kaddr = kmap_atomic(page), *p = kaddr + offset; 855 if (!page_copy_sane(page, offset, bytes)) { 856 kunmap_atomic(kaddr); 857 return 0; 858 } 859 if (WARN_ON_ONCE(!i->data_source)) { 860 kunmap_atomic(kaddr); 861 return 0; 862 } 863 iterate_and_advance(i, bytes, base, len, off, 864 copyin(p + off, base, len), 865 memcpy(p + off, base, len) 866 ) 867 kunmap_atomic(kaddr); 868 return bytes; 869 } 870 EXPORT_SYMBOL(copy_page_from_iter_atomic); 871 872 static void pipe_advance(struct iov_iter *i, size_t size) 873 { 874 struct pipe_inode_info *pipe = i->pipe; 875 int off = i->last_offset; 876 877 if (!off && !size) { 878 pipe_discard_from(pipe, i->start_head); // discard everything 879 return; 880 } 881 i->count -= size; 882 while (1) { 883 struct pipe_buffer *buf = pipe_buf(pipe, i->head); 884 if (off) /* make it relative to the beginning of buffer */ 885 size += abs(off) - buf->offset; 886 if (size <= buf->len) { 887 buf->len = size; 888 i->last_offset = last_offset(buf); 889 break; 890 } 891 size -= buf->len; 892 i->head++; 893 off = 0; 894 } 895 pipe_discard_from(pipe, i->head + 1); // discard everything past this one 896 } 897 898 static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) 899 { 900 const struct bio_vec *bvec, *end; 901 902 if (!i->count) 903 return; 904 i->count -= size; 905 906 size += i->iov_offset; 907 908 for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) { 909 if (likely(size < bvec->bv_len)) 910 break; 911 size -= bvec->bv_len; 912 } 913 i->iov_offset = size; 914 i->nr_segs -= bvec - i->bvec; 915 i->bvec = bvec; 916 } 917 918 static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) 919 { 920 const struct iovec *iov, *end; 921 922 if (!i->count) 923 return; 924 i->count -= size; 925 926 size += i->iov_offset; // from beginning of current segment 927 for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) { 928 if (likely(size < iov->iov_len)) 929 break; 930 size -= iov->iov_len; 931 } 932 i->iov_offset = size; 933 i->nr_segs -= iov - i->iov; 934 i->iov = iov; 935 } 936 937 void iov_iter_advance(struct iov_iter *i, size_t size) 938 { 939 if (unlikely(i->count < size)) 940 size = i->count; 941 if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) { 942 i->iov_offset += size; 943 i->count -= size; 944 } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { 945 /* iovec and kvec have identical layouts */ 946 iov_iter_iovec_advance(i, size); 947 } else if (iov_iter_is_bvec(i)) { 948 iov_iter_bvec_advance(i, size); 949 } else if (iov_iter_is_pipe(i)) { 950 pipe_advance(i, size); 951 } else if (iov_iter_is_discard(i)) { 952 i->count -= size; 953 } 954 } 955 EXPORT_SYMBOL(iov_iter_advance); 956 957 void iov_iter_revert(struct iov_iter *i, size_t unroll) 958 { 959 if (!unroll) 960 return; 961 if (WARN_ON(unroll > MAX_RW_COUNT)) 962 return; 963 i->count += unroll; 964 if (unlikely(iov_iter_is_pipe(i))) { 965 struct pipe_inode_info *pipe = i->pipe; 966 unsigned int head = pipe->head; 967 968 while (head > i->start_head) { 969 struct pipe_buffer *b = pipe_buf(pipe, --head); 970 if (unroll < b->len) { 971 b->len -= unroll; 972 i->last_offset = last_offset(b); 973 i->head = head; 974 return; 975 } 976 unroll -= b->len; 977 pipe_buf_release(pipe, b); 978 pipe->head--; 979 } 980 i->last_offset = 0; 981 i->head = head; 982 return; 983 } 984 if (unlikely(iov_iter_is_discard(i))) 985 return; 986 if (unroll <= i->iov_offset) { 987 i->iov_offset -= unroll; 
988 return; 989 } 990 unroll -= i->iov_offset; 991 if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) { 992 BUG(); /* We should never go beyond the start of the specified 993 * range since we might then be straying into pages that 994 * aren't pinned. 995 */ 996 } else if (iov_iter_is_bvec(i)) { 997 const struct bio_vec *bvec = i->bvec; 998 while (1) { 999 size_t n = (--bvec)->bv_len; 1000 i->nr_segs++; 1001 if (unroll <= n) { 1002 i->bvec = bvec; 1003 i->iov_offset = n - unroll; 1004 return; 1005 } 1006 unroll -= n; 1007 } 1008 } else { /* same logics for iovec and kvec */ 1009 const struct iovec *iov = i->iov; 1010 while (1) { 1011 size_t n = (--iov)->iov_len; 1012 i->nr_segs++; 1013 if (unroll <= n) { 1014 i->iov = iov; 1015 i->iov_offset = n - unroll; 1016 return; 1017 } 1018 unroll -= n; 1019 } 1020 } 1021 } 1022 EXPORT_SYMBOL(iov_iter_revert); 1023 1024 /* 1025 * Return the count of just the current iov_iter segment. 1026 */ 1027 size_t iov_iter_single_seg_count(const struct iov_iter *i) 1028 { 1029 if (i->nr_segs > 1) { 1030 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) 1031 return min(i->count, i->iov->iov_len - i->iov_offset); 1032 if (iov_iter_is_bvec(i)) 1033 return min(i->count, i->bvec->bv_len - i->iov_offset); 1034 } 1035 return i->count; 1036 } 1037 EXPORT_SYMBOL(iov_iter_single_seg_count); 1038 1039 void iov_iter_kvec(struct iov_iter *i, unsigned int direction, 1040 const struct kvec *kvec, unsigned long nr_segs, 1041 size_t count) 1042 { 1043 WARN_ON(direction & ~(READ | WRITE)); 1044 *i = (struct iov_iter){ 1045 .iter_type = ITER_KVEC, 1046 .data_source = direction, 1047 .kvec = kvec, 1048 .nr_segs = nr_segs, 1049 .iov_offset = 0, 1050 .count = count 1051 }; 1052 } 1053 EXPORT_SYMBOL(iov_iter_kvec); 1054 1055 void iov_iter_bvec(struct iov_iter *i, unsigned int direction, 1056 const struct bio_vec *bvec, unsigned long nr_segs, 1057 size_t count) 1058 { 1059 WARN_ON(direction & ~(READ | WRITE)); 1060 *i = (struct iov_iter){ 1061 .iter_type = ITER_BVEC, 1062 .data_source = direction, 1063 .bvec = bvec, 1064 .nr_segs = nr_segs, 1065 .iov_offset = 0, 1066 .count = count 1067 }; 1068 } 1069 EXPORT_SYMBOL(iov_iter_bvec); 1070 1071 void iov_iter_pipe(struct iov_iter *i, unsigned int direction, 1072 struct pipe_inode_info *pipe, 1073 size_t count) 1074 { 1075 BUG_ON(direction != READ); 1076 WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size)); 1077 *i = (struct iov_iter){ 1078 .iter_type = ITER_PIPE, 1079 .data_source = false, 1080 .pipe = pipe, 1081 .head = pipe->head, 1082 .start_head = pipe->head, 1083 .last_offset = 0, 1084 .count = count 1085 }; 1086 } 1087 EXPORT_SYMBOL(iov_iter_pipe); 1088 1089 /** 1090 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray 1091 * @i: The iterator to initialise. 1092 * @direction: The direction of the transfer. 1093 * @xarray: The xarray to access. 1094 * @start: The start file position. 1095 * @count: The size of the I/O buffer in bytes. 1096 * 1097 * Set up an I/O iterator to either draw data out of the pages attached to an 1098 * inode or to inject data into those pages. The pages *must* be prevented 1099 * from evaporation, either by taking a ref on them or locking them by the 1100 * caller. 
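 *
 * A minimal setup sketch (illustrative only; the mapping and position
 * names are assumptions, not taken from a particular caller): a filesystem
 * that wants data to land in already-held pagecache pages could do
 *
 *	iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, len);
 *
 * after which copy_to_iter() and friends will write into those pages.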
1101 */ 1102 void iov_iter_xarray(struct iov_iter *i, unsigned int direction, 1103 struct xarray *xarray, loff_t start, size_t count) 1104 { 1105 BUG_ON(direction & ~1); 1106 *i = (struct iov_iter) { 1107 .iter_type = ITER_XARRAY, 1108 .data_source = direction, 1109 .xarray = xarray, 1110 .xarray_start = start, 1111 .count = count, 1112 .iov_offset = 0 1113 }; 1114 } 1115 EXPORT_SYMBOL(iov_iter_xarray); 1116 1117 /** 1118 * iov_iter_discard - Initialise an I/O iterator that discards data 1119 * @i: The iterator to initialise. 1120 * @direction: The direction of the transfer. 1121 * @count: The size of the I/O buffer in bytes. 1122 * 1123 * Set up an I/O iterator that just discards everything that's written to it. 1124 * It's only available as a READ iterator. 1125 */ 1126 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) 1127 { 1128 BUG_ON(direction != READ); 1129 *i = (struct iov_iter){ 1130 .iter_type = ITER_DISCARD, 1131 .data_source = false, 1132 .count = count, 1133 .iov_offset = 0 1134 }; 1135 } 1136 EXPORT_SYMBOL(iov_iter_discard); 1137 1138 static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, 1139 unsigned len_mask) 1140 { 1141 size_t size = i->count; 1142 size_t skip = i->iov_offset; 1143 unsigned k; 1144 1145 for (k = 0; k < i->nr_segs; k++, skip = 0) { 1146 size_t len = i->iov[k].iov_len - skip; 1147 1148 if (len > size) 1149 len = size; 1150 if (len & len_mask) 1151 return false; 1152 if ((unsigned long)(i->iov[k].iov_base + skip) & addr_mask) 1153 return false; 1154 1155 size -= len; 1156 if (!size) 1157 break; 1158 } 1159 return true; 1160 } 1161 1162 static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, 1163 unsigned len_mask) 1164 { 1165 size_t size = i->count; 1166 unsigned skip = i->iov_offset; 1167 unsigned k; 1168 1169 for (k = 0; k < i->nr_segs; k++, skip = 0) { 1170 size_t len = i->bvec[k].bv_len - skip; 1171 1172 if (len > size) 1173 len = size; 1174 if (len & len_mask) 1175 return false; 1176 if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask) 1177 return false; 1178 1179 size -= len; 1180 if (!size) 1181 break; 1182 } 1183 return true; 1184 } 1185 1186 /** 1187 * iov_iter_is_aligned() - Check if the addresses and lengths of each segments 1188 * are aligned to the parameters. 
1189 * 1190 * @i: &struct iov_iter to restore 1191 * @addr_mask: bit mask to check against the iov element's addresses 1192 * @len_mask: bit mask to check against the iov element's lengths 1193 * 1194 * Return: false if any addresses or lengths intersect with the provided masks 1195 */ 1196 bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, 1197 unsigned len_mask) 1198 { 1199 if (likely(iter_is_ubuf(i))) { 1200 if (i->count & len_mask) 1201 return false; 1202 if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask) 1203 return false; 1204 return true; 1205 } 1206 1207 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) 1208 return iov_iter_aligned_iovec(i, addr_mask, len_mask); 1209 1210 if (iov_iter_is_bvec(i)) 1211 return iov_iter_aligned_bvec(i, addr_mask, len_mask); 1212 1213 if (iov_iter_is_pipe(i)) { 1214 size_t size = i->count; 1215 1216 if (size & len_mask) 1217 return false; 1218 if (size && i->last_offset > 0) { 1219 if (i->last_offset & addr_mask) 1220 return false; 1221 } 1222 1223 return true; 1224 } 1225 1226 if (iov_iter_is_xarray(i)) { 1227 if (i->count & len_mask) 1228 return false; 1229 if ((i->xarray_start + i->iov_offset) & addr_mask) 1230 return false; 1231 } 1232 1233 return true; 1234 } 1235 EXPORT_SYMBOL_GPL(iov_iter_is_aligned); 1236 1237 static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) 1238 { 1239 unsigned long res = 0; 1240 size_t size = i->count; 1241 size_t skip = i->iov_offset; 1242 unsigned k; 1243 1244 for (k = 0; k < i->nr_segs; k++, skip = 0) { 1245 size_t len = i->iov[k].iov_len - skip; 1246 if (len) { 1247 res |= (unsigned long)i->iov[k].iov_base + skip; 1248 if (len > size) 1249 len = size; 1250 res |= len; 1251 size -= len; 1252 if (!size) 1253 break; 1254 } 1255 } 1256 return res; 1257 } 1258 1259 static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) 1260 { 1261 unsigned res = 0; 1262 size_t size = i->count; 1263 unsigned skip = i->iov_offset; 1264 unsigned k; 1265 1266 for (k = 0; k < i->nr_segs; k++, skip = 0) { 1267 size_t len = i->bvec[k].bv_len - skip; 1268 res |= (unsigned long)i->bvec[k].bv_offset + skip; 1269 if (len > size) 1270 len = size; 1271 res |= len; 1272 size -= len; 1273 if (!size) 1274 break; 1275 } 1276 return res; 1277 } 1278 1279 unsigned long iov_iter_alignment(const struct iov_iter *i) 1280 { 1281 if (likely(iter_is_ubuf(i))) { 1282 size_t size = i->count; 1283 if (size) 1284 return ((unsigned long)i->ubuf + i->iov_offset) | size; 1285 return 0; 1286 } 1287 1288 /* iovec and kvec have identical layouts */ 1289 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) 1290 return iov_iter_alignment_iovec(i); 1291 1292 if (iov_iter_is_bvec(i)) 1293 return iov_iter_alignment_bvec(i); 1294 1295 if (iov_iter_is_pipe(i)) { 1296 size_t size = i->count; 1297 1298 if (size && i->last_offset > 0) 1299 return size | i->last_offset; 1300 return size; 1301 } 1302 1303 if (iov_iter_is_xarray(i)) 1304 return (i->xarray_start + i->iov_offset) | i->count; 1305 1306 return 0; 1307 } 1308 EXPORT_SYMBOL(iov_iter_alignment); 1309 1310 unsigned long iov_iter_gap_alignment(const struct iov_iter *i) 1311 { 1312 unsigned long res = 0; 1313 unsigned long v = 0; 1314 size_t size = i->count; 1315 unsigned k; 1316 1317 if (iter_is_ubuf(i)) 1318 return 0; 1319 1320 if (WARN_ON(!iter_is_iovec(i))) 1321 return ~0U; 1322 1323 for (k = 0; k < i->nr_segs; k++) { 1324 if (i->iov[k].iov_len) { 1325 unsigned long base = (unsigned long)i->iov[k].iov_base; 1326 if (v) // if not the first one 1327 res |= base | 
v; // this start | previous end 1328 v = base + i->iov[k].iov_len; 1329 if (size <= i->iov[k].iov_len) 1330 break; 1331 size -= i->iov[k].iov_len; 1332 } 1333 } 1334 return res; 1335 } 1336 EXPORT_SYMBOL(iov_iter_gap_alignment); 1337 1338 static int want_pages_array(struct page ***res, size_t size, 1339 size_t start, unsigned int maxpages) 1340 { 1341 unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE); 1342 1343 if (count > maxpages) 1344 count = maxpages; 1345 WARN_ON(!count); // caller should've prevented that 1346 if (!*res) { 1347 *res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL); 1348 if (!*res) 1349 return 0; 1350 } 1351 return count; 1352 } 1353 1354 static ssize_t pipe_get_pages(struct iov_iter *i, 1355 struct page ***pages, size_t maxsize, unsigned maxpages, 1356 size_t *start) 1357 { 1358 unsigned int npages, count, off, chunk; 1359 struct page **p; 1360 size_t left; 1361 1362 if (!sanity(i)) 1363 return -EFAULT; 1364 1365 *start = off = pipe_npages(i, &npages); 1366 if (!npages) 1367 return -EFAULT; 1368 count = want_pages_array(pages, maxsize, off, min(npages, maxpages)); 1369 if (!count) 1370 return -ENOMEM; 1371 p = *pages; 1372 for (npages = 0, left = maxsize ; npages < count; npages++, left -= chunk) { 1373 struct page *page = append_pipe(i, left, &off); 1374 if (!page) 1375 break; 1376 chunk = min_t(size_t, left, PAGE_SIZE - off); 1377 get_page(*p++ = page); 1378 } 1379 if (!npages) 1380 return -EFAULT; 1381 return maxsize - left; 1382 } 1383 1384 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, 1385 pgoff_t index, unsigned int nr_pages) 1386 { 1387 XA_STATE(xas, xa, index); 1388 struct page *page; 1389 unsigned int ret = 0; 1390 1391 rcu_read_lock(); 1392 for (page = xas_load(&xas); page; page = xas_next(&xas)) { 1393 if (xas_retry(&xas, page)) 1394 continue; 1395 1396 /* Has the page moved or been split? 
*/ 1397 if (unlikely(page != xas_reload(&xas))) { 1398 xas_reset(&xas); 1399 continue; 1400 } 1401 1402 pages[ret] = find_subpage(page, xas.xa_index); 1403 get_page(pages[ret]); 1404 if (++ret == nr_pages) 1405 break; 1406 } 1407 rcu_read_unlock(); 1408 return ret; 1409 } 1410 1411 static ssize_t iter_xarray_get_pages(struct iov_iter *i, 1412 struct page ***pages, size_t maxsize, 1413 unsigned maxpages, size_t *_start_offset) 1414 { 1415 unsigned nr, offset, count; 1416 pgoff_t index; 1417 loff_t pos; 1418 1419 pos = i->xarray_start + i->iov_offset; 1420 index = pos >> PAGE_SHIFT; 1421 offset = pos & ~PAGE_MASK; 1422 *_start_offset = offset; 1423 1424 count = want_pages_array(pages, maxsize, offset, maxpages); 1425 if (!count) 1426 return -ENOMEM; 1427 nr = iter_xarray_populate_pages(*pages, i->xarray, index, count); 1428 if (nr == 0) 1429 return 0; 1430 1431 maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); 1432 i->iov_offset += maxsize; 1433 i->count -= maxsize; 1434 return maxsize; 1435 } 1436 1437 /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */ 1438 static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) 1439 { 1440 size_t skip; 1441 long k; 1442 1443 if (iter_is_ubuf(i)) 1444 return (unsigned long)i->ubuf + i->iov_offset; 1445 1446 for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { 1447 size_t len = i->iov[k].iov_len - skip; 1448 1449 if (unlikely(!len)) 1450 continue; 1451 if (*size > len) 1452 *size = len; 1453 return (unsigned long)i->iov[k].iov_base + skip; 1454 } 1455 BUG(); // if it had been empty, we wouldn't get called 1456 } 1457 1458 /* must be done on non-empty ITER_BVEC one */ 1459 static struct page *first_bvec_segment(const struct iov_iter *i, 1460 size_t *size, size_t *start) 1461 { 1462 struct page *page; 1463 size_t skip = i->iov_offset, len; 1464 1465 len = i->bvec->bv_len - skip; 1466 if (*size > len) 1467 *size = len; 1468 skip += i->bvec->bv_offset; 1469 page = i->bvec->bv_page + skip / PAGE_SIZE; 1470 *start = skip % PAGE_SIZE; 1471 return page; 1472 } 1473 1474 static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, 1475 struct page ***pages, size_t maxsize, 1476 unsigned int maxpages, size_t *start, 1477 iov_iter_extraction_t extraction_flags) 1478 { 1479 unsigned int n, gup_flags = 0; 1480 1481 if (maxsize > i->count) 1482 maxsize = i->count; 1483 if (!maxsize) 1484 return 0; 1485 if (maxsize > MAX_RW_COUNT) 1486 maxsize = MAX_RW_COUNT; 1487 if (extraction_flags & ITER_ALLOW_P2PDMA) 1488 gup_flags |= FOLL_PCI_P2PDMA; 1489 1490 if (likely(user_backed_iter(i))) { 1491 unsigned long addr; 1492 int res; 1493 1494 if (iov_iter_rw(i) != WRITE) 1495 gup_flags |= FOLL_WRITE; 1496 if (i->nofault) 1497 gup_flags |= FOLL_NOFAULT; 1498 1499 addr = first_iovec_segment(i, &maxsize); 1500 *start = addr % PAGE_SIZE; 1501 addr &= PAGE_MASK; 1502 n = want_pages_array(pages, maxsize, *start, maxpages); 1503 if (!n) 1504 return -ENOMEM; 1505 res = get_user_pages_fast(addr, n, gup_flags, *pages); 1506 if (unlikely(res <= 0)) 1507 return res; 1508 maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start); 1509 iov_iter_advance(i, maxsize); 1510 return maxsize; 1511 } 1512 if (iov_iter_is_bvec(i)) { 1513 struct page **p; 1514 struct page *page; 1515 1516 page = first_bvec_segment(i, &maxsize, start); 1517 n = want_pages_array(pages, maxsize, *start, maxpages); 1518 if (!n) 1519 return -ENOMEM; 1520 p = *pages; 1521 for (int k = 0; k < n; k++) 1522 get_page(p[k] = page + k); 1523 maxsize = min_t(size_t, maxsize, n * 
PAGE_SIZE - *start); 1524 i->count -= maxsize; 1525 i->iov_offset += maxsize; 1526 if (i->iov_offset == i->bvec->bv_len) { 1527 i->iov_offset = 0; 1528 i->bvec++; 1529 i->nr_segs--; 1530 } 1531 return maxsize; 1532 } 1533 if (iov_iter_is_pipe(i)) 1534 return pipe_get_pages(i, pages, maxsize, maxpages, start); 1535 if (iov_iter_is_xarray(i)) 1536 return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); 1537 return -EFAULT; 1538 } 1539 1540 ssize_t iov_iter_get_pages(struct iov_iter *i, 1541 struct page **pages, size_t maxsize, unsigned maxpages, 1542 size_t *start, iov_iter_extraction_t extraction_flags) 1543 { 1544 if (!maxpages) 1545 return 0; 1546 BUG_ON(!pages); 1547 1548 return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, 1549 start, extraction_flags); 1550 } 1551 EXPORT_SYMBOL_GPL(iov_iter_get_pages); 1552 1553 ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, 1554 size_t maxsize, unsigned maxpages, size_t *start) 1555 { 1556 return iov_iter_get_pages(i, pages, maxsize, maxpages, start, 0); 1557 } 1558 EXPORT_SYMBOL(iov_iter_get_pages2); 1559 1560 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, 1561 struct page ***pages, size_t maxsize, 1562 size_t *start, iov_iter_extraction_t extraction_flags) 1563 { 1564 ssize_t len; 1565 1566 *pages = NULL; 1567 1568 len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start, 1569 extraction_flags); 1570 if (len <= 0) { 1571 kvfree(*pages); 1572 *pages = NULL; 1573 } 1574 return len; 1575 } 1576 EXPORT_SYMBOL_GPL(iov_iter_get_pages_alloc); 1577 1578 ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, 1579 struct page ***pages, size_t maxsize, size_t *start) 1580 { 1581 return iov_iter_get_pages_alloc(i, pages, maxsize, start, 0); 1582 } 1583 EXPORT_SYMBOL(iov_iter_get_pages_alloc2); 1584 1585 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, 1586 struct iov_iter *i) 1587 { 1588 __wsum sum, next; 1589 sum = *csum; 1590 if (WARN_ON_ONCE(!i->data_source)) 1591 return 0; 1592 1593 iterate_and_advance(i, bytes, base, len, off, ({ 1594 next = csum_and_copy_from_user(base, addr + off, len); 1595 sum = csum_block_add(sum, next, off); 1596 next ? 0 : len; 1597 }), ({ 1598 sum = csum_and_memcpy(addr + off, base, len, sum, off); 1599 }) 1600 ) 1601 *csum = sum; 1602 return bytes; 1603 } 1604 EXPORT_SYMBOL(csum_and_copy_from_iter); 1605 1606 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate, 1607 struct iov_iter *i) 1608 { 1609 struct csum_state *csstate = _csstate; 1610 __wsum sum, next; 1611 1612 if (WARN_ON_ONCE(i->data_source)) 1613 return 0; 1614 if (unlikely(iov_iter_is_discard(i))) { 1615 // can't use csum_memcpy() for that one - data is not copied 1616 csstate->csum = csum_block_add(csstate->csum, 1617 csum_partial(addr, bytes, 0), 1618 csstate->off); 1619 csstate->off += bytes; 1620 return bytes; 1621 } 1622 1623 sum = csum_shift(csstate->csum, csstate->off); 1624 if (unlikely(iov_iter_is_pipe(i))) 1625 bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum); 1626 else iterate_and_advance(i, bytes, base, len, off, ({ 1627 next = csum_and_copy_to_user(addr + off, base, len); 1628 sum = csum_block_add(sum, next, off); 1629 next ? 
0 : len; 1630 }), ({ 1631 sum = csum_and_memcpy(base, addr + off, len, sum, off); 1632 }) 1633 ) 1634 csstate->csum = csum_shift(sum, csstate->off); 1635 csstate->off += bytes; 1636 return bytes; 1637 } 1638 EXPORT_SYMBOL(csum_and_copy_to_iter); 1639 1640 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, 1641 struct iov_iter *i) 1642 { 1643 #ifdef CONFIG_CRYPTO_HASH 1644 struct ahash_request *hash = hashp; 1645 struct scatterlist sg; 1646 size_t copied; 1647 1648 copied = copy_to_iter(addr, bytes, i); 1649 sg_init_one(&sg, addr, copied); 1650 ahash_request_set_crypt(hash, &sg, NULL, copied); 1651 crypto_ahash_update(hash); 1652 return copied; 1653 #else 1654 return 0; 1655 #endif 1656 } 1657 EXPORT_SYMBOL(hash_and_copy_to_iter); 1658 1659 static int iov_npages(const struct iov_iter *i, int maxpages) 1660 { 1661 size_t skip = i->iov_offset, size = i->count; 1662 const struct iovec *p; 1663 int npages = 0; 1664 1665 for (p = i->iov; size; skip = 0, p++) { 1666 unsigned offs = offset_in_page(p->iov_base + skip); 1667 size_t len = min(p->iov_len - skip, size); 1668 1669 if (len) { 1670 size -= len; 1671 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); 1672 if (unlikely(npages > maxpages)) 1673 return maxpages; 1674 } 1675 } 1676 return npages; 1677 } 1678 1679 static int bvec_npages(const struct iov_iter *i, int maxpages) 1680 { 1681 size_t skip = i->iov_offset, size = i->count; 1682 const struct bio_vec *p; 1683 int npages = 0; 1684 1685 for (p = i->bvec; size; skip = 0, p++) { 1686 unsigned offs = (p->bv_offset + skip) % PAGE_SIZE; 1687 size_t len = min(p->bv_len - skip, size); 1688 1689 size -= len; 1690 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); 1691 if (unlikely(npages > maxpages)) 1692 return maxpages; 1693 } 1694 return npages; 1695 } 1696 1697 int iov_iter_npages(const struct iov_iter *i, int maxpages) 1698 { 1699 if (unlikely(!i->count)) 1700 return 0; 1701 if (likely(iter_is_ubuf(i))) { 1702 unsigned offs = offset_in_page(i->ubuf + i->iov_offset); 1703 int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE); 1704 return min(npages, maxpages); 1705 } 1706 /* iovec and kvec have identical layouts */ 1707 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) 1708 return iov_npages(i, maxpages); 1709 if (iov_iter_is_bvec(i)) 1710 return bvec_npages(i, maxpages); 1711 if (iov_iter_is_pipe(i)) { 1712 int npages; 1713 1714 if (!sanity(i)) 1715 return 0; 1716 1717 pipe_npages(i, &npages); 1718 return min(npages, maxpages); 1719 } 1720 if (iov_iter_is_xarray(i)) { 1721 unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; 1722 int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); 1723 return min(npages, maxpages); 1724 } 1725 return 0; 1726 } 1727 EXPORT_SYMBOL(iov_iter_npages); 1728 1729 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) 1730 { 1731 *new = *old; 1732 if (unlikely(iov_iter_is_pipe(new))) { 1733 WARN_ON(1); 1734 return NULL; 1735 } 1736 if (iov_iter_is_bvec(new)) 1737 return new->bvec = kmemdup(new->bvec, 1738 new->nr_segs * sizeof(struct bio_vec), 1739 flags); 1740 else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) 1741 /* iovec and kvec have identical layout */ 1742 return new->iov = kmemdup(new->iov, 1743 new->nr_segs * sizeof(struct iovec), 1744 flags); 1745 return NULL; 1746 } 1747 EXPORT_SYMBOL(dup_iter); 1748 1749 static int copy_compat_iovec_from_user(struct iovec *iov, 1750 const struct iovec __user *uvec, unsigned long nr_segs) 1751 { 1752 const struct compat_iovec __user *uiov = 1753 (const struct 
compat_iovec __user *)uvec; 1754 int ret = -EFAULT, i; 1755 1756 if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) 1757 return -EFAULT; 1758 1759 for (i = 0; i < nr_segs; i++) { 1760 compat_uptr_t buf; 1761 compat_ssize_t len; 1762 1763 unsafe_get_user(len, &uiov[i].iov_len, uaccess_end); 1764 unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end); 1765 1766 /* check for compat_size_t not fitting in compat_ssize_t .. */ 1767 if (len < 0) { 1768 ret = -EINVAL; 1769 goto uaccess_end; 1770 } 1771 iov[i].iov_base = compat_ptr(buf); 1772 iov[i].iov_len = len; 1773 } 1774 1775 ret = 0; 1776 uaccess_end: 1777 user_access_end(); 1778 return ret; 1779 } 1780 1781 static int copy_iovec_from_user(struct iovec *iov, 1782 const struct iovec __user *uvec, unsigned long nr_segs) 1783 { 1784 unsigned long seg; 1785 1786 if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec))) 1787 return -EFAULT; 1788 for (seg = 0; seg < nr_segs; seg++) { 1789 if ((ssize_t)iov[seg].iov_len < 0) 1790 return -EINVAL; 1791 } 1792 1793 return 0; 1794 } 1795 1796 struct iovec *iovec_from_user(const struct iovec __user *uvec, 1797 unsigned long nr_segs, unsigned long fast_segs, 1798 struct iovec *fast_iov, bool compat) 1799 { 1800 struct iovec *iov = fast_iov; 1801 int ret; 1802 1803 /* 1804 * SuS says "The readv() function *may* fail if the iovcnt argument was 1805 * less than or equal to 0, or greater than {IOV_MAX}. Linux has 1806 * traditionally returned zero for zero segments, so... 1807 */ 1808 if (nr_segs == 0) 1809 return iov; 1810 if (nr_segs > UIO_MAXIOV) 1811 return ERR_PTR(-EINVAL); 1812 if (nr_segs > fast_segs) { 1813 iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); 1814 if (!iov) 1815 return ERR_PTR(-ENOMEM); 1816 } 1817 1818 if (compat) 1819 ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); 1820 else 1821 ret = copy_iovec_from_user(iov, uvec, nr_segs); 1822 if (ret) { 1823 if (iov != fast_iov) 1824 kfree(iov); 1825 return ERR_PTR(ret); 1826 } 1827 1828 return iov; 1829 } 1830 1831 ssize_t __import_iovec(int type, const struct iovec __user *uvec, 1832 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, 1833 struct iov_iter *i, bool compat) 1834 { 1835 ssize_t total_len = 0; 1836 unsigned long seg; 1837 struct iovec *iov; 1838 1839 iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); 1840 if (IS_ERR(iov)) { 1841 *iovp = NULL; 1842 return PTR_ERR(iov); 1843 } 1844 1845 /* 1846 * According to the Single Unix Specification we should return EINVAL if 1847 * an element length is < 0 when cast to ssize_t or if the total length 1848 * would overflow the ssize_t return value of the system call. 1849 * 1850 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the 1851 * overflow case. 1852 */ 1853 for (seg = 0; seg < nr_segs; seg++) { 1854 ssize_t len = (ssize_t)iov[seg].iov_len; 1855 1856 if (!access_ok(iov[seg].iov_base, len)) { 1857 if (iov != *iovp) 1858 kfree(iov); 1859 *iovp = NULL; 1860 return -EFAULT; 1861 } 1862 1863 if (len > MAX_RW_COUNT - total_len) { 1864 len = MAX_RW_COUNT - total_len; 1865 iov[seg].iov_len = len; 1866 } 1867 total_len += len; 1868 } 1869 1870 iov_iter_init(i, type, iov, nr_segs, total_len); 1871 if (iov == *iovp) 1872 *iovp = NULL; 1873 else 1874 *iovp = iov; 1875 return total_len; 1876 } 1877 1878 /** 1879 * import_iovec() - Copy an array of &struct iovec from userspace 1880 * into the kernel, check that it is valid, and initialize a new 1881 * &struct iov_iter iterator to access it. 1882 * 1883 * @type: One of %READ or %WRITE. 
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in the array pointed to by @iovp.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs,
		 struct iovec **iovp, struct iov_iter *i)
{
	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
			      in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);

int import_single_range(int rw, void __user *buf, size_t len,
		 struct iovec *iov, struct iov_iter *i)
{
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	if (unlikely(!access_ok(buf, len)))
		return -EFAULT;

	iov->iov_base = buf;
	iov->iov_len = len;
	iov_iter_init(i, rw, iov, 1, len);
	return 0;
}
EXPORT_SYMBOL(import_single_range);

int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
{
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	if (unlikely(!access_ok(buf, len)))
		return -EFAULT;

	iov_iter_ubuf(i, rw, buf, len);
	return 0;
}

/**
 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
 *	iov_iter_save_state() was called.
 *
 * @i: &struct iov_iter to restore
 * @state: state to restore from
 *
 * Used after iov_iter_save_state() to restore @i, if operations may have
 * advanced it.
 *
 * Note: only works on ITER_IOVEC, ITER_BVEC, ITER_KVEC and ITER_UBUF
 */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
			 !iter_is_ubuf(i)) && !iov_iter_is_kvec(i))
		return;
	i->iov_offset = state->iov_offset;
	i->count = state->count;
	if (iter_is_ubuf(i))
		return;
	/*
	 * For the *vec iters, nr_segs + iov is constant - if we increment
	 * the vec, then we also decrement the nr_segs count. Hence we don't
	 * need to track both of these, just one is enough and we can deduct
	 * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
	 * size, so we can just increment the iov pointer as they are unionized.
	 * ITER_BVEC _may_ be the same size on some archs, but on others it is
	 * not. Be safe and handle it separately.
	 */
	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
	if (iov_iter_is_bvec(i))
		i->bvec -= state->nr_segs - i->nr_segs;
	else
		i->iov -= state->nr_segs - i->nr_segs;
	i->nr_segs = state->nr_segs;
}

/*
 * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not
 * get references on the pages, nor does it get a pin on them.
1976 */ 1977 static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, 1978 struct page ***pages, size_t maxsize, 1979 unsigned int maxpages, 1980 iov_iter_extraction_t extraction_flags, 1981 size_t *offset0) 1982 { 1983 struct page *page, **p; 1984 unsigned int nr = 0, offset; 1985 loff_t pos = i->xarray_start + i->iov_offset; 1986 pgoff_t index = pos >> PAGE_SHIFT; 1987 XA_STATE(xas, i->xarray, index); 1988 1989 offset = pos & ~PAGE_MASK; 1990 *offset0 = offset; 1991 1992 maxpages = want_pages_array(pages, maxsize, offset, maxpages); 1993 if (!maxpages) 1994 return -ENOMEM; 1995 p = *pages; 1996 1997 rcu_read_lock(); 1998 for (page = xas_load(&xas); page; page = xas_next(&xas)) { 1999 if (xas_retry(&xas, page)) 2000 continue; 2001 2002 /* Has the page moved or been split? */ 2003 if (unlikely(page != xas_reload(&xas))) { 2004 xas_reset(&xas); 2005 continue; 2006 } 2007 2008 p[nr++] = find_subpage(page, xas.xa_index); 2009 if (nr == maxpages) 2010 break; 2011 } 2012 rcu_read_unlock(); 2013 2014 maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); 2015 iov_iter_advance(i, maxsize); 2016 return maxsize; 2017 } 2018 2019 /* 2020 * Extract a list of contiguous pages from an ITER_BVEC iterator. This does 2021 * not get references on the pages, nor does it get a pin on them. 2022 */ 2023 static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, 2024 struct page ***pages, size_t maxsize, 2025 unsigned int maxpages, 2026 iov_iter_extraction_t extraction_flags, 2027 size_t *offset0) 2028 { 2029 struct page **p, *page; 2030 size_t skip = i->iov_offset, offset; 2031 int k; 2032 2033 for (;;) { 2034 if (i->nr_segs == 0) 2035 return 0; 2036 maxsize = min(maxsize, i->bvec->bv_len - skip); 2037 if (maxsize) 2038 break; 2039 i->iov_offset = 0; 2040 i->nr_segs--; 2041 i->bvec++; 2042 skip = 0; 2043 } 2044 2045 skip += i->bvec->bv_offset; 2046 page = i->bvec->bv_page + skip / PAGE_SIZE; 2047 offset = skip % PAGE_SIZE; 2048 *offset0 = offset; 2049 2050 maxpages = want_pages_array(pages, maxsize, offset, maxpages); 2051 if (!maxpages) 2052 return -ENOMEM; 2053 p = *pages; 2054 for (k = 0; k < maxpages; k++) 2055 p[k] = page + k; 2056 2057 maxsize = min_t(size_t, maxsize, maxpages * PAGE_SIZE - offset); 2058 iov_iter_advance(i, maxsize); 2059 return maxsize; 2060 } 2061 2062 /* 2063 * Extract a list of virtually contiguous pages from an ITER_KVEC iterator. 2064 * This does not get references on the pages, nor does it get a pin on them. 
2065 */ 2066 static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i, 2067 struct page ***pages, size_t maxsize, 2068 unsigned int maxpages, 2069 iov_iter_extraction_t extraction_flags, 2070 size_t *offset0) 2071 { 2072 struct page **p, *page; 2073 const void *kaddr; 2074 size_t skip = i->iov_offset, offset, len; 2075 int k; 2076 2077 for (;;) { 2078 if (i->nr_segs == 0) 2079 return 0; 2080 maxsize = min(maxsize, i->kvec->iov_len - skip); 2081 if (maxsize) 2082 break; 2083 i->iov_offset = 0; 2084 i->nr_segs--; 2085 i->kvec++; 2086 skip = 0; 2087 } 2088 2089 kaddr = i->kvec->iov_base + skip; 2090 offset = (unsigned long)kaddr & ~PAGE_MASK; 2091 *offset0 = offset; 2092 2093 maxpages = want_pages_array(pages, maxsize, offset, maxpages); 2094 if (!maxpages) 2095 return -ENOMEM; 2096 p = *pages; 2097 2098 kaddr -= offset; 2099 len = offset + maxsize; 2100 for (k = 0; k < maxpages; k++) { 2101 size_t seg = min_t(size_t, len, PAGE_SIZE); 2102 2103 if (is_vmalloc_or_module_addr(kaddr)) 2104 page = vmalloc_to_page(kaddr); 2105 else 2106 page = virt_to_page(kaddr); 2107 2108 p[k] = page; 2109 len -= seg; 2110 kaddr += PAGE_SIZE; 2111 } 2112 2113 maxsize = min_t(size_t, maxsize, maxpages * PAGE_SIZE - offset); 2114 iov_iter_advance(i, maxsize); 2115 return maxsize; 2116 } 2117 2118 /* 2119 * Extract a list of contiguous pages from a user iterator and get a pin on 2120 * each of them. This should only be used if the iterator is user-backed 2121 * (IOBUF/UBUF). 2122 * 2123 * It does not get refs on the pages, but the pages must be unpinned by the 2124 * caller once the transfer is complete. 2125 * 2126 * This is safe to be used where background IO/DMA *is* going to be modifying 2127 * the buffer; using a pin rather than a ref makes forces fork() to give the 2128 * child a copy of the page. 2129 */ 2130 static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, 2131 struct page ***pages, 2132 size_t maxsize, 2133 unsigned int maxpages, 2134 iov_iter_extraction_t extraction_flags, 2135 size_t *offset0) 2136 { 2137 unsigned long addr; 2138 unsigned int gup_flags = 0; 2139 size_t offset; 2140 int res; 2141 2142 if (i->data_source == ITER_DEST) 2143 gup_flags |= FOLL_WRITE; 2144 if (extraction_flags & ITER_ALLOW_P2PDMA) 2145 gup_flags |= FOLL_PCI_P2PDMA; 2146 if (i->nofault) 2147 gup_flags |= FOLL_NOFAULT; 2148 2149 addr = first_iovec_segment(i, &maxsize); 2150 *offset0 = offset = addr % PAGE_SIZE; 2151 addr &= PAGE_MASK; 2152 maxpages = want_pages_array(pages, maxsize, offset, maxpages); 2153 if (!maxpages) 2154 return -ENOMEM; 2155 res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages); 2156 if (unlikely(res <= 0)) 2157 return res; 2158 maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset); 2159 iov_iter_advance(i, maxsize); 2160 return maxsize; 2161 } 2162 2163 /** 2164 * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator 2165 * @i: The iterator to extract from 2166 * @pages: Where to return the list of pages 2167 * @maxsize: The maximum amount of iterator to extract 2168 * @maxpages: The maximum size of the list of pages 2169 * @extraction_flags: Flags to qualify request 2170 * @offset0: Where to return the starting offset into (*@pages)[0] 2171 * 2172 * Extract a list of contiguous pages from the current point of the iterator, 2173 * advancing the iterator. The maximum number of pages and the maximum amount 2174 * of page contents can be set. 
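 *
 * As noted below, how the pages are released afterwards depends on
 * iov_iter_extract_will_pin(); a minimal cleanup sketch for one extracted
 * page (illustrative only) would be:
 *
 *	if (iov_iter_extract_will_pin(i))
 *		unpin_user_page(page);
 *	// otherwise no ref or pin was taken, so there is nothing to drop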
 *
 * If *@pages is NULL, a page list will be allocated to the required size and
 * *@pages will be set to its base. If *@pages is not NULL, it will be assumed
 * that the caller allocated a page list at least @maxpages in size and this
 * will be filled in.
 *
 * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
 * be allowed on the pages extracted.
 *
 * The iov_iter_extract_will_pin() function can be used to query how cleanup
 * should be performed.
 *
 * Extra refs or pins on the pages may be obtained as follows:
 *
 *  (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be
 *      added to the pages, but refs will not be taken.
 *      iov_iter_extract_will_pin() will return true.
 *
 *  (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are
 *      merely listed; no extra refs or pins are obtained.
 *      iov_iter_extract_will_pin() will return false.
 *
 * Note also:
 *
 *  (*) Use with ITER_DISCARD is not supported as that has no content.
 *
 * On success, the function sets *@pages to the new pagelist, if allocated, and
 * sets *offset0 to the offset into the first page.
 *
 * It may also return -ENOMEM and -EFAULT.
 */
ssize_t iov_iter_extract_pages(struct iov_iter *i,
			       struct page ***pages,
			       size_t maxsize,
			       unsigned int maxpages,
			       iov_iter_extraction_t extraction_flags,
			       size_t *offset0)
{
	maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT);
	if (!maxsize)
		return 0;

	if (likely(user_backed_iter(i)))
		return iov_iter_extract_user_pages(i, pages, maxsize,
						   maxpages, extraction_flags,
						   offset0);
	if (iov_iter_is_kvec(i))
		return iov_iter_extract_kvec_pages(i, pages, maxsize,
						   maxpages, extraction_flags,
						   offset0);
	if (iov_iter_is_bvec(i))
		return iov_iter_extract_bvec_pages(i, pages, maxsize,
						   maxpages, extraction_flags,
						   offset0);
	if (iov_iter_is_xarray(i))
		return iov_iter_extract_xarray_pages(i, pages, maxsize,
						     maxpages, extraction_flags,
						     offset0);
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
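
/*
 * Usage sketch (illustrative only, not an additional API provided by this
 * file): a readv()-style path typically imports the user's iovec array and
 * then copies kernel data into the resulting iterator. Error handling is
 * abbreviated and "data"/"data_len" are made-up names.
 *
 *	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
 *	struct iov_iter iter;
 *	ssize_t ret;
 *
 *	ret = import_iovec(ITER_DEST, uvec, nr_segs, UIO_FASTIOV, &iov, &iter);
 *	if (ret < 0)
 *		return ret;
 *	ret = copy_to_iter(data, data_len, &iter);
 *	kfree(iov);	// safe: *iovp is NULL if the on-stack array was used
 *	return ret;
 */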