// SPDX-License-Identifier: GPL-2.0-only
#include <crypto/hash.h>
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <net/checksum.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>

#define PIPE_PARANOIA /* for now */

/*
 * Iteration primitives: each iterate_* macro walks one iterator flavour,
 * handing (base, len) chunks to the STEP expression.  STEP evaluates to
 * the number of bytes it could NOT process; on exit n is rewritten to
 * the number of bytes actually consumed and i->iov_offset is advanced.
 */

/* covers ubuf and kbuf alike */
#define iterate_buf(i, n, base, len, off, __p, STEP) {		\
	size_t __maybe_unused off = 0;				\
	len = n;						\
	base = __p + i->iov_offset;				\
	len -= (STEP);						\
	i->iov_offset += len;					\
	n = len;						\
}

/* covers iovec and kvec alike */
#define iterate_iovec(i, n, base, len, off, __p, STEP) {	\
	size_t off = 0;						\
	size_t skip = i->iov_offset;				\
	do {							\
		len = min(n, __p->iov_len - skip);		\
		if (likely(len)) {				\
			base = __p->iov_base + skip;		\
			len -= (STEP);				\
			off += len;				\
			skip += len;				\
			n -= len;				\
			if (skip < __p->iov_len)		\
				break;				\
		}						\
		__p++;						\
		skip = 0;					\
	} while (n);						\
	i->iov_offset = skip;					\
	n = off;						\
}

/* each chunk stays within one page: STEP sees a kmap_local mapping */
#define iterate_bvec(i, n, base, len, off, p, STEP) {		\
	size_t off = 0;						\
	unsigned skip = i->iov_offset;				\
	while (n) {						\
		unsigned offset = p->bv_offset + skip;		\
		unsigned left;					\
		void *kaddr = kmap_local_page(p->bv_page +	\
					offset / PAGE_SIZE);	\
		base = kaddr + offset % PAGE_SIZE;		\
		len = min(min(n, (size_t)(p->bv_len - skip)),	\
		     (size_t)(PAGE_SIZE - offset % PAGE_SIZE));	\
		left = (STEP);					\
		kunmap_local(kaddr);				\
		len -= left;					\
		off += len;					\
		skip += len;					\
		if (skip == p->bv_len) {			\
			skip = 0;				\
			p++;					\
		}						\
		n -= left ? n : 0;				\
		n -= len;					\
		if (left)					\
			break;					\
	}							\
	i->iov_offset = skip;					\
	n = off;						\
}

/* walks the folios of i->xarray under RCU; stops early on a short STEP */
#define iterate_xarray(i, n, base, len, __off, STEP) {		\
	__label__ __out;					\
	size_t __off = 0;					\
	struct folio *folio;					\
	loff_t start = i->xarray_start + i->iov_offset;		\
	pgoff_t index = start / PAGE_SIZE;			\
	XA_STATE(xas, i->xarray, index);			\
								\
	len = PAGE_SIZE - offset_in_page(start);		\
	rcu_read_lock();					\
	xas_for_each(&xas, folio, ULONG_MAX) {			\
		unsigned left;					\
		size_t offset;					\
		if (xas_retry(&xas, folio))			\
			continue;				\
		if (WARN_ON(xa_is_value(folio)))		\
			break;					\
		if (WARN_ON(folio_test_hugetlb(folio)))		\
			break;					\
		offset = offset_in_folio(folio, start + __off);	\
		while (offset < folio_size(folio)) {		\
			base = kmap_local_folio(folio, offset);	\
			len = min(n, len);			\
			left = (STEP);				\
			kunmap_local(base);			\
			len -= left;				\
			__off += len;				\
			n -= len;				\
			if (left || n == 0)			\
				goto __out;			\
			offset += len;				\
			len = PAGE_SIZE;			\
		}						\
	}							\
__out:								\
	rcu_read_unlock();					\
	i->iov_offset += __off;					\
	n = __off;						\
}

/* dispatch on iterator type; I = user-copy step, K = kernel-copy step */
#define __iterate_and_advance(i, n, base, len, off, I, K) {	\
	if (unlikely(i->count < n))				\
		n = i->count;					\
	if (likely(n)) {					\
		if (likely(iter_is_ubuf(i))) {			\
			void __user *base;			\
			size_t len;				\
			iterate_buf(i, n, base, len, off,	\
						i->ubuf, (I))	\
		} else if (likely(iter_is_iovec(i))) {		\
			const struct iovec *iov = iter_iov(i);	\
			void __user *base;			\
			size_t len;				\
			iterate_iovec(i, n, base, len, off,	\
						iov, (I))	\
			i->nr_segs -= iov - iter_iov(i);	\
			i->__iov = iov;				\
		} else if (iov_iter_is_bvec(i)) {		\
			const struct bio_vec *bvec = i->bvec;	\
			void *base;				\
			size_t len;				\
			iterate_bvec(i, n, base, len, off,	\
						bvec, (K))	\
			i->nr_segs -= bvec - i->bvec;		\
			i->bvec = bvec;				\
		} else if (iov_iter_is_kvec(i)) {		\
			const struct kvec *kvec = i->kvec;	\
			void *base;				\
			size_t len;				\
			iterate_iovec(i, n, base, len, off,	\
						kvec, (K))	\
			i->nr_segs -= kvec - i->kvec;		\
			i->kvec = kvec;				\
		} else if (iov_iter_is_xarray(i)) {		\
			void *base;				\
			size_t len;				\
			iterate_xarray(i, n, base, len, off,	\
							(K))	\
		}						\
		i->count -= n;					\
	}							\
}
/* same, but the kernel-side step K is forced to "always copies fully" */
#define iterate_and_advance(i, n, base, len, off, I, K) \
	__iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))

/*
 * Copy kernel->user honouring usercopy fault injection.
 * Returns the number of bytes NOT copied (all of @n on injected failure).
 */
static int copyout(void __user *to, const void *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = raw_copy_to_user(to, from, n);
	}
	return n;
}

/*
 * Copy user->kernel honouring usercopy fault injection.
 * Returns the number of bytes NOT copied.
 */
static int copyin(void *to, const void __user *from, size_t n)
{
	size_t res = n;

	if (should_fail_usercopy())
		return n;
	if (access_ok(from, n)) {
		instrument_copy_from_user_before(to, from, n);
		res = raw_copy_from_user(to, from, n);
		instrument_copy_from_user_after(to, from, n, res);
	}
	return res;
}

#ifdef PIPE_PARANOIA
/*
 * Consistency check for an ITER_PIPE iterator: i->head/i->last_offset
 * must agree with the pipe ring state.  Dumps the ring and WARNs on
 * mismatch; returns false so callers can bail out.
 */
static bool sanity(const struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_head = pipe->head;
	unsigned int p_tail = pipe->tail;
	unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
	unsigned int i_head = i->head;
	unsigned int idx;

	if (i->last_offset) {
		struct pipe_buffer *p;
		if (unlikely(p_occupancy == 0))
			goto Bad;	// pipe must be non-empty
		if (unlikely(i_head != p_head - 1))
			goto Bad;	// must be at the last buffer...

		p = pipe_buf(pipe, i_head);
		if (unlikely(p->offset + p->len != abs(i->last_offset)))
			goto Bad;	// ... at the end of segment
	} else {
		if (i_head != p_head)
			goto Bad;	// must be right after the last buffer
	}
	return true;
Bad:
	printk(KERN_ERR "idx = %d, offset = %d\n", i_head, i->last_offset);
	printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
			p_head, p_tail, pipe->ring_size);
	for (idx = 0; idx < pipe->ring_size; idx++)
		printk(KERN_ERR "[%p %p %d %d]\n",
			pipe->bufs[idx].ops,
			pipe->bufs[idx].page,
			pipe->bufs[idx].offset,
			pipe->bufs[idx].len);
	WARN_ON(1);
	return false;
}
#else
#define sanity(i) true
#endif

/*
 * Allocate a fresh anonymous page and install it as a new pipe buffer
 * of @size bytes at the pipe head.  Returns NULL on allocation failure.
 */
static struct page *push_anon(struct pipe_inode_info *pipe, unsigned size)
{
	struct page *page = alloc_page(GFP_USER);
	if (page) {
		struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++);
		*buf = (struct pipe_buffer) {
			.ops = &default_pipe_buf_ops,
			.page = page,
			.offset = 0,
			.len = size
		};
	}
	return page;
}

/*
 * Install an existing (e.g. page-cache) page as a new pipe buffer,
 * taking a reference on it.
 */
static void push_page(struct pipe_inode_info *pipe, struct page *page,
			unsigned int offset, unsigned int size)
{
	struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++);
	*buf = (struct pipe_buffer) {
		.ops = &page_cache_pipe_buf_ops,
		.page = page,
		.offset = offset,
		.len = size
	};
	get_page(page);
}

/*
 * Encode the end position of @buf into i->last_offset's convention:
 * positive for appendable anon buffers, negative for foreign pages.
 */
static inline int last_offset(const struct pipe_buffer *buf)
{
	if (buf->ops == &default_pipe_buf_ops)
		return buf->len;	// buf->offset is 0 for those
	else
		return -(buf->offset + buf->len);
}

/*
 * Make room for up to @size bytes in the pipe: either grow the last
 * (anon, non-full) buffer, or push a fresh anon page.  On success,
 * returns the page and sets *off to where the caller may write;
 * i->count and i->last_offset are already updated for the space given.
 * Returns NULL if the pipe is full or allocation fails.
 */
static struct page *append_pipe(struct iov_iter *i, size_t size,
				unsigned int *off)
{
	struct pipe_inode_info *pipe = i->pipe;
	int offset = i->last_offset;
	struct pipe_buffer *buf;
	struct page *page;

	if (offset > 0 && offset < PAGE_SIZE) {
		// some space in the last buffer; add to it
		buf = pipe_buf(pipe, pipe->head - 1);
		size = min_t(size_t, size, PAGE_SIZE - offset);
		buf->len += size;
		i->last_offset += size;
		i->count -= size;
		*off = offset;
		return buf->page;
	}
	// OK, we need a new buffer
	*off = 0;
	size = min_t(size_t, size, PAGE_SIZE);
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		return NULL;
	page = push_anon(pipe, size);
	if (!page)
		return NULL;
	i->head = pipe->head - 1;
	i->last_offset = size;
	i->count -= size;
	return page;
}

/*
 * Splice @page into the pipe without copying: merge into the last
 * buffer when it is the same page ending exactly at @offset, otherwise
 * push a new page-cache buffer.  Returns bytes accepted (0 if full).
 */
static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int head = pipe->head;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	if (offset && i->last_offset == -offset) { // could we merge it?
		struct pipe_buffer *buf = pipe_buf(pipe, head - 1);
		if (buf->page == page) {
			buf->len += bytes;
			i->last_offset -= bytes;
			i->count -= bytes;
			return bytes;
		}
	}
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		return 0;

	push_page(pipe, page, offset, bytes);
	i->last_offset = -(offset + bytes);
	i->head = head;
	i->count -= bytes;
	return bytes;
}

/*
 * fault_in_iov_iter_readable - fault in iov iterator for reading
 * @i: iterator
 * @size: maximum length
 *
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * @size.  For each iovec, fault in each page that constitutes the iovec.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 *
 * Always returns 0 for non-userspace iterators.
345 */ 346 size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) 347 { 348 if (iter_is_ubuf(i)) { 349 size_t n = min(size, iov_iter_count(i)); 350 n -= fault_in_readable(i->ubuf + i->iov_offset, n); 351 return size - n; 352 } else if (iter_is_iovec(i)) { 353 size_t count = min(size, iov_iter_count(i)); 354 const struct iovec *p; 355 size_t skip; 356 357 size -= count; 358 for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { 359 size_t len = min(count, p->iov_len - skip); 360 size_t ret; 361 362 if (unlikely(!len)) 363 continue; 364 ret = fault_in_readable(p->iov_base + skip, len); 365 count -= len - ret; 366 if (ret) 367 break; 368 } 369 return count + size; 370 } 371 return 0; 372 } 373 EXPORT_SYMBOL(fault_in_iov_iter_readable); 374 375 /* 376 * fault_in_iov_iter_writeable - fault in iov iterator for writing 377 * @i: iterator 378 * @size: maximum length 379 * 380 * Faults in the iterator using get_user_pages(), i.e., without triggering 381 * hardware page faults. This is primarily useful when we already know that 382 * some or all of the pages in @i aren't in memory. 383 * 384 * Returns the number of bytes not faulted in, like copy_to_user() and 385 * copy_from_user(). 386 * 387 * Always returns 0 for non-user-space iterators. 
 */
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
{
	if (iter_is_ubuf(i)) {
		size_t n = min(size, iov_iter_count(i));
		n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n);
		return size - n;
	} else if (iter_is_iovec(i)) {
		size_t count = min(size, iov_iter_count(i));
		const struct iovec *p;
		size_t skip;

		size -= count;
		for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
			size_t len = min(count, p->iov_len - skip);
			size_t ret;

			if (unlikely(!len))
				continue;
			ret = fault_in_safe_writeable(p->iov_base + skip, len);
			count -= len - ret;
			if (ret)
				break;
		}
		return count + size;
	}
	return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_writeable);

/* Initialise @i as a user-backed ITER_IOVEC iterator. */
void iov_iter_init(struct iov_iter *i, unsigned int direction,
			const struct iovec *iov, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter) {
		.iter_type = ITER_IOVEC,
		.nofault = false,
		.user_backed = true,
		.data_source = direction,
		.__iov = iov,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_init);

// returns the offset in partial buffer (if any)
static inline unsigned int pipe_npages(const struct iov_iter *i, int *npages)
{
	struct pipe_inode_info *pipe = i->pipe;
	int used = pipe->head - pipe->tail;
	int off = i->last_offset;

	*npages = max((int)pipe->max_usage - used, 0);

	if (off > 0 && off < PAGE_SIZE) { // anon and not full
		(*npages)++;
		return off;
	}
	return 0;
}

/*
 * Copy @bytes from kernel memory @addr into a pipe-backed iterator,
 * appending buffers as needed; may return short if the pipe fills.
 */
static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
				struct iov_iter *i)
{
	unsigned int off, chunk;

	if (unlikely(bytes > i->count))
		bytes = i->count;
	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	for (size_t n = bytes; n; n -= chunk) {
		struct page *page = append_pipe(i, n, &off);
		chunk = min_t(size_t, n, PAGE_SIZE - off);
		if (!page)
			return bytes - n;
		memcpy_to_page(page, off, addr, chunk);
		addr += chunk;
	}
	return bytes;
}

/* memcpy plus a rolling internet checksum; @off is the running offset
 * of this chunk within the overall transfer (for csum folding). */
static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
			      __wsum sum, size_t off)
{
	__wsum next = csum_partial_copy_nocheck(from, to, len);
	return csum_block_add(sum, next, off);
}

/* Pipe counterpart of csum_and_copy_to_iter(); may return short. */
static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
					 struct iov_iter *i, __wsum *sump)
{
	__wsum sum = *sump;
	size_t off = 0;
	unsigned int chunk, r;

	if (unlikely(bytes > i->count))
		bytes = i->count;
	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	while (bytes) {
		struct page *page = append_pipe(i, bytes, &r);
		char *p;

		if (!page)
			break;
		chunk = min_t(size_t, bytes, PAGE_SIZE - r);
		p = kmap_local_page(page);
		sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off);
		kunmap_local(p);
		off += chunk;
		bytes -= chunk;
	}
	*sump = sum;
	return off;
}

/* Copy kernel memory into any destination iterator. */
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (WARN_ON_ONCE(i->data_source))
		return 0;
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_pipe_to_iter(addr, bytes, i);
	if (user_backed_iter(i))
		might_fault();
	iterate_and_advance(i, bytes, base, len, off,
		copyout(base, addr + off, len),
		memcpy(base, addr + off, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_to_iter);

#ifdef CONFIG_ARCH_HAS_COPY_MC
/* copyout() variant tolerating machine-check on the source side. */
static int copyout_mc(void __user *to, const void *from, size_t n)
{
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = copy_mc_to_user((__force void *) to, from, n);
	}
	return n;
}

/* Pipe destination for _copy_mc_to_iter(); reverts the unwritten tail
 * of the last chunk if a machine-check truncates the copy. */
static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
				struct iov_iter *i)
{
	size_t xfer = 0;
	unsigned int off, chunk;

	if (unlikely(bytes > i->count))
		bytes = i->count;
	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	while (bytes) {
		struct page *page = append_pipe(i, bytes, &off);
		unsigned long rem;
		char *p;

		if (!page)
			break;
		chunk = min_t(size_t, bytes, PAGE_SIZE - off);
		p = kmap_local_page(page);
		rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
		chunk -= rem;
		kunmap_local(p);
		xfer += chunk;
		bytes -= chunk;
		if (rem) {
			iov_iter_revert(i, rem);
			break;
		}
	}
	return xfer;
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and typical _copy_to_iter().
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again. Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 *   a short copy.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (WARN_ON_ONCE(i->data_source))
		return 0;
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_mc_pipe_to_iter(addr, bytes, i);
	if (user_backed_iter(i))
		might_fault();
	/* __iterate_and_advance: the kernel-side step may legitimately
	 * return short here, unlike iterate_and_advance */
	__iterate_and_advance(i, bytes, base, len, off,
		copyout_mc(base, addr + off, len),
		copy_mc_to_kernel(base, addr + off, len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

/* Copy from a source iterator into kernel memory @addr. */
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
	if (WARN_ON_ONCE(!i->data_source))
		return 0;

	if (user_backed_iter(i))
		might_fault();
	iterate_and_advance(i, bytes, base, len, off,
		copyin(addr + off, base, len),
		memcpy(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter);

/* Like _copy_from_iter() but bypasses the CPU cache for the user-copy
 * leg; no might_fault() — meant for atomic-ish contexts. */
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (WARN_ON_ONCE(!i->data_source))
		return 0;

	iterate_and_advance(i, bytes, base, len, off,
		__copy_from_user_inatomic_nocache(addr + off, base, len),
		memcpy(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache. It is differentiated from
 * _copy_from_iter_nocache() in that guarantees all data is flushed for
 * all iterator types.
 * The _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (WARN_ON_ONCE(!i->data_source))
		return 0;

	iterate_and_advance(i, bytes, base, len, off,
		__copy_from_user_flushcache(addr + off, base, len),
		memcpy_flushcache(addr + off, base, len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif

/*
 * Sanity-check that [offset, offset + n) lies within @page (including
 * its compound head, if any).  Returns false (after a WARN) on overrun.
 */
static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
	struct page *head;
	size_t v = n + offset;

	/*
	 * The general case needs to access the page order in order
	 * to compute the page size.
	 * However, we mostly deal with order-0 pages and thus can
	 * avoid a possible cache line miss for requests that fit all
	 * page orders.
	 */
	if (n <= v && v <= PAGE_SIZE)
		return true;

	head = compound_head(page);
	v += (page - head) << PAGE_SHIFT;

	if (WARN_ON(n > v || v > page_size(head)))
		return false;
	return true;
}

/* Copy (part of) @page into @i, one subpage at a time; may copy short. */
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t res = 0;
	if (!page_copy_sane(page, offset, bytes))
		return 0;
	if (WARN_ON_ONCE(i->data_source))
		return 0;
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_page_to_iter_pipe(page, offset, bytes, i);
	page += offset / PAGE_SIZE; // first subpage
	offset %= PAGE_SIZE;
	while (1) {
		void *kaddr = kmap_local_page(page);
		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
		n = _copy_to_iter(kaddr + offset, n, i);
		kunmap_local(kaddr);
		res += n;
		bytes -= n;
		if (!bytes || !n)
			break;
		offset += n;
		if (offset == PAGE_SIZE) {
			page++;
			offset = 0;
		}
	}
	return res;
}
EXPORT_SYMBOL(copy_page_to_iter);

/* Copy from @i into (part of) @page, one subpage at a time. */
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t res = 0;
	if (!page_copy_sane(page, offset, bytes))
		return 0;
	page += offset / PAGE_SIZE; // first subpage
	offset %= PAGE_SIZE;
	while (1) {
		void *kaddr = kmap_local_page(page);
		size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
		n = _copy_from_iter(kaddr + offset, n, i);
		kunmap_local(kaddr);
		res += n;
		bytes -= n;
		if (!bytes || !n)
			break;
		offset += n;
		if (offset == PAGE_SIZE) {
			page++;
			offset = 0;
		}
	}
	return res;
}
EXPORT_SYMBOL(copy_page_from_iter);

/* Append @bytes of zeroes to a pipe-backed iterator; may return short. */
static size_t pipe_zero(size_t bytes, struct iov_iter *i)
{
	unsigned int chunk, off;

	if (unlikely(bytes > i->count))
		bytes = i->count;
	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	for (size_t n = bytes; n; n -= chunk) {
		struct page *page = append_pipe(i, n, &off);
		char *p;

		if (!page)
			return bytes - n;
		chunk = min_t(size_t, n, PAGE_SIZE - off);
		p = kmap_local_page(page);
		memset(p + off, 0, chunk);
		kunmap_local(p);
	}
	return bytes;
}

/* Zero @bytes of the destination iterator. */
size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return pipe_zero(bytes, i);
	iterate_and_advance(i, bytes, base, len, count,
		clear_user(base, len),
		memset(base, 0, len)
	)

	return bytes;
}
EXPORT_SYMBOL(iov_iter_zero);

/*
 * Atomic-context copy from @i into @page at @offset.
 *
 * NOTE(review): kmap_atomic() maps a single page, yet @offset is applied
 * directly to the mapping — an @offset >= PAGE_SIZE (compound page tail)
 * would index past the mapped page on HIGHMEM configs.  Presumably all
 * callers keep offset within the first page — confirm.
 */
size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
				  struct iov_iter *i)
{
	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
	if (!page_copy_sane(page, offset, bytes)) {
		kunmap_atomic(kaddr);
		return 0;
	}
	if (WARN_ON_ONCE(!i->data_source)) {
		kunmap_atomic(kaddr);
		return 0;
	}
	iterate_and_advance(i, bytes, base, len, off,
		copyin(p + off, base, len),
		memcpy(p + off, base, len)
	)
	kunmap_atomic(kaddr);
	return bytes;
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);

/*
 * Advance a pipe iterator by @size bytes: trim the buffer the new end
 * falls into and discard everything in the ring past it.
 */
static void pipe_advance(struct iov_iter *i, size_t size)
{
	struct pipe_inode_info *pipe = i->pipe;
	int off = i->last_offset;

	if (!off && !size) {
		pipe_discard_from(pipe, i->start_head); // discard everything
		return;
	}
	i->count -= size;
	while (1) {
		struct pipe_buffer *buf = pipe_buf(pipe, i->head);
		if (off) /* make it relative to the beginning of buffer */
			size += abs(off) - buf->offset;
		if (size <= buf->len) {
			buf->len = size;
			i->last_offset = last_offset(buf);
			break;
		}
		size -= buf->len;
		i->head++;
		off = 0;
	}
	pipe_discard_from(pipe, i->head + 1); // discard everything past this one
}

/* Advance a bvec iterator, skipping fully-consumed segments. */
static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{
	const struct bio_vec *bvec, *end;

	if (!i->count)
		return;
	i->count -= size;

	size += i->iov_offset;

	for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) {
		if (likely(size < bvec->bv_len))
			break;
		size -= bvec->bv_len;
	}
	i->iov_offset = size;
	i->nr_segs -= bvec - i->bvec;
	i->bvec = bvec;
}

/* Advance an iovec/kvec iterator, skipping fully-consumed segments. */
static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
{
	const struct iovec *iov, *end;

	if (!i->count)
		return;
	i->count -= size;

	size += i->iov_offset; // from beginning of current segment
	for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) {
		if (likely(size < iov->iov_len))
			break;
		size -= iov->iov_len;
	}
	i->iov_offset = size;
	i->nr_segs -= iov - iter_iov(i);
	i->__iov = iov;
}

/* Advance any iterator by at most its remaining count. */
void iov_iter_advance(struct iov_iter *i, size_t size)
{
	if (unlikely(i->count < size))
		size = i->count;
	if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
		i->iov_offset += size;
		i->count -= size;
	} else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
		/* iovec and kvec have identical layouts */
		iov_iter_iovec_advance(i, size);
	} else if (iov_iter_is_bvec(i)) {
		iov_iter_bvec_advance(i, size);
	} else if (iov_iter_is_pipe(i)) {
		pipe_advance(i, size);
	} else if (iov_iter_is_discard(i)) {
		i->count -= size;
	}
}
EXPORT_SYMBOL(iov_iter_advance);

/* Undo the last @unroll bytes of advancement; must not go past the
 * point the iterator was created at (BUG for ubuf/xarray). */
void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
	if (!unroll)
		return;
	if (WARN_ON(unroll > MAX_RW_COUNT))
		return;
	i->count += unroll;
	if (unlikely(iov_iter_is_pipe(i))) {
		struct pipe_inode_info *pipe = i->pipe;
		unsigned int head = pipe->head;

		while (head > i->start_head) {
			struct pipe_buffer *b = pipe_buf(pipe, --head);
			if (unroll < b->len) {
				b->len -= unroll;
				i->last_offset = last_offset(b);
				i->head = head;
				return;
			}
			unroll -= b->len;
			pipe_buf_release(pipe, b);
			pipe->head--;
		}
		i->last_offset = 0;
		i->head = head;
		return;
	}
	if (unlikely(iov_iter_is_discard(i)))
		return;
	if (unroll <= i->iov_offset) {
		i->iov_offset -= unroll;
		return;
	}
	unroll -= i->iov_offset;
	if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
		BUG(); /* We should never go beyond the start of the specified
			* range since we might then be straying into pages that
			* aren't pinned.
			*/
	} else if (iov_iter_is_bvec(i)) {
		const struct bio_vec *bvec = i->bvec;
		while (1) {
			size_t n = (--bvec)->bv_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->bvec = bvec;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	} else { /* same logics for iovec and kvec */
		const struct iovec *iov = iter_iov(i);
		while (1) {
			size_t n = (--iov)->iov_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->__iov = iov;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	}
}
EXPORT_SYMBOL(iov_iter_revert);

/*
 * Return the count of just the current iov_iter segment.
978 */ 979 size_t iov_iter_single_seg_count(const struct iov_iter *i) 980 { 981 if (i->nr_segs > 1) { 982 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) 983 return min(i->count, iter_iov(i)->iov_len - i->iov_offset); 984 if (iov_iter_is_bvec(i)) 985 return min(i->count, i->bvec->bv_len - i->iov_offset); 986 } 987 return i->count; 988 } 989 EXPORT_SYMBOL(iov_iter_single_seg_count); 990 991 void iov_iter_kvec(struct iov_iter *i, unsigned int direction, 992 const struct kvec *kvec, unsigned long nr_segs, 993 size_t count) 994 { 995 WARN_ON(direction & ~(READ | WRITE)); 996 *i = (struct iov_iter){ 997 .iter_type = ITER_KVEC, 998 .data_source = direction, 999 .kvec = kvec, 1000 .nr_segs = nr_segs, 1001 .iov_offset = 0, 1002 .count = count 1003 }; 1004 } 1005 EXPORT_SYMBOL(iov_iter_kvec); 1006 1007 void iov_iter_bvec(struct iov_iter *i, unsigned int direction, 1008 const struct bio_vec *bvec, unsigned long nr_segs, 1009 size_t count) 1010 { 1011 WARN_ON(direction & ~(READ | WRITE)); 1012 *i = (struct iov_iter){ 1013 .iter_type = ITER_BVEC, 1014 .data_source = direction, 1015 .bvec = bvec, 1016 .nr_segs = nr_segs, 1017 .iov_offset = 0, 1018 .count = count 1019 }; 1020 } 1021 EXPORT_SYMBOL(iov_iter_bvec); 1022 1023 void iov_iter_pipe(struct iov_iter *i, unsigned int direction, 1024 struct pipe_inode_info *pipe, 1025 size_t count) 1026 { 1027 BUG_ON(direction != READ); 1028 WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size)); 1029 *i = (struct iov_iter){ 1030 .iter_type = ITER_PIPE, 1031 .data_source = false, 1032 .pipe = pipe, 1033 .head = pipe->head, 1034 .start_head = pipe->head, 1035 .last_offset = 0, 1036 .count = count 1037 }; 1038 } 1039 EXPORT_SYMBOL(iov_iter_pipe); 1040 1041 /** 1042 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray 1043 * @i: The iterator to initialise. 1044 * @direction: The direction of the transfer. 1045 * @xarray: The xarray to access. 1046 * @start: The start file position. 
1047 * @count: The size of the I/O buffer in bytes. 1048 * 1049 * Set up an I/O iterator to either draw data out of the pages attached to an 1050 * inode or to inject data into those pages. The pages *must* be prevented 1051 * from evaporation, either by taking a ref on them or locking them by the 1052 * caller. 1053 */ 1054 void iov_iter_xarray(struct iov_iter *i, unsigned int direction, 1055 struct xarray *xarray, loff_t start, size_t count) 1056 { 1057 BUG_ON(direction & ~1); 1058 *i = (struct iov_iter) { 1059 .iter_type = ITER_XARRAY, 1060 .data_source = direction, 1061 .xarray = xarray, 1062 .xarray_start = start, 1063 .count = count, 1064 .iov_offset = 0 1065 }; 1066 } 1067 EXPORT_SYMBOL(iov_iter_xarray); 1068 1069 /** 1070 * iov_iter_discard - Initialise an I/O iterator that discards data 1071 * @i: The iterator to initialise. 1072 * @direction: The direction of the transfer. 1073 * @count: The size of the I/O buffer in bytes. 1074 * 1075 * Set up an I/O iterator that just discards everything that's written to it. 1076 * It's only available as a READ iterator. 
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
	BUG_ON(direction != READ);
	*i = (struct iov_iter){
		.iter_type = ITER_DISCARD,
		.data_source = false,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_discard);

/* Check every remaining iovec/kvec segment against the masks; only the
 * first i->count bytes are considered (lengths are clamped to @size). */
static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
				   unsigned len_mask)
{
	size_t size = i->count;
	size_t skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		const struct iovec *iov = iter_iov(i) + k;
		size_t len = iov->iov_len - skip;

		if (len > size)
			len = size;
		if (len & len_mask)
			return false;
		if ((unsigned long)(iov->iov_base + skip) & addr_mask)
			return false;

		size -= len;
		if (!size)
			break;
	}
	return true;
}

/* bvec counterpart of iov_iter_aligned_iovec(); checks bv_offset
 * rather than a virtual address against @addr_mask. */
static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
				  unsigned len_mask)
{
	size_t size = i->count;
	unsigned skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->bvec[k].bv_len - skip;

		if (len > size)
			len = size;
		if (len & len_mask)
			return false;
		if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask)
			return false;

		size -= len;
		if (!size)
			break;
	}
	return true;
}

/**
 * iov_iter_is_aligned() - Check if the addresses and lengths of each segments
 *	are aligned to the parameters.
 *
 * @i: &struct iov_iter to restore
 * @addr_mask: bit mask to check against the iov element's addresses
 * @len_mask: bit mask to check against the iov element's lengths
 *
 * Return: false if any addresses or lengths intersect with the provided masks
 */
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
			 unsigned len_mask)
{
	/* UBUF: a single user buffer; just check the one address and count. */
	if (likely(iter_is_ubuf(i))) {
		if (i->count & len_mask)
			return false;
		if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask)
			return false;
		return true;
	}

	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_iter_aligned_iovec(i, addr_mask, len_mask);

	if (iov_iter_is_bvec(i))
		return iov_iter_aligned_bvec(i, addr_mask, len_mask);

	if (iov_iter_is_pipe(i)) {
		size_t size = i->count;

		if (size & len_mask)
			return false;
		/* last_offset > 0 means we'd append within the last buffer */
		if (size && i->last_offset > 0) {
			if (i->last_offset & addr_mask)
				return false;
		}

		return true;
	}

	if (iov_iter_is_xarray(i)) {
		if (i->count & len_mask)
			return false;
		if ((i->xarray_start + i->iov_offset) & addr_mask)
			return false;
	}

	/* ITER_DISCARD (and xarray that passed the checks) counts as aligned. */
	return true;
}
EXPORT_SYMBOL_GPL(iov_iter_is_aligned);

/*
 * OR together the base address and in-range length of every iovec segment
 * covering the first i->count bytes; the caller masks the result to find
 * the effective alignment.
 */
static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
	unsigned long res = 0;
	size_t size = i->count;
	size_t skip = i->iov_offset;	/* only the first segment is skipped into */
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		const struct iovec *iov = iter_iov(i) + k;
		size_t len = iov->iov_len - skip;
		if (len) {
			res |= (unsigned long)iov->iov_base + skip;
			if (len > size)
				len = size;	/* clamp the final segment to i->count */
			res |= len;
			size -= len;
			if (!size)
				break;
		}
	}
	return res;
}

/* Same accumulation as iov_iter_alignment_iovec(), but over bvec offsets. */
static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
{
	unsigned res = 0;
	size_t size = i->count;
	unsigned skip =
i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->bvec[k].bv_len - skip;
		res |= (unsigned long)i->bvec[k].bv_offset + skip;
		if (len > size)
			len = size;
		res |= len;
		size -= len;
		if (!size)
			break;
	}
	return res;
}

/*
 * Return the combined (OR'd) address/length bits of the iterator's segments,
 * used by callers to test worst-case alignment of the whole iterator.
 */
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
	if (likely(iter_is_ubuf(i))) {
		size_t size = i->count;
		if (size)
			return ((unsigned long)i->ubuf + i->iov_offset) | size;
		return 0;
	}

	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_iter_alignment_iovec(i);

	if (iov_iter_is_bvec(i))
		return iov_iter_alignment_bvec(i);

	if (iov_iter_is_pipe(i)) {
		size_t size = i->count;

		if (size && i->last_offset > 0)
			return size | i->last_offset;
		return size;
	}

	if (iov_iter_is_xarray(i))
		return (i->xarray_start + i->iov_offset) | i->count;

	return 0;
}
EXPORT_SYMBOL(iov_iter_alignment);

/*
 * OR each segment's start address with the previous segment's end address,
 * exposing any misaligned "gaps" between consecutive user iovec segments.
 * Only meaningful for ITER_IOVEC; UBUF has no gaps, anything else warns.
 */
unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
	unsigned long res = 0;
	unsigned long v = 0;
	size_t size = i->count;
	unsigned k;

	if (iter_is_ubuf(i))
		return 0;

	if (WARN_ON(!iter_is_iovec(i)))
		return ~0U;

	for (k = 0; k < i->nr_segs; k++) {
		const struct iovec *iov = iter_iov(i) + k;
		if (iov->iov_len) {
			unsigned long base = (unsigned long)iov->iov_base;
			if (v) // if not the first one
				res |= base | v; // this start | previous end
			v = base + iov->iov_len;
			if (size <= iov->iov_len)
				break;
			size -= iov->iov_len;
		}
	}
	return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

/*
 * Work out how many page pointers are needed to cover @size bytes starting
 * @start bytes into a page, capped at @maxpages, and make sure *@res points
 * at an array that large (allocating one if *@res is NULL).  Returns the
 * usable count, or 0 on allocation failure.
 */
static int want_pages_array(struct page ***res, size_t size,
			    size_t start, unsigned int maxpages)
{
	unsigned int count =
DIV_ROUND_UP(size + start, PAGE_SIZE);

	if (count > maxpages)
		count = maxpages;
	WARN_ON(!count); // caller should've prevented that
	if (!*res) {
		*res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
		if (!*res)
			return 0;
	}
	return count;
}

/*
 * Grab references on up to maxpages pipe buffer pages appended to the pipe,
 * storing them in *pages.  *start receives the offset into the first page.
 * Returns the number of bytes covered, -EFAULT if nothing could be appended,
 * or -ENOMEM on array allocation failure.
 */
static ssize_t pipe_get_pages(struct iov_iter *i,
		struct page ***pages, size_t maxsize, unsigned maxpages,
		size_t *start)
{
	unsigned int npages, count, off, chunk;
	struct page **p;
	size_t left;

	if (!sanity(i))
		return -EFAULT;

	*start = off = pipe_npages(i, &npages);
	if (!npages)
		return -EFAULT;
	count = want_pages_array(pages, maxsize, off, min(npages, maxpages));
	if (!count)
		return -ENOMEM;
	p = *pages;
	/* append_pipe() hands back one page at a time; stop when it can't. */
	for (npages = 0, left = maxsize ; npages < count; npages++, left -= chunk) {
		struct page *page = append_pipe(i, left, &off);
		if (!page)
			break;
		chunk = min_t(size_t, left, PAGE_SIZE - off);
		get_page(*p++ = page);
	}
	if (!npages)
		return -EFAULT;
	return maxsize - left;
}

/*
 * Fill @pages with up to @nr_pages referenced pages from @xa starting at
 * @index, skipping xarray retry entries and restarting on pages that moved.
 * Returns the number of pages collected.
 */
static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
					  pgoff_t index, unsigned int nr_pages)
{
	XA_STATE(xas, xa, index);
	struct page *page;
	unsigned int ret = 0;

	rcu_read_lock();
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		if (xas_retry(&xas, page))
			continue;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas))) {
			xas_reset(&xas);
			continue;
		}

		pages[ret] = find_subpage(page, xas.xa_index);
		get_page(pages[ret]);
		if (++ret == nr_pages)
			break;
	}
	rcu_read_unlock();
	return ret;
}

/*
 * Get referenced pages for an ITER_XARRAY iterator and advance it past the
 * bytes covered.  Returns bytes covered, 0 if no pages were found, or
 * -ENOMEM on array allocation failure.
 */
static ssize_t iter_xarray_get_pages(struct iov_iter *i,
				     struct page ***pages, size_t maxsize,
				     unsigned maxpages, size_t *_start_offset)
{
	unsigned nr, offset, count;
	pgoff_t index;
	loff_t pos;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	count = want_pages_array(pages, maxsize, offset, maxpages);
	if (!count)
		return -ENOMEM;
	nr = iter_xarray_populate_pages(*pages, i->xarray, index, count);
	if (nr == 0)
		return 0;

	/* clamp to what the collected pages actually cover */
	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
	i->iov_offset += maxsize;
	i->count -= maxsize;
	return maxsize;
}

/*
 * Return the user address of the first non-empty segment and clamp *size to
 * that segment's remaining length.
 * must be done on non-empty ITER_UBUF or ITER_IOVEC one
 */
static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
{
	size_t skip;
	long k;

	if (iter_is_ubuf(i))
		return (unsigned long)i->ubuf + i->iov_offset;

	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
		const struct iovec *iov = iter_iov(i) + k;
		size_t len = iov->iov_len - skip;

		if (unlikely(!len))
			continue;
		if (*size > len)
			*size = len;
		return (unsigned long)iov->iov_base + skip;
	}
	BUG(); // if it had been empty, we wouldn't get called
}

/*
 * Return the page containing the current position of the first bvec segment,
 * clamping *size to the segment and setting *start to the in-page offset.
 * must be done on non-empty ITER_BVEC one
 */
static struct page *first_bvec_segment(const struct iov_iter *i,
				       size_t *size, size_t *start)
{
	struct page *page;
	size_t skip = i->iov_offset, len;

	len = i->bvec->bv_len - skip;
	if (*size > len)
		*size = len;
	skip +=
i->bvec->bv_offset;
	page = i->bvec->bv_page + skip / PAGE_SIZE;
	*start = skip % PAGE_SIZE;
	return page;
}

/*
 * Common implementation behind iov_iter_get_pages*(): collect referenced
 * pages covering up to maxsize bytes of the iterator and advance it past
 * the bytes covered.  *start receives the offset into the first page.
 * Returns bytes covered, 0 for an empty iterator, or a negative errno.
 */
static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   unsigned int maxpages, size_t *start,
		   iov_iter_extraction_t extraction_flags)
{
	unsigned int n, gup_flags = 0;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;
	if (maxsize > MAX_RW_COUNT)
		maxsize = MAX_RW_COUNT;
	if (extraction_flags & ITER_ALLOW_P2PDMA)
		gup_flags |= FOLL_PCI_P2PDMA;

	if (likely(user_backed_iter(i))) {
		unsigned long addr;
		int res;

		/* reading INTO the pages means we may write to them */
		if (iov_iter_rw(i) != WRITE)
			gup_flags |= FOLL_WRITE;
		if (i->nofault)
			gup_flags |= FOLL_NOFAULT;

		addr = first_iovec_segment(i, &maxsize);
		*start = addr % PAGE_SIZE;
		addr &= PAGE_MASK;
		n = want_pages_array(pages, maxsize, *start, maxpages);
		if (!n)
			return -ENOMEM;
		res = get_user_pages_fast(addr, n, gup_flags, *pages);
		if (unlikely(res <= 0))
			return res;
		/* GUP may have pinned fewer pages than requested */
		maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start);
		iov_iter_advance(i, maxsize);
		return maxsize;
	}
	if (iov_iter_is_bvec(i)) {
		struct page **p;
		struct page *page;

		page = first_bvec_segment(i, &maxsize, start);
		n = want_pages_array(pages, maxsize, *start, maxpages);
		if (!n)
			return -ENOMEM;
		p = *pages;
		for (int k = 0; k < n; k++)
			get_page(p[k] = page + k);
		maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
		i->count -= maxsize;
		i->iov_offset += maxsize;
		/* advance by hand; step to the next bvec if this one is done */
		if (i->iov_offset == i->bvec->bv_len) {
			i->iov_offset = 0;
			i->bvec++;
			i->nr_segs--;
		}
		return maxsize;
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages(i, pages, maxsize, maxpages, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages(i, pages,
maxsize, maxpages, start);
	return -EFAULT;
}

/*
 * Caller-supplied-array variant: *pages is a preallocated array of at least
 * maxpages entries.
 */
ssize_t iov_iter_get_pages(struct iov_iter *i,
		  struct page **pages, size_t maxsize, unsigned maxpages,
		  size_t *start, iov_iter_extraction_t extraction_flags)
{
	if (!maxpages)
		return 0;
	BUG_ON(!pages);

	return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages,
					  start, extraction_flags);
}
EXPORT_SYMBOL_GPL(iov_iter_get_pages);

/* As iov_iter_get_pages(), with no extraction flags. */
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
		size_t maxsize, unsigned maxpages, size_t *start)
{
	return iov_iter_get_pages(i, pages, maxsize, maxpages, start, 0);
}
EXPORT_SYMBOL(iov_iter_get_pages2);

/*
 * Allocating variant: a page array is kvmalloc'd and returned via *pages;
 * it is freed again (and *pages NULLed) on error or empty result.  The
 * caller must kvfree(*pages) after a successful call.
 */
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start, iov_iter_extraction_t extraction_flags)
{
	ssize_t len;

	*pages = NULL;

	len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start,
					 extraction_flags);
	if (len <= 0) {
		kvfree(*pages);
		*pages = NULL;
	}
	return len;
}
EXPORT_SYMBOL_GPL(iov_iter_get_pages_alloc);

/* As iov_iter_get_pages_alloc(), with no extraction flags. */
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
		struct page ***pages, size_t maxsize, size_t *start)
{
	return iov_iter_get_pages_alloc(i, pages, maxsize, start, 0);
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc2);

/*
 * Copy bytes from the (source) iterator into @addr while folding them into
 * the running checksum at *csum.  The step expression yields the number of
 * bytes NOT copied, so a faulting user copy (next == 0) aborts the step.
 */
size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	__wsum sum, next;
	sum = *csum;
	if (WARN_ON_ONCE(!i->data_source))
		return 0;

	iterate_and_advance(i, bytes, base, len, off, ({
		next = csum_and_copy_from_user(base, addr + off, len);
		sum = csum_block_add(sum, next, off);
		next ?
0 : len;
	}), ({
		sum = csum_and_memcpy(addr + off, base, len, sum, off);
	})
	)
	*csum = sum;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_from_iter);

/*
 * Copy bytes from @addr into the (destination) iterator while folding them
 * into the checksum state at @_csstate.  Discard iterators only checksum;
 * pipe iterators take a dedicated path.  As with the _from_ variant, a
 * faulting user copy (next == 0) aborts the step.
 */
size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
			     struct iov_iter *i)
{
	struct csum_state *csstate = _csstate;
	__wsum sum, next;

	if (WARN_ON_ONCE(i->data_source))
		return 0;
	if (unlikely(iov_iter_is_discard(i))) {
		// can't use csum_memcpy() for that one - data is not copied
		csstate->csum = csum_block_add(csstate->csum,
					       csum_partial(addr, bytes, 0),
					       csstate->off);
		csstate->off += bytes;
		return bytes;
	}

	sum = csum_shift(csstate->csum, csstate->off);
	if (unlikely(iov_iter_is_pipe(i)))
		bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum);
	else iterate_and_advance(i, bytes, base, len, off, ({
		next = csum_and_copy_to_user(addr + off, base, len);
		sum = csum_block_add(sum, next, off);
		next ?
0 : len;
	}), ({
		sum = csum_and_memcpy(base, addr + off, len, sum, off);
	})
	)
	csstate->csum = csum_shift(sum, csstate->off);
	csstate->off += bytes;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_to_iter);

/*
 * Copy bytes to the iterator and feed the copied data into the ahash
 * request at @hashp.  Returns bytes copied (0 if CONFIG_CRYPTO_HASH=n).
 * NOTE(review): the return value of crypto_ahash_update() is ignored here;
 * a failing/async update would go unnoticed - confirm this is intended.
 */
size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
		struct iov_iter *i)
{
#ifdef CONFIG_CRYPTO_HASH
	struct ahash_request *hash = hashp;
	struct scatterlist sg;
	size_t copied;

	copied = copy_to_iter(addr, bytes, i);
	sg_init_one(&sg, addr, copied);
	ahash_request_set_crypt(hash, &sg, NULL, copied);
	crypto_ahash_update(hash);
	return copied;
#else
	return 0;
#endif
}
EXPORT_SYMBOL(hash_and_copy_to_iter);

/*
 * Count the pages spanned by the first i->count bytes of an iovec/kvec
 * iterator, saturating at maxpages.
 */
static int iov_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct iovec *p;
	int npages = 0;

	for (p = iter_iov(i); size; skip = 0, p++) {
		unsigned offs = offset_in_page(p->iov_base + skip);
		size_t len = min(p->iov_len - skip, size);

		if (len) {
			size -= len;
			npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
			if (unlikely(npages > maxpages))
				return maxpages;
		}
	}
	return npages;
}

/* As iov_npages(), but for a bvec iterator. */
static int bvec_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct bio_vec *p;
	int npages = 0;

	for (p = i->bvec; size; skip = 0, p++) {
		unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
		size_t len = min(p->bv_len - skip, size);

		size -= len;
		npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
		if (unlikely(npages > maxpages))
			return maxpages;
	}
	return npages;
}

/*
 * Return how many pages the iterator's remaining data spans, capped at
 * @maxpages.  Unknown iterator types (e.g. discard) report 0.
 */
int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
	if (unlikely(!i->count))
		return 0;
	if (likely(iter_is_ubuf(i))) {
		unsigned offs =
offset_in_page(i->ubuf + i->iov_offset);
		int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_npages(i, maxpages);
	if (iov_iter_is_bvec(i))
		return bvec_npages(i, maxpages);
	if (iov_iter_is_pipe(i)) {
		int npages;

		if (!sanity(i))
			return 0;

		pipe_npages(i, &npages);
		return min(npages, maxpages);
	}
	if (iov_iter_is_xarray(i)) {
		unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_npages);

/*
 * Duplicate an iterator, deep-copying its segment array (bvec or
 * iovec/kvec) with kmemdup().  Returns the new segment array, or NULL on
 * allocation failure / unsupported (pipe) iterators.
 */
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
	*new = *old;
	if (unlikely(iov_iter_is_pipe(new))) {
		WARN_ON(1);
		return NULL;
	}
	if (iov_iter_is_bvec(new))
		return new->bvec = kmemdup(new->bvec,
				    new->nr_segs * sizeof(struct bio_vec),
				    flags);
	else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
		/* iovec and kvec have identical layout */
		return new->__iov = kmemdup(new->__iov,
				   new->nr_segs * sizeof(struct iovec),
				   flags);
	return NULL;
}
EXPORT_SYMBOL(dup_iter);

/*
 * Copy nr_segs compat (32-bit) iovecs from userspace into the native iovec
 * array @iov, widening pointers/lengths.  Returns 0, -EFAULT on a faulting
 * access, or -EINVAL on a negative length.
 */
static int copy_compat_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	const struct compat_iovec __user *uiov =
		(const struct compat_iovec __user *)uvec;
	int ret = -EFAULT, i;

	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
		return -EFAULT;

	for (i = 0; i < nr_segs; i++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

		/* check for compat_size_t not fitting in compat_ssize_t .. */
		if (len < 0) {
			ret = -EINVAL;
			goto uaccess_end;
		}
		iov[i].iov_base = compat_ptr(buf);
		iov[i].iov_len = len;
	}

	ret = 0;
uaccess_end:
	user_access_end();
	return ret;
}

/*
 * Copy nr_segs native iovecs from userspace, rejecting any segment whose
 * length is negative when viewed as ssize_t.
 */
static int copy_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	unsigned long seg;

	if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
		return -EFAULT;
	for (seg = 0; seg < nr_segs; seg++) {
		if ((ssize_t)iov[seg].iov_len < 0)
			return -EINVAL;
	}

	return 0;
}

/*
 * Fetch a userspace iovec array into the kernel, using @fast_iov if it is
 * big enough and kmalloc'ing otherwise.  On error the allocated array (if
 * any) is freed and an ERR_PTR is returned; the caller must free a
 * returned array that differs from @fast_iov.
 */
struct iovec *iovec_from_user(const struct iovec __user *uvec,
		unsigned long nr_segs, unsigned long fast_segs,
		struct iovec *fast_iov, bool compat)
{
	struct iovec *iov = fast_iov;
	int ret;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument was
	 * less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return iov;
	if (nr_segs > UIO_MAXIOV)
		return ERR_PTR(-EINVAL);
	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!iov)
			return ERR_PTR(-ENOMEM);
	}

	if (compat)
		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
	else
		ret = copy_iovec_from_user(iov, uvec, nr_segs);
	if (ret) {
		if (iov != fast_iov)
			kfree(iov);
		return ERR_PTR(ret);
	}

	return iov;
}

/*
 * Single segment iovec supplied by the user, import it as ITER_UBUF.
 */
static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec,
				   struct iovec **iovp, struct iov_iter *i,
				   bool compat)
{
	struct iovec *iov = *iovp;
	ssize_t ret;

	if (compat)
		ret = copy_compat_iovec_from_user(iov, uvec, 1);
	else
		ret = copy_iovec_from_user(iov, uvec, 1);
	if (unlikely(ret))
		return ret;

	ret = import_ubuf(type, iov->iov_base, iov->iov_len, i);
	if (unlikely(ret))
		return ret;
	/* nothing was allocated; tell the caller there's nothing to free */
	*iovp = NULL;
	return i->count;
}

/*
 * Worker for import_iovec(): copy in and validate a user iovec array and
 * initialize @i over it.  A single segment is special-cased as ITER_UBUF.
 * On success *iovp is either NULL (caller's fast array was used) or the
 * kmalloc'd array the caller must kfree().
 */
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i, bool compat)
{
	ssize_t total_len = 0;
	unsigned long seg;
	struct iovec *iov;

	if (nr_segs == 1)
		return __import_iovec_ubuf(type, uvec, iovp, i, compat);

	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
	if (IS_ERR(iov)) {
		*iovp = NULL;
		return PTR_ERR(iov);
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL if
	 * an element length is < 0 when cast to ssize_t or if the total length
	 * would overflow the ssize_t return value of the system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	for (seg = 0; seg < nr_segs; seg++) {
		ssize_t len = (ssize_t)iov[seg].iov_len;

		if (!access_ok(iov[seg].iov_base, len)) {
			if (iov != *iovp)
				kfree(iov);
			*iovp = NULL;
			return -EFAULT;
		}

		/* truncate the segment (and the total) at MAX_RW_COUNT */
		if (len > MAX_RW_COUNT - total_len) {
			len = MAX_RW_COUNT - total_len;
			iov[seg].iov_len = len;
		}
		total_len += len;
	}

	iov_iter_init(i, type, iov, nr_segs, total_len);
	if (iov == *iovp)
		*iovp = NULL;
	else
		*iovp = iov;
	return total_len;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in @iov.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iov is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iov on return. Otherwise, a new
 * array will be allocated and the result placed in *@iov. This means that
 * the caller may call kfree() on *@iov regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs,
		 struct iovec **iovp, struct iov_iter *i)
{
	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
			      in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);

/*
 * Validate a single user buffer and initialize @i as ITER_UBUF over it,
 * capping the length at MAX_RW_COUNT.
 * NOTE(review): identical to import_ubuf() below (@iov is unused) - looks
 * like a candidate for consolidation; confirm against remaining callers.
 */
int import_single_range(int rw, void __user *buf, size_t len,
		 struct iovec *iov, struct iov_iter *i)
{
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	if (unlikely(!access_ok(buf, len)))
		return -EFAULT;

	iov_iter_ubuf(i, rw, buf, len);
	return 0;
}
EXPORT_SYMBOL(import_single_range);

/* Validate a single user buffer and initialize @i as ITER_UBUF over it. */
int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
{
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	if (unlikely(!access_ok(buf, len)))
		return -EFAULT;

	iov_iter_ubuf(i, rw, buf, len);
	return 0;
}

/**
 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
 *     iov_iter_save_state() was called.
 *
 * @i: &struct iov_iter to restore
 * @state: state to restore from
 *
 * Used after iov_iter_save_state() to bring restore @i, if operations may
 * have advanced it.
 *
 * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
 */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
	/*
	 * NOTE(review): the ')' placement puts !iov_iter_is_kvec() OUTSIDE the
	 * WARN_ON_ONCE() condition, so a KVEC iterator triggers the warning
	 * yet is still restored (no early return) - confirm this is intended.
	 */
	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
			 !iter_is_ubuf(i)) && !iov_iter_is_kvec(i))
		return;
	i->iov_offset = state->iov_offset;
	i->count = state->count;
	if (iter_is_ubuf(i))
		return;
	/*
	 * For the *vec iters, nr_segs + iov is constant - if we increment
	 * the vec, then we also decrement the nr_segs count. Hence we don't
	 * need to track both of these, just one is enough and we can deduct
	 * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
	 * size, so we can just increment the iov pointer as they are unionzed.
	 * ITER_BVEC _may_ be the same size on some archs, but on others it is
	 * not. Be safe and handle it separately.
	 */
	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
	if (iov_iter_is_bvec(i))
		i->bvec -= state->nr_segs - i->nr_segs;
	else
		i->__iov -= state->nr_segs - i->nr_segs;
	i->nr_segs = state->nr_segs;
}

/*
 * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not
 * get references on the pages, nor does it get a pin on them.
 */
static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
					     struct page ***pages, size_t maxsize,
					     unsigned int maxpages,
					     iov_iter_extraction_t extraction_flags,
					     size_t *offset0)
{
	struct page *page, **p;
	unsigned int nr = 0, offset;
	loff_t pos = i->xarray_start + i->iov_offset;
	pgoff_t index = pos >> PAGE_SHIFT;
	XA_STATE(xas, i->xarray, index);

	offset = pos & ~PAGE_MASK;
	*offset0 = offset;

	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
	if (!maxpages)
		return -ENOMEM;
	p = *pages;

	rcu_read_lock();
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		if (xas_retry(&xas, page))
			continue;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas))) {
			xas_reset(&xas);
			continue;
		}

		p[nr++] = find_subpage(page, xas.xa_index);
		if (nr == maxpages)
			break;
	}
	rcu_read_unlock();

	/* advance past exactly the bytes the listed pages cover */
	maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
	iov_iter_advance(i, maxsize);
	return maxsize;
}

/*
 * Extract a list of contiguous pages from an ITER_BVEC iterator. This does
 * not get references on the pages, nor does it get a pin on them.
 */
static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
					   struct page ***pages, size_t maxsize,
					   unsigned int maxpages,
					   iov_iter_extraction_t extraction_flags,
					   size_t *offset0)
{
	struct page **p, *page;
	size_t skip = i->iov_offset, offset;
	int k;

	/* skip over any fully-consumed or empty bvecs at the front */
	for (;;) {
		if (i->nr_segs == 0)
			return 0;
		maxsize = min(maxsize, i->bvec->bv_len - skip);
		if (maxsize)
			break;
		i->iov_offset = 0;
		i->nr_segs--;
		i->bvec++;
		skip = 0;
	}

	skip += i->bvec->bv_offset;
	page = i->bvec->bv_page + skip / PAGE_SIZE;
	offset = skip % PAGE_SIZE;
	*offset0 = offset;

	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
	if (!maxpages)
		return -ENOMEM;
	p = *pages;
	/* bvec pages are physically contiguous; just list successors */
	for (k = 0; k < maxpages; k++)
		p[k] = page + k;

	maxsize = min_t(size_t, maxsize, maxpages * PAGE_SIZE - offset);
	iov_iter_advance(i, maxsize);
	return maxsize;
}

/*
 * Extract a list of virtually contiguous pages from an ITER_KVEC iterator.
 * This does not get references on the pages, nor does it get a pin on them.
 */
static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i,
					   struct page ***pages, size_t maxsize,
					   unsigned int maxpages,
					   iov_iter_extraction_t extraction_flags,
					   size_t *offset0)
{
	struct page **p, *page;
	const void *kaddr;
	size_t skip = i->iov_offset, offset, len;
	int k;

	/* skip over any fully-consumed or empty kvecs at the front */
	for (;;) {
		if (i->nr_segs == 0)
			return 0;
		maxsize = min(maxsize, i->kvec->iov_len - skip);
		if (maxsize)
			break;
		i->iov_offset = 0;
		i->nr_segs--;
		i->kvec++;
		skip = 0;
	}

	kaddr = i->kvec->iov_base + skip;
	offset = (unsigned long)kaddr & ~PAGE_MASK;
	*offset0 = offset;

	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
	if (!maxpages)
		return -ENOMEM;
	p = *pages;

	/* walk page-aligned, translating each virtual page to its struct page */
	kaddr -= offset;
	len = offset + maxsize;
	for (k = 0; k < maxpages; k++) {
		size_t seg = min_t(size_t, len, PAGE_SIZE);

		if (is_vmalloc_or_module_addr(kaddr))
			page = vmalloc_to_page(kaddr);
		else
			page = virt_to_page(kaddr);

		p[k] = page;
		len -= seg;
		kaddr += PAGE_SIZE;
	}

	maxsize = min_t(size_t, maxsize, maxpages * PAGE_SIZE - offset);
	iov_iter_advance(i, maxsize);
	return maxsize;
}

/*
 * Extract a list of contiguous pages from a user iterator and get a pin on
 * each of them. This should only be used if the iterator is user-backed
 * (IOBUF/UBUF).
 *
 * It does not get refs on the pages, but the pages must be unpinned by the
 * caller once the transfer is complete.
 *
 * This is safe to be used where background IO/DMA *is* going to be modifying
 * the buffer; using a pin rather than a ref makes forces fork() to give the
 * child a copy of the page.
 */
static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
					   struct page ***pages,
					   size_t maxsize,
					   unsigned int maxpages,
					   iov_iter_extraction_t extraction_flags,
					   size_t *offset0)
{
	unsigned long addr;
	unsigned int gup_flags = 0;
	size_t offset;
	int res;

	/* a destination iterator means we will be writing into the pages */
	if (i->data_source == ITER_DEST)
		gup_flags |= FOLL_WRITE;
	if (extraction_flags & ITER_ALLOW_P2PDMA)
		gup_flags |= FOLL_PCI_P2PDMA;
	if (i->nofault)
		gup_flags |= FOLL_NOFAULT;

	addr = first_iovec_segment(i, &maxsize);
	*offset0 = offset = addr % PAGE_SIZE;
	addr &= PAGE_MASK;
	maxpages = want_pages_array(pages, maxsize, offset, maxpages);
	if (!maxpages)
		return -ENOMEM;
	res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages);
	if (unlikely(res <= 0))
		return res;
	/* pin_user_pages_fast() may have pinned fewer pages than asked for */
	maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset);
	iov_iter_advance(i, maxsize);
	return maxsize;
}

/**
 * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
 * @i: The iterator to extract from
 * @pages: Where to return the list of pages
 * @maxsize: The maximum amount of iterator to extract
 * @maxpages: The maximum size of the list of pages
 * @extraction_flags: Flags to qualify request
 * @offset0: Where to return the starting offset into (*@pages)[0]
 *
 * Extract a list of contiguous pages from the current point of the iterator,
 * advancing the iterator.  The maximum number of pages and the maximum amount
 * of page contents can be set.
 *
 * If *@pages is NULL, a page list will be allocated to the required size and
 * *@pages will be set to its base.  If *@pages is not NULL, it will be assumed
 * that the caller allocated a page list at least @maxpages in size and this
 * will be filled in.
 *
 * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
 * be allowed on the pages extracted.
 *
 * The iov_iter_extract_will_pin() function can be used to query how cleanup
 * should be performed.
 *
 * Extra refs or pins on the pages may be obtained as follows:
 *
 *  (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be
 *      added to the pages, but refs will not be taken.
 *      iov_iter_extract_will_pin() will return true.
 *
 *  (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are
 *      merely listed; no extra refs or pins are obtained.
 *      iov_iter_extract_will_pin() will return 0.
 *
 * Note also:
 *
 *  (*) Use with ITER_DISCARD is not supported as that has no content.
 *
 * On success, the function sets *@pages to the new pagelist, if allocated, and
 * sets *offset0 to the offset into the first page.
 *
 * It may also return -ENOMEM and -EFAULT.
 */
ssize_t iov_iter_extract_pages(struct iov_iter *i,
			       struct page ***pages,
			       size_t maxsize,
			       unsigned int maxpages,
			       iov_iter_extraction_t extraction_flags,
			       size_t *offset0)
{
	maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT);
	if (!maxsize)
		return 0;

	if (likely(user_backed_iter(i)))
		return iov_iter_extract_user_pages(i, pages, maxsize,
						   maxpages, extraction_flags,
						   offset0);
	if (iov_iter_is_kvec(i))
		return iov_iter_extract_kvec_pages(i, pages, maxsize,
						   maxpages, extraction_flags,
						   offset0);
	if (iov_iter_is_bvec(i))
		return iov_iter_extract_bvec_pages(i, pages, maxsize,
						   maxpages, extraction_flags,
						   offset0);
	if (iov_iter_is_xarray(i))
		return iov_iter_extract_xarray_pages(i, pages, maxsize,
						     maxpages, extraction_flags,
						     offset0);
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(iov_iter_extract_pages);