// SPDX-License-Identifier: GPL-2.0-only
#include <crypto/hash.h>
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <net/checksum.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>

#define PIPE_PARANOIA /* for now */

/* covers iovec and kvec alike */
#define iterate_iovec(i, n, __v, __off, __p, skip, STEP) {	\
	size_t __off = 0;					\
	do {							\
		__v.iov_len = min(n, __p->iov_len - skip);	\
		if (likely(__v.iov_len)) {			\
			__v.iov_base = __p->iov_base + skip;	\
			__v.iov_len -= (STEP);			\
			__off += __v.iov_len;			\
			skip += __v.iov_len;			\
			n -= __v.iov_len;			\
			if (skip < __p->iov_len)		\
				break;				\
		}						\
		__p++;						\
		skip = 0;					\
	} while (n);						\
	n = __off;						\
}

#define iterate_bvec(i, n, __v, __off, p, skip, STEP) {		\
	size_t __off = 0;					\
	while (n) {						\
		unsigned offset = p->bv_offset + skip;		\
		unsigned left;					\
		void *kaddr = kmap_local_page(p->bv_page +	\
					offset / PAGE_SIZE);	\
		__v.iov_base = kaddr + offset % PAGE_SIZE;	\
		__v.iov_len = min(min(n, p->bv_len - skip),	\
		     (size_t)(PAGE_SIZE - offset % PAGE_SIZE));	\
		left = (STEP);					\
		kunmap_local(kaddr);				\
		__v.iov_len -= left;				\
		__off += __v.iov_len;				\
		skip += __v.iov_len;				\
		if (skip == p->bv_len) {			\
			skip = 0;				\
			p++;					\
		}						\
		n -= __v.iov_len;				\
		if (left)					\
			break;					\
	}							\
	n = __off;						\
}

#define iterate_xarray(i, n, __v, __off, skip, STEP) {		\
	__label__ __out;					\
	size_t __off = 0;					\
	struct page *head = NULL;				\
	size_t seg, offset;					\
	loff_t start = i->xarray_start + skip;			\
	pgoff_t index = start >> PAGE_SHIFT;			\
	int j;							\
								\
	XA_STATE(xas, i->xarray, index);			\
								\
	rcu_read_lock();					\
	xas_for_each(&xas, head, ULONG_MAX) {			\
		unsigned left;					\
		if (xas_retry(&xas, head))			\
			continue;				\
		if (WARN_ON(xa_is_value(head)))			\
			break;					\
		if (WARN_ON(PageHuge(head)))			\
			break;					\
		for (j = (head->index < index) ? index - head->index : 0; \
		     j < thp_nr_pages(head); j++) {		\
			void *kaddr = kmap_local_page(head + j);	\
			offset = (start + __off) % PAGE_SIZE;	\
			__v.iov_base = kaddr + offset;		\
			seg = PAGE_SIZE - offset;		\
			__v.iov_len = min(n, seg);		\
			left = (STEP);				\
			kunmap_local(kaddr);			\
			__v.iov_len -= left;			\
			__off += __v.iov_len;			\
			n -= __v.iov_len;			\
			if (left || n == 0)			\
				goto __out;			\
		}						\
	}							\
__out:								\
	rcu_read_unlock();					\
	skip += __off;						\
	n = __off;						\
}
#define __iterate_and_advance(i, n, v, off, I, K) {		\
	if (unlikely(i->count < n))				\
		n = i->count;					\
	if (likely(n)) {					\
		size_t skip = i->iov_offset;			\
		if (likely(iter_is_iovec(i))) {			\
			const struct iovec *iov = i->iov;	\
			struct iovec v;				\
			iterate_iovec(i, n, v, off, iov, skip, (I))	\
			i->nr_segs -= iov - i->iov;		\
			i->iov = iov;				\
		} else if (iov_iter_is_bvec(i)) {		\
			const struct bio_vec *bvec = i->bvec;	\
			struct kvec v;				\
			iterate_bvec(i, n, v, off, bvec, skip, (K))	\
			i->nr_segs -= bvec - i->bvec;		\
			i->bvec = bvec;				\
		} else if (iov_iter_is_kvec(i)) {		\
			const struct kvec *kvec = i->kvec;	\
			struct kvec v;				\
			iterate_iovec(i, n, v, off, kvec, skip, (K))	\
			i->nr_segs -= kvec - i->kvec;		\
			i->kvec = kvec;				\
		} else if (iov_iter_is_xarray(i)) {		\
			struct kvec v;				\
			iterate_xarray(i, n, v, off, skip, (K))	\
		}						\
		i->count -= n;					\
		i->iov_offset = skip;				\
	}							\
}
#define iterate_and_advance(i, n, v, off, I, K) \
	__iterate_and_advance(i, n, v, off, I, ((void)(K),0))

static int copyout(void __user *to, const void *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = raw_copy_to_user(to, from, n);
	}
	return n;
}

static int copyin(void *to, const void __user *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(from, n)) {
		instrument_copy_from_user(to, from, n);
		n = raw_copy_from_user(to, from, n);
	}
	return n;
}
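/*
 * Illustrative sketch (not part of the kernel API): a helper built on
 * iterate_and_advance() supplies one expression per segment class -- the
 * first for user pointers (iovec), the second for kernel pointers
 * (kvec/bvec/xarray).  The names fill_user()/fill_kernel() below are
 * hypothetical; real users pass copyout()/memcpy()-style expressions, as
 * the *_copy_*_iter() helpers later in this file do.
 *
 *	static size_t fill_iter(u8 pattern, size_t bytes, struct iov_iter *i)
 *	{
 *		iterate_and_advance(i, bytes, v, off,
 *			fill_user(v.iov_base, pattern, v.iov_len),
 *			fill_kernel(v.iov_base, pattern, v.iov_len)
 *		)
 *		return bytes;
 *	}
 */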
static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t skip, copy, left, wanted;
	const struct iovec *iov;
	char __user *buf;
	void *kaddr, *from;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	might_fault();
	wanted = bytes;
	iov = i->iov;
	skip = i->iov_offset;
	buf = iov->iov_base + skip;
	copy = min(bytes, iov->iov_len - skip);

	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
		kaddr = kmap_atomic(page);
		from = kaddr + offset;

		/* first chunk, usually the only one */
		left = copyout(buf, from, copy);
		copy -= left;
		skip += copy;
		from += copy;
		bytes -= copy;

		while (unlikely(!left && bytes)) {
			iov++;
			buf = iov->iov_base;
			copy = min(bytes, iov->iov_len);
			left = copyout(buf, from, copy);
			copy -= left;
			skip = copy;
			from += copy;
			bytes -= copy;
		}
		if (likely(!bytes)) {
			kunmap_atomic(kaddr);
			goto done;
		}
		offset = from - kaddr;
		buf += copy;
		kunmap_atomic(kaddr);
		copy = min(bytes, iov->iov_len - skip);
	}
	/* Too bad - revert to non-atomic kmap */

	kaddr = kmap(page);
	from = kaddr + offset;
	left = copyout(buf, from, copy);
	copy -= left;
	skip += copy;
	from += copy;
	bytes -= copy;
	while (unlikely(!left && bytes)) {
		iov++;
		buf = iov->iov_base;
		copy = min(bytes, iov->iov_len);
		left = copyout(buf, from, copy);
		copy -= left;
		skip = copy;
		from += copy;
		bytes -= copy;
	}
	kunmap(page);

done:
	if (skip == iov->iov_len) {
		iov++;
		skip = 0;
	}
	i->count -= wanted - bytes;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
	i->iov_offset = skip;
	return wanted - bytes;
}
static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t skip, copy, left, wanted;
	const struct iovec *iov;
	char __user *buf;
	void *kaddr, *to;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	might_fault();
	wanted = bytes;
	iov = i->iov;
	skip = i->iov_offset;
	buf = iov->iov_base + skip;
	copy = min(bytes, iov->iov_len - skip);

	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
		kaddr = kmap_atomic(page);
		to = kaddr + offset;

		/* first chunk, usually the only one */
		left = copyin(to, buf, copy);
		copy -= left;
		skip += copy;
		to += copy;
		bytes -= copy;

		while (unlikely(!left && bytes)) {
			iov++;
			buf = iov->iov_base;
			copy = min(bytes, iov->iov_len);
			left = copyin(to, buf, copy);
			copy -= left;
			skip = copy;
			to += copy;
			bytes -= copy;
		}
		if (likely(!bytes)) {
			kunmap_atomic(kaddr);
			goto done;
		}
		offset = to - kaddr;
		buf += copy;
		kunmap_atomic(kaddr);
		copy = min(bytes, iov->iov_len - skip);
	}
	/* Too bad - revert to non-atomic kmap */

	kaddr = kmap(page);
	to = kaddr + offset;
	left = copyin(to, buf, copy);
	copy -= left;
	skip += copy;
	to += copy;
	bytes -= copy;
	while (unlikely(!left && bytes)) {
		iov++;
		buf = iov->iov_base;
		copy = min(bytes, iov->iov_len);
		left = copyin(to, buf, copy);
		copy -= left;
		skip = copy;
		to += copy;
		bytes -= copy;
	}
	kunmap(page);

done:
	if (skip == iov->iov_len) {
		iov++;
		skip = 0;
	}
	i->count -= wanted - bytes;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
	i->iov_offset = skip;
	return wanted - bytes;
}

#ifdef PIPE_PARANOIA
static bool sanity(const struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_head = pipe->head;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
	unsigned int i_head = i->head;
	unsigned int idx;

	if (i->iov_offset) {
		struct pipe_buffer *p;
		if (unlikely(p_occupancy == 0))
			goto Bad;	// pipe must be non-empty
		if (unlikely(i_head != p_head - 1))
			goto Bad;	// must be at the last buffer...

		p = &pipe->bufs[i_head & p_mask];
		if (unlikely(p->offset + p->len != i->iov_offset))
			goto Bad;	// ... at the end of segment
	} else {
		if (i_head != p_head)
			goto Bad;	// must be right after the last buffer
	}
	return true;
Bad:
	printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
	printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
			p_head, p_tail, pipe->ring_size);
	for (idx = 0; idx < pipe->ring_size; idx++)
		printk(KERN_ERR "[%p %p %d %d]\n",
			pipe->bufs[idx].ops,
			pipe->bufs[idx].page,
			pipe->bufs[idx].offset,
			pipe->bufs[idx].len);
	WARN_ON(1);
	return false;
}
#else
#define sanity(i) true
#endif

static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	struct pipe_buffer *buf;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head = i->head;
	size_t off;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	off = i->iov_offset;
	buf = &pipe->bufs[i_head & p_mask];
	if (off) {
		if (offset == off && buf->page == page) {
			/* merge with the last one */
			buf->len += bytes;
			i->iov_offset += bytes;
			goto out;
		}
		i_head++;
		buf = &pipe->bufs[i_head & p_mask];
	}
	if (pipe_full(i_head, p_tail, pipe->max_usage))
		return 0;

	buf->ops = &page_cache_pipe_buf_ops;
	get_page(page);
	buf->page = page;
	buf->offset = offset;
	buf->len = bytes;

	pipe->head = i_head + 1;
	i->iov_offset = offset + bytes;
	i->head = i_head;
out:
	i->count -= bytes;
	return bytes;
}
/*
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * bytes.  For each iovec, fault in each page that constitutes the iovec.
 *
 * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
 * because it is an invalid address).
 */
int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
{
	if (iter_is_iovec(i)) {
		const struct iovec *p;
		size_t skip;

		if (bytes > i->count)
			bytes = i->count;
		for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
			size_t len = min(bytes, p->iov_len - skip);
			int err;

			if (unlikely(!len))
				continue;
			err = fault_in_pages_readable(p->iov_base + skip, len);
			if (unlikely(err))
				return err;
			bytes -= len;
		}
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_fault_in_readable);

void iov_iter_init(struct iov_iter *i, unsigned int direction,
			const struct iovec *iov, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	WARN_ON_ONCE(uaccess_kernel());
	*i = (struct iov_iter) {
		.iter_type = ITER_IOVEC,
		.data_source = direction,
		.iov = iov,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_init);
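/*
 * Illustrative sketch of the usual caller pattern (a simplified, hypothetical
 * write path, not code from this file): fault the user pages in while no
 * locks are held, then copy with page faults disabled and loop on a short
 * copy.  prepare()/commit() stand in for whatever the filesystem does around
 * the copy.
 *
 *	struct iov_iter iter;
 *
 *	iov_iter_init(&iter, WRITE, iov, nr_segs, count);
 *	while (iov_iter_count(&iter)) {
 *		size_t copied;
 *
 *		if (iov_iter_fault_in_readable(&iter, PAGE_SIZE))
 *			return -EFAULT;
 *		prepare(page);
 *		copied = copy_page_from_iter_atomic(page, 0, PAGE_SIZE, &iter);
 *		commit(page, copied);
 *		// a short copy simply goes around the loop and faults the
 *		// missing pages back in before retrying
 *	}
 */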
static inline bool allocated(struct pipe_buffer *buf)
{
	return buf->ops == &default_pipe_buf_ops;
}

static inline void data_start(const struct iov_iter *i,
			      unsigned int *iter_headp, size_t *offp)
{
	unsigned int p_mask = i->pipe->ring_size - 1;
	unsigned int iter_head = i->head;
	size_t off = i->iov_offset;

	if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
		    off == PAGE_SIZE)) {
		iter_head++;
		off = 0;
	}
	*iter_headp = iter_head;
	*offp = off;
}

static size_t push_pipe(struct iov_iter *i, size_t size,
			int *iter_headp, size_t *offp)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_tail = pipe->tail;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int iter_head;
	size_t off;
	ssize_t left;

	if (unlikely(size > i->count))
		size = i->count;
	if (unlikely(!size))
		return 0;

	left = size;
	data_start(i, &iter_head, &off);
	*iter_headp = iter_head;
	*offp = off;
	if (off) {
		left -= PAGE_SIZE - off;
		if (left <= 0) {
			pipe->bufs[iter_head & p_mask].len += size;
			return size;
		}
		pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
		iter_head++;
	}
	while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
		struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
		struct page *page = alloc_page(GFP_USER);
		if (!page)
			break;

		buf->ops = &default_pipe_buf_ops;
		buf->page = page;
		buf->offset = 0;
		buf->len = min_t(ssize_t, left, PAGE_SIZE);
		left -= buf->len;
		iter_head++;
		pipe->head = iter_head;

		if (left == 0)
			return size;
	}
	return size - left;
}

static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
				struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &off);
	if (unlikely(!n))
		return 0;
	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
		i->head = i_head;
		i->iov_offset = off + chunk;
		n -= chunk;
		addr += chunk;
		off = 0;
		i_head++;
	} while (n);
	i->count -= bytes;
	return bytes;
}

static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
			      __wsum sum, size_t off)
{
	__wsum next = csum_partial_copy_nocheck(from, to, len);
	return csum_block_add(sum, next, off);
}

static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
					 struct csum_state *csstate,
					 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	__wsum sum = csstate->csum;
	size_t off = csstate->off;
	unsigned int i_head;
	size_t n, r;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &r);
	if (unlikely(!n))
		return 0;
	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
		char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
		sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
		kunmap_atomic(p);
		i->head = i_head;
		i->iov_offset = r + chunk;
		n -= chunk;
		off += chunk;
		addr += chunk;
		r = 0;
		i_head++;
	} while (n);
	i->count -= bytes;
	csstate->csum = sum;
	csstate->off = off;
	return bytes;
}
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_pipe_to_iter(addr, bytes, i);
	if (iter_is_iovec(i))
		might_fault();
	iterate_and_advance(i, bytes, v, off,
		copyout(v.iov_base, addr + off, v.iov_len),
		memcpy(v.iov_base, addr + off, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_to_iter);

#ifdef CONFIG_ARCH_HAS_COPY_MC
static int copyout_mc(void __user *to, const void *from, size_t n)
{
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = copy_mc_to_user((__force void *) to, from, n);
	}
	return n;
}

static unsigned long copy_mc_to_page(struct page *page, size_t offset,
				     const char *from, size_t len)
{
	unsigned long ret;
	char *to;

	to = kmap_atomic(page);
	ret = copy_mc_to_kernel(to + offset, from, len);
	kunmap_atomic(to);

	return ret;
}

static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
				struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off, xfer = 0;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &off);
	if (unlikely(!n))
		return 0;
	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		unsigned long rem;

		rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page,
				      off, addr, chunk);
		i->head = i_head;
		i->iov_offset = off + chunk - rem;
		xfer += chunk - rem;
		if (rem)
			break;
		n -= chunk;
		addr += chunk;
		off = 0;
		i_head++;
	} while (n);
	i->count -= xfer;
	return xfer;
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @iter: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer).  Upon #MC read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and typical _copy_to_iter() are:
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again.  Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 *   a short copy.
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_mc_pipe_to_iter(addr, bytes, i);
	if (iter_is_iovec(i))
		might_fault();
	__iterate_and_advance(i, bytes, v, off,
		copyout_mc(v.iov_base, addr + off, v.iov_len),
		copy_mc_to_kernel(v.iov_base, addr + off, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */
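/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * read_iter-style handler hands a kernel buffer to whatever the iterator
 * describes -- user memory, kvec, bvec or pipe -- without caring which.
 * my_dev_read_iter() and my_dev_fill() are made-up names.
 *
 *	static ssize_t my_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 *	{
 *		char buf[64];
 *		size_t len = my_dev_fill(buf, sizeof(buf));
 *
 *		// returns the number of bytes actually copied; may be short
 *		// if the destination faults part-way through
 *		return copy_to_iter(buf, len, to);
 *	}
 */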
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	if (iter_is_iovec(i))
		might_fault();
	iterate_and_advance(i, bytes, v, off,
		copyin(addr + off, v.iov_base, v.iov_len),
		memcpy(addr + off, v.iov_base, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter);

size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v, off,
		__copy_from_user_inatomic_nocache(addr + off,
					 v.iov_base, v.iov_len),
		memcpy(addr + off, v.iov_base, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @iter: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache.  It is differentiated from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed
 * for all iterator types.  _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v, off,
		__copy_from_user_flushcache(addr + off, v.iov_base, v.iov_len),
		memcpy_flushcache(addr + off, v.iov_base, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif

static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
	struct page *head;
	size_t v = n + offset;

	/*
	 * The general case needs to access the page order in order
	 * to compute the page size.
	 * However, we mostly deal with order-0 pages and thus can
	 * avoid a possible cache line miss for requests that fit all
	 * page orders.
	 */
	if (n <= v && v <= PAGE_SIZE)
		return true;

	head = compound_head(page);
	v += (page - head) << PAGE_SHIFT;

	if (likely(n <= v && v <= (page_size(head))))
		return true;
	WARN_ON(1);
	return false;
}

static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	if (likely(iter_is_iovec(i)))
		return copy_page_to_iter_iovec(page, offset, bytes, i);
	if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
		void *kaddr = kmap_atomic(page);
		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
		kunmap_atomic(kaddr);
		return wanted;
	}
	if (iov_iter_is_pipe(i))
		return copy_page_to_iter_pipe(page, offset, bytes, i);
	if (unlikely(iov_iter_is_discard(i))) {
		if (unlikely(i->count < bytes))
			bytes = i->count;
		i->count -= bytes;
		return bytes;
	}
	WARN_ON(1);
	return 0;
}

size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	size_t res = 0;
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;
	page += offset / PAGE_SIZE; // first subpage
	offset %= PAGE_SIZE;
	while (1) {
		size_t n = __copy_page_to_iter(page, offset,
				min(bytes, (size_t)PAGE_SIZE - offset), i);
		res += n;
		bytes -= n;
		if (!bytes || !n)
			break;
		offset += n;
		if (offset == PAGE_SIZE) {
			page++;
			offset = 0;
		}
	}
	return res;
}
EXPORT_SYMBOL(copy_page_to_iter);

size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;
	if (likely(iter_is_iovec(i)))
		return copy_page_from_iter_iovec(page, offset, bytes, i);
	if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
		void *kaddr = kmap_atomic(page);
		size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
		kunmap_atomic(kaddr);
		return wanted;
	}
	WARN_ON(1);
	return 0;
}
EXPORT_SYMBOL(copy_page_from_iter);

static size_t pipe_zero(size_t bytes, struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	unsigned int i_head;
	size_t n, off;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &i_head, &off);
	if (unlikely(!n))
		return 0;

	do {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
		i->head = i_head;
		i->iov_offset = off + chunk;
		n -= chunk;
		off = 0;
		i_head++;
	} while (n);
	i->count -= bytes;
	return bytes;
}

size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
	if (unlikely(iov_iter_is_pipe(i)))
		return pipe_zero(bytes, i);
	iterate_and_advance(i, bytes, v, count,
		clear_user(v.iov_base, v.iov_len),
		memset(v.iov_base, 0, v.iov_len)
	)

	return bytes;
}
EXPORT_SYMBOL(iov_iter_zero);
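/*
 * Illustrative sketch (hypothetical caller, simplified from the usual
 * buffered-read pattern): once a page-cache page is up to date, the read
 * path hands it to copy_page_to_iter() and lets the iterator decide how the
 * data reaches user memory, a pipe, or kernel buffers.
 * lookup_uptodate_page() is a made-up helper name.
 *
 *	while (iov_iter_count(to)) {
 *		struct page *page = lookup_uptodate_page(mapping, pos);
 *		size_t off = pos & ~PAGE_MASK;
 *		size_t n = min_t(size_t, PAGE_SIZE - off, iov_iter_count(to));
 *
 *		n = copy_page_to_iter(page, off, n, to);
 *		put_page(page);
 *		pos += n;
 *		if (!n)
 *			break;	// fault or full pipe: return what we have
 *	}
 */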
size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
				  struct iov_iter *i)
{
	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
	if (unlikely(!page_copy_sane(page, offset, bytes))) {
		kunmap_atomic(kaddr);
		return 0;
	}
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		kunmap_atomic(kaddr);
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v, off,
		copyin(p + off, v.iov_base, v.iov_len),
		memcpy(p + off, v.iov_base, v.iov_len)
	)
	kunmap_atomic(kaddr);
	return bytes;
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);

static inline void pipe_truncate(struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_tail = pipe->tail;
	unsigned int p_head = pipe->head;
	unsigned int p_mask = pipe->ring_size - 1;

	if (!pipe_empty(p_head, p_tail)) {
		struct pipe_buffer *buf;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset;

		if (off) {
			buf = &pipe->bufs[i_head & p_mask];
			buf->len = off - buf->offset;
			i_head++;
		}
		while (p_head != i_head) {
			p_head--;
			pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
		}

		pipe->head = p_head;
	}
}

static void pipe_advance(struct iov_iter *i, size_t size)
{
	struct pipe_inode_info *pipe = i->pipe;
	if (size) {
		struct pipe_buffer *buf;
		unsigned int p_mask = pipe->ring_size - 1;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset, left = size;

		if (off) /* make it relative to the beginning of buffer */
			left += off - pipe->bufs[i_head & p_mask].offset;
		while (1) {
			buf = &pipe->bufs[i_head & p_mask];
			if (left <= buf->len)
				break;
			left -= buf->len;
			i_head++;
		}
		i->head = i_head;
		i->iov_offset = buf->offset + left;
	}
	i->count -= size;
	/* ... and discard everything past that point */
	pipe_truncate(i);
}

static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{
	struct bvec_iter bi;

	bi.bi_size = i->count;
	bi.bi_bvec_done = i->iov_offset;
	bi.bi_idx = 0;
	bvec_iter_advance(i->bvec, &bi, size);

	i->bvec += bi.bi_idx;
	i->nr_segs -= bi.bi_idx;
	i->count = bi.bi_size;
	i->iov_offset = bi.bi_bvec_done;
}

static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
{
	const struct iovec *iov, *end;

	if (!i->count)
		return;
	i->count -= size;

	size += i->iov_offset; // from beginning of current segment
	for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
		if (likely(size < iov->iov_len))
			break;
		size -= iov->iov_len;
	}
	i->iov_offset = size;
	i->nr_segs -= iov - i->iov;
	i->iov = iov;
}

void iov_iter_advance(struct iov_iter *i, size_t size)
{
	if (unlikely(i->count < size))
		size = i->count;
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
		/* iovec and kvec have identical layouts */
		iov_iter_iovec_advance(i, size);
	} else if (iov_iter_is_bvec(i)) {
		iov_iter_bvec_advance(i, size);
	} else if (iov_iter_is_pipe(i)) {
		pipe_advance(i, size);
	} else if (unlikely(iov_iter_is_xarray(i))) {
		i->iov_offset += size;
		i->count -= size;
	} else if (iov_iter_is_discard(i)) {
		i->count -= size;
	}
}
EXPORT_SYMBOL(iov_iter_advance);
void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
	if (!unroll)
		return;
	if (WARN_ON(unroll > MAX_RW_COUNT))
		return;
	i->count += unroll;
	if (unlikely(iov_iter_is_pipe(i))) {
		struct pipe_inode_info *pipe = i->pipe;
		unsigned int p_mask = pipe->ring_size - 1;
		unsigned int i_head = i->head;
		size_t off = i->iov_offset;
		while (1) {
			struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
			size_t n = off - b->offset;
			if (unroll < n) {
				off -= unroll;
				break;
			}
			unroll -= n;
			if (!unroll && i_head == i->start_head) {
				off = 0;
				break;
			}
			i_head--;
			b = &pipe->bufs[i_head & p_mask];
			off = b->offset + b->len;
		}
		i->iov_offset = off;
		i->head = i_head;
		pipe_truncate(i);
		return;
	}
	if (unlikely(iov_iter_is_discard(i)))
		return;
	if (unroll <= i->iov_offset) {
		i->iov_offset -= unroll;
		return;
	}
	unroll -= i->iov_offset;
	if (iov_iter_is_xarray(i)) {
		BUG(); /* We should never go beyond the start of the specified
			* range since we might then be straying into pages that
			* aren't pinned.
			*/
	} else if (iov_iter_is_bvec(i)) {
		const struct bio_vec *bvec = i->bvec;
		while (1) {
			size_t n = (--bvec)->bv_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->bvec = bvec;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	} else { /* same logics for iovec and kvec */
		const struct iovec *iov = i->iov;
		while (1) {
			size_t n = (--iov)->iov_len;
			i->nr_segs++;
			if (unroll <= n) {
				i->iov = iov;
				i->iov_offset = n - unroll;
				return;
			}
			unroll -= n;
		}
	}
}
EXPORT_SYMBOL(iov_iter_revert);

/*
 * Return the count of just the current iov_iter segment.
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
	if (i->nr_segs > 1) {
		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
			return min(i->count, i->iov->iov_len - i->iov_offset);
		if (iov_iter_is_bvec(i))
			return min(i->count, i->bvec->bv_len - i->iov_offset);
	}
	return i->count;
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
			const struct kvec *kvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_KVEC,
		.data_source = direction,
		.kvec = kvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_kvec);

void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
			const struct bio_vec *bvec, unsigned long nr_segs,
			size_t count)
{
	WARN_ON(direction & ~(READ | WRITE));
	*i = (struct iov_iter){
		.iter_type = ITER_BVEC,
		.data_source = direction,
		.bvec = bvec,
		.nr_segs = nr_segs,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_bvec);
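/*
 * Illustrative sketch (hypothetical caller): kernel code that wants to feed
 * its own buffers through a read_iter/write_iter interface builds an
 * ITER_KVEC.  The advance/revert pair shows the usual "consume on success,
 * roll back on failure" pattern.  submit() is a made-up stand-in.
 *
 *	struct kvec vec = { .iov_base = buf, .iov_len = len };
 *	struct iov_iter iter;
 *	ssize_t ret;
 *
 *	iov_iter_kvec(&iter, WRITE, &vec, 1, len);
 *	ret = submit(&iter);		// may consume part of the iterator
 *	if (ret < 0)
 *		iov_iter_revert(&iter, len - iov_iter_count(&iter));
 */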
void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
			struct pipe_inode_info *pipe,
			size_t count)
{
	BUG_ON(direction != READ);
	WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
	*i = (struct iov_iter){
		.iter_type = ITER_PIPE,
		.data_source = false,
		.pipe = pipe,
		.head = pipe->head,
		.start_head = pipe->head,
		.iov_offset = 0,
		.count = count
	};
}
EXPORT_SYMBOL(iov_iter_pipe);

/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
		     struct xarray *xarray, loff_t start, size_t count)
{
	BUG_ON(direction & ~1);
	*i = (struct iov_iter) {
		.iter_type = ITER_XARRAY,
		.data_source = direction,
		.xarray = xarray,
		.xarray_start = start,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_xarray);

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that just discards everything that's written to it.
 * It's only available as a READ iterator.
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
	BUG_ON(direction != READ);
	*i = (struct iov_iter){
		.iter_type = ITER_DISCARD,
		.data_source = false,
		.count = count,
		.iov_offset = 0
	};
}
EXPORT_SYMBOL(iov_iter_discard);

static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
	unsigned long res = 0;
	size_t size = i->count;
	size_t skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->iov[k].iov_len - skip;
		if (len) {
			res |= (unsigned long)i->iov[k].iov_base + skip;
			if (len > size)
				len = size;
			res |= len;
			size -= len;
			if (!size)
				break;
		}
	}
	return res;
}

static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
{
	unsigned res = 0;
	size_t size = i->count;
	unsigned skip = i->iov_offset;
	unsigned k;

	for (k = 0; k < i->nr_segs; k++, skip = 0) {
		size_t len = i->bvec[k].bv_len - skip;
		res |= (unsigned long)i->bvec[k].bv_offset + skip;
		if (len > size)
			len = size;
		res |= len;
		size -= len;
		if (!size)
			break;
	}
	return res;
}

unsigned long iov_iter_alignment(const struct iov_iter *i)
{
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_iter_alignment_iovec(i);

	if (iov_iter_is_bvec(i))
		return iov_iter_alignment_bvec(i);

	if (iov_iter_is_pipe(i)) {
		unsigned int p_mask = i->pipe->ring_size - 1;
		size_t size = i->count;

		if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
			return size | i->iov_offset;
		return size;
	}

	if (iov_iter_is_xarray(i))
		return (i->xarray_start + i->iov_offset) | i->count;

	return 0;
}
EXPORT_SYMBOL(iov_iter_alignment);
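/*
 * Illustrative sketch (hypothetical caller): direct-I/O style code commonly
 * uses the OR-of-addresses-and-lengths returned above as a cheap alignment
 * check -- if any base or length has a low bit set, at least one segment is
 * misaligned with respect to the device's block size.
 *
 *	if (iov_iter_alignment(iter) & (bdev_logical_block_size(bdev) - 1))
 *		return -EINVAL;	// or fall back to buffered I/O
 */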
unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
	unsigned long res = 0;
	unsigned long v = 0;
	size_t size = i->count;
	unsigned k;

	if (WARN_ON(!iter_is_iovec(i)))
		return ~0U;

	for (k = 0; k < i->nr_segs; k++) {
		if (i->iov[k].iov_len) {
			unsigned long base = (unsigned long)i->iov[k].iov_base;
			if (v) // if not the first one
				res |= base | v; // this start | previous end
			v = base + i->iov[k].iov_len;
			if (size <= i->iov[k].iov_len)
				break;
			size -= i->iov[k].iov_len;
		}
	}
	return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

static inline ssize_t __pipe_get_pages(struct iov_iter *i,
				size_t maxsize,
				struct page **pages,
				int iter_head,
				size_t *start)
{
	struct pipe_inode_info *pipe = i->pipe;
	unsigned int p_mask = pipe->ring_size - 1;
	ssize_t n = push_pipe(i, maxsize, &iter_head, start);
	if (!n)
		return -EFAULT;

	maxsize = n;
	n += *start;
	while (n > 0) {
		get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
		iter_head++;
		n -= PAGE_SIZE;
	}

	return maxsize;
}

static ssize_t pipe_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	unsigned int iter_head, npages;
	size_t capacity;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	capacity = min(npages, maxpages) * PAGE_SIZE - *start;

	return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
}

static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
					  pgoff_t index, unsigned int nr_pages)
{
	XA_STATE(xas, xa, index);
	struct page *page;
	unsigned int ret = 0;

	rcu_read_lock();
	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
		if (xas_retry(&xas, page))
			continue;

		/* Has the page moved or been split? */
		if (unlikely(page != xas_reload(&xas))) {
			xas_reset(&xas);
			continue;
		}

		pages[ret] = find_subpage(page, xas.xa_index);
		get_page(pages[ret]);
		if (++ret == nr_pages)
			break;
	}
	rcu_read_unlock();
	return ret;
}

static ssize_t iter_xarray_get_pages(struct iov_iter *i,
				     struct page **pages, size_t maxsize,
				     unsigned maxpages, size_t *_start_offset)
{
	unsigned nr, offset;
	pgoff_t index, count;
	size_t size = maxsize, actual;
	loff_t pos;

	if (!size || !maxpages)
		return 0;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	count = 1;
	if (size > PAGE_SIZE - offset) {
		size -= PAGE_SIZE - offset;
		count += size >> PAGE_SHIFT;
		size &= ~PAGE_MASK;
		if (size)
			count++;
	}

	if (count > maxpages)
		count = maxpages;

	nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
	if (nr == 0)
		return 0;

	actual = PAGE_SIZE * nr;
	actual -= offset;
	if (nr == count && size > 0) {
		unsigned last_offset = (nr > 1) ? 0 : offset;
		actual -= PAGE_SIZE - (last_offset + size);
	}
	return actual;
}
/* must be done on non-empty ITER_IOVEC one */
static unsigned long first_iovec_segment(const struct iov_iter *i,
					 size_t *size, size_t *start,
					 size_t maxsize, unsigned maxpages)
{
	size_t skip;
	long k;

	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
		unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
		size_t len = i->iov[k].iov_len - skip;

		if (unlikely(!len))
			continue;
		if (len > maxsize)
			len = maxsize;
		len += (*start = addr % PAGE_SIZE);
		if (len > maxpages * PAGE_SIZE)
			len = maxpages * PAGE_SIZE;
		*size = len;
		return addr & PAGE_MASK;
	}
	BUG(); // if it had been empty, we wouldn't get called
}

/* must be done on non-empty ITER_BVEC one */
static struct page *first_bvec_segment(const struct iov_iter *i,
				       size_t *size, size_t *start,
				       size_t maxsize, unsigned maxpages)
{
	struct page *page;
	size_t skip = i->iov_offset, len;

	len = i->bvec->bv_len - skip;
	if (len > maxsize)
		len = maxsize;
	skip += i->bvec->bv_offset;
	page = i->bvec->bv_page + skip / PAGE_SIZE;
	len += (*start = skip % PAGE_SIZE);
	if (len > maxpages * PAGE_SIZE)
		len = maxpages * PAGE_SIZE;
	*size = len;
	return page;
}

ssize_t iov_iter_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	size_t len;
	int n, res;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;

	if (likely(iter_is_iovec(i))) {
		unsigned long addr;

		addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		res = get_user_pages_fast(addr, n,
				iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
				pages);
		if (unlikely(res < 0))
			return res;
		return (res == n ? len : res * PAGE_SIZE) - *start;
	}
	if (iov_iter_is_bvec(i)) {
		struct page *page;

		page = first_bvec_segment(i, &len, start, maxsize, maxpages);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		while (n--)
			get_page(*pages++ = page++);
		return len - *start;
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages(i, pages, maxsize, maxpages, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
	return -EFAULT;
}
EXPORT_SYMBOL(iov_iter_get_pages);
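/*
 * Illustrative sketch (hypothetical caller): a direct-I/O path pins the pages
 * backing the next chunk of the iterator, hands them to the hardware, and
 * drops the references once the transfer is done.  queue_io() is a made-up
 * stand-in that submits the pages and waits for completion.  Note that the
 * iterator itself is not advanced by iov_iter_get_pages(); the caller does
 * that with iov_iter_advance() once the bytes are consumed.
 *
 *	struct page *pages[16];
 *	size_t off;
 *	ssize_t got;
 *	int k;
 *
 *	got = iov_iter_get_pages(iter, pages, SIZE_MAX, ARRAY_SIZE(pages), &off);
 *	if (got <= 0)
 *		return got ? got : -EFAULT;
 *	queue_io(pages, off, got);
 *	iov_iter_advance(iter, got);
 *	for (k = 0; k < DIV_ROUND_UP(off + got, PAGE_SIZE); k++)
 *		put_page(pages[k]);
 */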
static struct page **get_pages_array(size_t n)
{
	return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
}

static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;
	unsigned int iter_head, npages;
	ssize_t n;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &iter_head, start);
	/* Amount of free space: some of this one + all after this one */
	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
	n = npages * PAGE_SIZE - *start;
	if (maxsize > n)
		maxsize = n;
	else
		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
	p = get_pages_array(npages);
	if (!p)
		return -ENOMEM;
	n = __pipe_get_pages(i, maxsize, p, iter_head, start);
	if (n > 0)
		*pages = p;
	else
		kvfree(p);
	return n;
}

static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
					   struct page ***pages, size_t maxsize,
					   size_t *_start_offset)
{
	struct page **p;
	unsigned nr, offset;
	pgoff_t index, count;
	size_t size = maxsize, actual;
	loff_t pos;

	if (!size)
		return 0;

	pos = i->xarray_start + i->iov_offset;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;
	*_start_offset = offset;

	count = 1;
	if (size > PAGE_SIZE - offset) {
		size -= PAGE_SIZE - offset;
		count += size >> PAGE_SHIFT;
		size &= ~PAGE_MASK;
		if (size)
			count++;
	}

	p = get_pages_array(count);
	if (!p)
		return -ENOMEM;
	*pages = p;

	nr = iter_xarray_populate_pages(p, i->xarray, index, count);
	if (nr == 0)
		return 0;

	actual = PAGE_SIZE * nr;
	actual -= offset;
	if (nr == count && size > 0) {
		unsigned last_offset = (nr > 1) ? 0 : offset;
		actual -= PAGE_SIZE - (last_offset + size);
	}
	return actual;
}

ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;
	size_t len;
	int n, res;

	if (maxsize > i->count)
		maxsize = i->count;
	if (!maxsize)
		return 0;

	if (likely(iter_is_iovec(i))) {
		unsigned long addr;

		addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		p = get_pages_array(n);
		if (!p)
			return -ENOMEM;
		res = get_user_pages_fast(addr, n,
				iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
		if (unlikely(res < 0)) {
			kvfree(p);
			return res;
		}
		*pages = p;
		return (res == n ? len : res * PAGE_SIZE) - *start;
	}
	if (iov_iter_is_bvec(i)) {
		struct page *page;

		page = first_bvec_segment(i, &len, start, maxsize, ~0U);
		n = DIV_ROUND_UP(len, PAGE_SIZE);
		*pages = p = get_pages_array(n);
		if (!p)
			return -ENOMEM;
		while (n--)
			get_page(*p++ = page++);
		return len - *start;
	}
	if (iov_iter_is_pipe(i))
		return pipe_get_pages_alloc(i, pages, maxsize, start);
	if (iov_iter_is_xarray(i))
		return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
	return -EFAULT;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc);
size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
			       struct iov_iter *i)
{
	__wsum sum, next;
	sum = *csum;
	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v, off, ({
		next = csum_and_copy_from_user(v.iov_base,
					       addr + off,
					       v.iov_len);
		if (next)
			sum = csum_block_add(sum, next, off);
		next ? 0 : v.iov_len;
	}), ({
		sum = csum_and_memcpy(addr + off, v.iov_base, v.iov_len,
				      sum, off);
	})
	)
	*csum = sum;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_from_iter);

size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
			     struct iov_iter *i)
{
	struct csum_state *csstate = _csstate;
	__wsum sum, next;

	if (unlikely(iov_iter_is_pipe(i)))
		return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);

	sum = csum_shift(csstate->csum, csstate->off);
	if (unlikely(iov_iter_is_discard(i))) {
		WARN_ON(1);	/* for now */
		return 0;
	}
	iterate_and_advance(i, bytes, v, off, ({
		next = csum_and_copy_to_user(addr + off,
					     v.iov_base,
					     v.iov_len);
		if (next)
			sum = csum_block_add(sum, next, off);
		next ? 0 : v.iov_len;
	}), ({
		sum = csum_and_memcpy(v.iov_base,
				      addr + off,
				      v.iov_len, sum, off);
	})
	)
	csstate->csum = csum_shift(sum, csstate->off);
	csstate->off += bytes;
	return bytes;
}
EXPORT_SYMBOL(csum_and_copy_to_iter);
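/*
 * Illustrative sketch (hypothetical caller, modelled on how networking code
 * uses this interface): the caller accumulates a checksum across several
 * copies by carrying a struct csum_state between calls; the 'off' field keeps
 * the running byte offset so odd-length chunks fold correctly.
 *
 *	struct csum_state csstate = { .csum = 0, .off = 0 };
 *
 *	csum_and_copy_to_iter(hdr, hdr_len, &csstate, iter);
 *	csum_and_copy_to_iter(payload, payload_len, &csstate, iter);
 *	// csstate.csum now covers hdr and payload in iterator order
 */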
size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
		struct iov_iter *i)
{
#ifdef CONFIG_CRYPTO_HASH
	struct ahash_request *hash = hashp;
	struct scatterlist sg;
	size_t copied;

	copied = copy_to_iter(addr, bytes, i);
	sg_init_one(&sg, addr, copied);
	ahash_request_set_crypt(hash, &sg, NULL, copied);
	crypto_ahash_update(hash);
	return copied;
#else
	return 0;
#endif
}
EXPORT_SYMBOL(hash_and_copy_to_iter);

static int iov_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct iovec *p;
	int npages = 0;

	for (p = i->iov; size; skip = 0, p++) {
		unsigned offs = offset_in_page(p->iov_base + skip);
		size_t len = min(p->iov_len - skip, size);

		if (len) {
			size -= len;
			npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
			if (unlikely(npages > maxpages))
				return maxpages;
		}
	}
	return npages;
}

static int bvec_npages(const struct iov_iter *i, int maxpages)
{
	size_t skip = i->iov_offset, size = i->count;
	const struct bio_vec *p;
	int npages = 0;

	for (p = i->bvec; size; skip = 0, p++) {
		unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
		size_t len = min(p->bv_len - skip, size);

		size -= len;
		npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
		if (unlikely(npages > maxpages))
			return maxpages;
	}
	return npages;
}

int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
	if (unlikely(!i->count))
		return 0;
	/* iovec and kvec have identical layouts */
	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
		return iov_npages(i, maxpages);
	if (iov_iter_is_bvec(i))
		return bvec_npages(i, maxpages);
	if (iov_iter_is_pipe(i)) {
		unsigned int iter_head;
		int npages;
		size_t off;

		if (!sanity(i))
			return 0;

		data_start(i, &iter_head, &off);
		/* some of this one + all after this one */
		npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
		return min(npages, maxpages);
	}
	if (iov_iter_is_xarray(i)) {
		unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
		return min(npages, maxpages);
	}
	return 0;
}
EXPORT_SYMBOL(iov_iter_npages);

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
	*new = *old;
	if (unlikely(iov_iter_is_pipe(new))) {
		WARN_ON(1);
		return NULL;
	}
	if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
		return NULL;
	if (iov_iter_is_bvec(new))
		return new->bvec = kmemdup(new->bvec,
				    new->nr_segs * sizeof(struct bio_vec),
				    flags);
	else
		/* iovec and kvec have identical layout */
		return new->iov = kmemdup(new->iov,
				   new->nr_segs * sizeof(struct iovec),
				   flags);
}
EXPORT_SYMBOL(dup_iter);
static int copy_compat_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	const struct compat_iovec __user *uiov =
		(const struct compat_iovec __user *)uvec;
	int ret = -EFAULT, i;

	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
		return -EFAULT;

	for (i = 0; i < nr_segs; i++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

		/* check for compat_size_t not fitting in compat_ssize_t .. */
		if (len < 0) {
			ret = -EINVAL;
			goto uaccess_end;
		}
		iov[i].iov_base = compat_ptr(buf);
		iov[i].iov_len = len;
	}

	ret = 0;
uaccess_end:
	user_access_end();
	return ret;
}

static int copy_iovec_from_user(struct iovec *iov,
		const struct iovec __user *uvec, unsigned long nr_segs)
{
	unsigned long seg;

	if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
		return -EFAULT;
	for (seg = 0; seg < nr_segs; seg++) {
		if ((ssize_t)iov[seg].iov_len < 0)
			return -EINVAL;
	}

	return 0;
}

struct iovec *iovec_from_user(const struct iovec __user *uvec,
		unsigned long nr_segs, unsigned long fast_segs,
		struct iovec *fast_iov, bool compat)
{
	struct iovec *iov = fast_iov;
	int ret;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument was
	 * less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return iov;
	if (nr_segs > UIO_MAXIOV)
		return ERR_PTR(-EINVAL);
	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (!iov)
			return ERR_PTR(-ENOMEM);
	}

	if (compat)
		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
	else
		ret = copy_iovec_from_user(iov, uvec, nr_segs);
	if (ret) {
		if (iov != fast_iov)
			kfree(iov);
		return ERR_PTR(ret);
	}

	return iov;
}
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
		 struct iov_iter *i, bool compat)
{
	ssize_t total_len = 0;
	unsigned long seg;
	struct iovec *iov;

	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
	if (IS_ERR(iov)) {
		*iovp = NULL;
		return PTR_ERR(iov);
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL if
	 * an element length is < 0 when cast to ssize_t or if the total length
	 * would overflow the ssize_t return value of the system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	for (seg = 0; seg < nr_segs; seg++) {
		ssize_t len = (ssize_t)iov[seg].iov_len;

		if (!access_ok(iov[seg].iov_base, len)) {
			if (iov != *iovp)
				kfree(iov);
			*iovp = NULL;
			return -EFAULT;
		}

		if (len > MAX_RW_COUNT - total_len) {
			len = MAX_RW_COUNT - total_len;
			iov[seg].iov_len = len;
		}
		total_len += len;
	}

	iov_iter_init(i, type, iov, nr_segs, total_len);
	if (iov == *iovp)
		*iovp = NULL;
	else
		*iovp = iov;
	return total_len;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in @iov.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iov is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iov on return. Otherwise, a new
 * array will be allocated and the result placed in *@iov. This means that
 * the caller may call kfree() on *@iov regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
		 unsigned nr_segs, unsigned fast_segs,
		 struct iovec **iovp, struct iov_iter *i)
{
	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
			      in_compat_syscall());
}
EXPORT_SYMBOL(import_iovec);

int import_single_range(int rw, void __user *buf, size_t len,
		 struct iovec *iov, struct iov_iter *i)
{
	if (len > MAX_RW_COUNT)
		len = MAX_RW_COUNT;
	if (unlikely(!access_ok(buf, len)))
		return -EFAULT;

	iov->iov_base = buf;
	iov->iov_len = len;
	iov_iter_init(i, rw, iov, 1, len);
	return 0;
}
EXPORT_SYMBOL(import_single_range);
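/*
 * Illustrative sketch (hypothetical syscall glue, not part of this file):
 * a readv()-style entry point imports the user iovec array, drives the
 * file's read_iter method with the resulting iterator, and frees whatever
 * import_iovec() may have allocated.  do_read() is a made-up stand-in.
 *
 *	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
 *	struct iov_iter iter;
 *	ssize_t ret;
 *
 *	ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
 *			   &iov, &iter);
 *	if (ret < 0)
 *		return ret;
 *	ret = do_read(file, &iter);
 *	kfree(iov);	// safe whether or not a heap array was allocated
 *	return ret;
 */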