// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/secretmem.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>

#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

struct follow_page_context {
	struct dev_pagemap *pgmap;
	unsigned int page_mask;
};

static inline void sanity_check_pinned_pages(struct page **pages,
					     unsigned long npages)
{
	if (!IS_ENABLED(CONFIG_DEBUG_VM))
		return;

	/*
	 * We only pin anonymous pages if they are exclusive. Once pinned, we
	 * can no longer turn them possibly shared and PageAnonExclusive() will
	 * stick around until the page is freed.
	 *
	 * We'd like to verify that our pinned anonymous pages are still mapped
	 * exclusively. The issue with anon THP is that we don't know how
	 * they are/were mapped when pinning them. However, for anon
	 * THP we can assume that either the given page (PTE-mapped THP) or
	 * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
	 * neither is the case, there is certainly something wrong.
	 */
	for (; npages; npages--, pages++) {
		struct page *page = *pages;
		struct folio *folio = page_folio(page);

		if (is_zero_page(page) ||
		    !folio_test_anon(folio))
			continue;
		if (!folio_test_large(folio) || folio_test_hugetlb(folio))
			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
		else
			/* Either a PTE-mapped or a PMD-mapped THP. */
			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
				       !PageAnonExclusive(page), page);
	}
}

/*
 * Return the folio with ref appropriately incremented,
 * or NULL if that failed.
 */
static inline struct folio *try_get_folio(struct page *page, int refs)
{
	struct folio *folio;

retry:
	folio = page_folio(page);
	if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
		return NULL;
	if (unlikely(!folio_ref_try_add_rcu(folio, refs)))
		return NULL;

	/*
	 * At this point we have a stable reference to the folio; but it
	 * could be that between calling page_folio() and the refcount
	 * increment, the folio was split, in which case we'd end up
	 * holding a reference on a folio that has nothing to do with the page
	 * we were given anymore.
	 * So now that the folio is stable, recheck that the page still
	 * belongs to this folio.
	 */
	if (unlikely(page_folio(page) != folio)) {
		if (!put_devmap_managed_page_refs(&folio->page, refs))
			folio_put_refs(folio, refs);
		goto retry;
	}

	return folio;
}

/**
 * try_grab_folio() - Attempt to get or pin a folio.
 * @page:  pointer to page to be grabbed
 * @refs:  the value to (effectively) add to the folio's refcount
 * @flags: gup flags: these are the FOLL_* flag values.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount."
 *
 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
 * same time. (That's true throughout the get_user_pages*() and
 * pin_user_pages*() APIs.) Cases:
 *
 *    FOLL_GET: folio's refcount will be incremented by @refs.
 *
 *    FOLL_PIN on large folios: folio's refcount will be incremented by
 *    @refs, and its pincount will be incremented by @refs.
 *
 *    FOLL_PIN on single-page folios: folio's refcount will be incremented by
 *    @refs * GUP_PIN_COUNTING_BIAS.
 *
 * Return: The folio containing @page (with refcount appropriately
 * incremented) for success, or NULL upon failure. If neither FOLL_GET
 * nor FOLL_PIN was set, that's considered failure, and furthermore,
 * a likely bug in the caller, so a warning is also emitted.
 */
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
{
	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
		return NULL;

	if (flags & FOLL_GET)
		return try_get_folio(page, refs);
	else if (flags & FOLL_PIN) {
		struct folio *folio;

		/*
		 * Don't take a pin on the zero page - it's not going anywhere
		 * and it is used in a *lot* of places.
		 */
		if (is_zero_page(page))
			return page_folio(page);

		/*
		 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
		 * right zone, so fail and let the caller fall back to the slow
		 * path.
		 */
		if (unlikely((flags & FOLL_LONGTERM) &&
			     !is_longterm_pinnable_page(page)))
			return NULL;

		/*
		 * CAUTION: Don't use compound_head() on the page before this
		 * point, the result won't be stable.
		 */
		folio = try_get_folio(page, refs);
		if (!folio)
			return NULL;

		/*
		 * When pinning a large folio, use an exact count to track it.
		 *
		 * However, be sure to *also* increment the normal folio
		 * refcount field at least once, so that the folio really
		 * is pinned. That's why the refcount from the earlier
		 * try_get_folio() is left intact.
		 */
		if (folio_test_large(folio))
			atomic_add(refs, &folio->_pincount);
		else
			folio_ref_add(folio,
				      refs * (GUP_PIN_COUNTING_BIAS - 1));
		/*
		 * Adjust the pincount before re-checking the PTE for changes.
		 * This is essentially a smp_mb() and is paired with a memory
		 * barrier in page_try_share_anon_rmap().
		 */
		smp_mb__after_atomic();

		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);

		return folio;
	}

	WARN_ON_ONCE(1);
	return NULL;
}

static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
	if (flags & FOLL_PIN) {
		if (is_zero_folio(folio))
			return;
		node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
		if (folio_test_large(folio))
			atomic_sub(refs, &folio->_pincount);
		else
			refs *= GUP_PIN_COUNTING_BIAS;
	}

	if (!put_devmap_managed_page_refs(&folio->page, refs))
		folio_put_refs(folio, refs);
}

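/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * how the FOLL_PIN accounting described above looks from a caller's point of
 * view. The function name and the user-address parameter are made up; the
 * called APIs (pin_user_pages_fast(), folio_maybe_dma_pinned(),
 * unpin_user_page()) are real.
 */
static void __maybe_unused gup_pin_accounting_sketch(unsigned long uaddr)
{
	struct page *page;
	struct folio *folio;

	/* pin_user_pages_fast() implies FOLL_PIN. */
	if (pin_user_pages_fast(uaddr, 1, FOLL_WRITE, &page) != 1)
		return;

	folio = page_folio(page);
	/*
	 * For a single-page folio the pin was recorded by adding
	 * GUP_PIN_COUNTING_BIAS to the refcount; for a large folio it sits in
	 * folio->_pincount. Either way it is visible here:
	 */
	WARN_ON_ONCE(!folio_maybe_dma_pinned(folio));

	/* Must be paired with unpin_user_page(), not put_page(). */
	unpin_user_page(page);
}
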
/**
 * try_grab_page() - elevate a page's refcount by a flag-dependent amount
 * @page:  pointer to page to be grabbed
 * @flags: gup flags: these are the FOLL_* flag values.
 *
 * This might not do anything at all, depending on the flags argument.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount."
 *
 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
 * time. Cases: please see the try_grab_folio() documentation, with
 * "refs=1".
 *
 * Return: 0 for success, or if no action was required (if neither FOLL_PIN
 * nor FOLL_GET was set, nothing is done). A negative error code for failure:
 *
 *   -ENOMEM		FOLL_GET or FOLL_PIN was set, but the page could not
 *			be grabbed.
 */
int __must_check try_grab_page(struct page *page, unsigned int flags)
{
	struct folio *folio = page_folio(page);

	if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
		return -ENOMEM;

	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
		return -EREMOTEIO;

	if (flags & FOLL_GET)
		folio_ref_inc(folio);
	else if (flags & FOLL_PIN) {
		/*
		 * Don't take a pin on the zero page - it's not going anywhere
		 * and it is used in a *lot* of places.
		 */
		if (is_zero_page(page))
			return 0;

		/*
		 * Similar to try_grab_folio(): be sure to *also*
		 * increment the normal page refcount field at least once,
		 * so that the page really is pinned.
		 */
		if (folio_test_large(folio)) {
			folio_ref_add(folio, 1);
			atomic_add(1, &folio->_pincount);
		} else {
			folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
		}

		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
	}

	return 0;
}

/**
 * unpin_user_page() - release a dma-pinned page
 * @page:            pointer to page to be released
 *
 * Pages that were pinned via pin_user_pages*() must be released via either
 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 * that such pages can be separately tracked and uniquely handled. In
 * particular, interactions with RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
	sanity_check_pinned_pages(&page, 1);
	gup_put_folio(page_folio(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);

/**
 * folio_add_pin - Try to get an additional pin on a pinned folio
 * @folio: The folio to be pinned
 *
 * Get an additional pin on a folio we already have a pin on. Makes no change
 * if the folio is a zero_page.
 */
void folio_add_pin(struct folio *folio)
{
	if (is_zero_folio(folio))
		return;

	/*
	 * Similar to try_grab_folio(): be sure to *also* increment the normal
	 * page refcount field at least once, so that the page really is
	 * pinned.
	 */
	if (folio_test_large(folio)) {
		WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
		folio_ref_inc(folio);
		atomic_inc(&folio->_pincount);
	} else {
		WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS);
		folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
	}
}

static inline struct folio *gup_folio_range_next(struct page *start,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct page *next = nth_page(start, i);
	struct folio *folio = page_folio(next);
	unsigned int nr = 1;

	if (folio_test_large(folio))
		nr = min_t(unsigned int, npages - i,
			   folio_nr_pages(folio) - folio_page_idx(folio, next));

	*ntails = nr;
	return folio;
}

static inline struct folio *gup_folio_next(struct page **list,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct folio *folio = page_folio(list[i]);
	unsigned int nr;

	for (nr = i + 1; nr < npages; nr++) {
		if (page_folio(list[nr]) != folio)
			break;
	}

	*ntails = nr - i;
	return folio;
}

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page" refers to a page that has had one of the pin_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if @make_dirty is true, and if the page was previously
 * listed as clean. In any case, releases all pages using unpin_user_page(),
 * possibly via unpin_user_pages(), for the non-dirty case.
 *
 * Please see the unpin_user_page() documentation for details.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty_lock(), unpin_user_page().
 *
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
				 bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	if (!make_dirty) {
		unpin_user_pages(pages, npages);
		return;
	}

	sanity_check_pinned_pages(pages, npages);
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		/*
		 * Checking PageDirty at this point may race with
		 * clear_page_dirty_for_io(), but that's OK. Two key
		 * cases:
		 *
		 * 1) This code sees the page as already dirty, so it
		 * skips the call to set_page_dirty(). That could happen
		 * because clear_page_dirty_for_io() called
		 * page_mkclean(), followed by set_page_dirty().
		 * However, now the page is going to get written back,
		 * which meets the original intention of setting it
		 * dirty, so all is well: clear_page_dirty_for_io() goes
		 * on to call TestClearPageDirty(), and write the page
		 * back.
		 *
		 * 2) This code sees the page as clean, so it calls
		 * set_page_dirty(). The page stays dirty, despite being
		 * written back, so it gets written back again in the
		 * next writeback cycle. This is harmless.
		 */
		if (!folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);

/**
 * unpin_user_page_range_dirty_lock() - release and optionally dirty
 * gup-pinned page range
 *
 * @page:  the starting page of a range maybe marked dirty, and definitely released.
 * @npages: number of consecutive pages to release.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page range" refers to a range of pages that has had one of the
 * pin_user_pages() variants called on that page.
 *
 * For the page ranges defined by [page .. page+npages], make that range (or
 * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
 * page range was previously listed as clean.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty_lock(), unpin_user_page().
 *
 */
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
				      bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_range_next(page, npages, i, &nr);
		if (make_dirty && !folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);

static void unpin_user_pages_lockless(struct page **pages, unsigned long npages)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	/*
	 * Don't perform any sanity checks because we might have raced with
	 * fork() and some anonymous pages might now actually be shared --
	 * which is why we're unpinning after all.
	 */
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, release the page using unpin_user_page().
 *
 * Please see the unpin_user_page() documentation for details.
 */
void unpin_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	/*
	 * If this WARN_ON() fires, then the system *might* be leaking pages (by
	 * leaving them pinned), but probably not. More likely, gup/pup returned
	 * a hard -ERRNO error to the caller, who erroneously passed it here.
	 */
	if (WARN_ON(IS_ERR_VALUE(npages)))
		return;

	sanity_check_pinned_pages(pages, npages);
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_pages);

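/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * the usual life cycle behind the unpin_user_pages*() comments above, as seen
 * from an RDMA/direct-IO style driver. The function name
 * receive_into_user_buffer_sketch() and the device step are made up; the gup
 * calls are real.
 */
static int __maybe_unused receive_into_user_buffer_sketch(unsigned long uaddr,
							  int npages,
							  struct page **pages)
{
	int pinned;

	/* The device will write into the buffer, and the pin is long lived. */
	pinned = pin_user_pages_fast(uaddr, npages,
				     FOLL_WRITE | FOLL_LONGTERM, pages);
	if (pinned < 0)
		return pinned;
	if (pinned != npages) {
		/* Partial pin: release what we got and report failure. */
		unpin_user_pages(pages, pinned);
		return -EFAULT;
	}

	/* ... hand pages[] to the device and wait for it to fill them ... */

	/*
	 * The device modified the page contents: dirty them while unpinning,
	 * which is exactly what unpin_user_pages_dirty_lock() is for.
	 */
	unpin_user_pages_dirty_lock(pages, npages, true);
	return 0;
}
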
/*
 * Set the MMF_HAS_PINNED flag if not set yet; after set it'll be there for the
 * mm's lifecycle. Avoid setting the bit unless necessary, or it might cause
 * write cache bouncing on large SMP machines for concurrent pinned gups.
 */
static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
{
	if (!test_bit(MMF_HAS_PINNED, mm_flags))
		set_bit(MMF_HAS_PINNED, mm_flags);
}

#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
		unsigned int flags)
{
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate unnecessary pages or
	 * page tables. Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) &&
	    (vma_is_anonymous(vma) || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return NULL;
}

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	if (flags & FOLL_TOUCH) {
		pte_t entry = *pte;

		if (flags & FOLL_WRITE)
			entry = pte_mkdirty(entry);
		entry = pte_mkyoung(entry);

		if (!pte_same(*pte, entry)) {
			set_pte_at(vma->vm_mm, address, pte, entry);
			update_mmu_cache(vma, address, pte);
		}
	}

	/* Proper page table entry exists, but no corresponding struct page */
	return -EEXIST;
}

/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
static inline bool can_follow_write_pte(pte_t pte, struct page *page,
					struct vm_area_struct *vma,
					unsigned int flags)
{
	/* If the pte is writable, we can write to the page. */
	if (pte_write(pte))
		return true;

	/* Maybe FOLL_FORCE is set to override it? */
	if (!(flags & FOLL_FORCE))
		return false;

	/* But FOLL_FORCE has no effect on shared mappings */
	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
		return false;

	/* ... or read-only private ones */
	if (!(vma->vm_flags & VM_MAYWRITE))
		return false;

	/* ... or already writable ones that just need to take a write fault */
	if (vma->vm_flags & VM_WRITE)
		return false;

	/*
	 * See can_change_pte_writable(): we broke COW and could map the page
	 * writable if we have an exclusive anonymous page ...
	 */
	if (!page || !PageAnon(page) || !PageAnonExclusive(page))
		return false;

	/* ... and a write-fault isn't required for other reasons. */
	if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
		return false;
	return !userfaultfd_pte_wp(vma, pte);
}

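/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * the classic consumer of the FOLL_FORCE rules checked above is ptrace-style
 * access to another task's read-only private mapping, e.g. placing a
 * breakpoint. Such a caller goes through access_process_vm(), which
 * eventually reaches this COW-breaking path. The function name
 * poke_text_sketch() and its arguments are made up.
 */
static int __maybe_unused poke_text_sketch(struct task_struct *child,
					   unsigned long addr,
					   void *insn, int len)
{
	/*
	 * FOLL_FORCE | FOLL_WRITE on a private, read-only mapping triggers
	 * the COW/unshare handling that can_follow_write_pte() describes.
	 */
	if (access_process_vm(child, addr, insn, len,
			      FOLL_FORCE | FOLL_WRITE) != len)
		return -EIO;
	return 0;
}
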
static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags,
		struct dev_pagemap **pgmap)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;
	int ret;

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return ERR_PTR(-EINVAL);
	if (unlikely(pmd_bad(*pmd)))
		return no_page_table(vma, flags);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!pte_present(pte))
		goto no_page;
	if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
		goto no_page;

	page = vm_normal_page(vma, address, pte);

	/*
	 * We only care about anon pages in can_follow_write_pte() and don't
	 * have to worry about pte_devmap() because they are never anon.
	 */
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pte(pte, page, vma, flags)) {
		page = NULL;
		goto out;
	}

	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
		/*
		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
		 * case since they are only valid while holding the pgmap
		 * reference.
		 */
		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
		if (*pgmap)
			page = pte_page(pte);
		else
			goto no_page;
	} else if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}

	if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
		page = ERR_PTR(-EMLINK);
		goto out;
	}

	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
		       !PageAnonExclusive(page), page);

	/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
	ret = try_grab_page(page, flags);
	if (unlikely(ret)) {
		page = ERR_PTR(ret);
		goto out;
	}

	/*
	 * We need to make the page accessible if and only if we are going
	 * to access its content (the FOLL_PIN case). Please see
	 * Documentation/core-api/pin_user_pages.rst for details.
	 */
	if (flags & FOLL_PIN) {
		ret = arch_make_page_accessible(page);
		if (ret) {
			unpin_user_page(page);
			page = ERR_PTR(ret);
			goto out;
		}
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}

static struct page *follow_pmd_mask(struct vm_area_struct *vma,
				    unsigned long address, pud_t *pudp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pmd_t *pmd, pmdval;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pmd = pmd_offset(pudp, address);
	/*
	 * The READ_ONCE() will stabilize the pmdval in a register or
	 * on the stack so that it will stop changing under the code.
	 */
	pmdval = READ_ONCE(*pmd);
	if (pmd_none(pmdval))
		return no_page_table(vma, flags);
	if (!pmd_present(pmdval))
		return no_page_table(vma, flags);
	if (pmd_devmap(pmdval)) {
		ptl = pmd_lock(mm, pmd);
		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (likely(!pmd_trans_huge(pmdval)))
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

	if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags))
		return no_page_table(vma, flags);

	ptl = pmd_lock(mm, pmd);
	if (unlikely(!pmd_present(*pmd))) {
		spin_unlock(ptl);
		return no_page_table(vma, flags);
	}
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	if (flags & FOLL_SPLIT_PMD) {
		int ret;
		page = pmd_page(*pmd);
		if (is_huge_zero_page(page)) {
			spin_unlock(ptl);
			ret = 0;
			split_huge_pmd(vma, pmd, address);
			if (pmd_trans_unstable(pmd))
				ret = -EBUSY;
		} else {
			spin_unlock(ptl);
			split_huge_pmd(vma, pmd, address);
			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
		}

		return ret ? ERR_PTR(ret) :
			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	page = follow_trans_huge_pmd(vma, address, pmd, flags);
	spin_unlock(ptl);
	ctx->page_mask = HPAGE_PMD_NR - 1;
	return page;
}

static struct page *follow_pud_mask(struct vm_area_struct *vma,
				    unsigned long address, p4d_t *p4dp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pud_t *pud;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pud = pud_offset(p4dp, address);
	if (pud_none(*pud))
		return no_page_table(vma, flags);
	if (pud_devmap(*pud)) {
		ptl = pud_lock(mm, pud);
		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (unlikely(pud_bad(*pud)))
		return no_page_table(vma, flags);

	return follow_pmd_mask(vma, address, pud, flags, ctx);
}

static struct page *follow_p4d_mask(struct vm_area_struct *vma,
				    unsigned long address, pgd_t *pgdp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	p4d_t *p4d;

	p4d = p4d_offset(pgdp, address);
	if (p4d_none(*p4d))
		return no_page_table(vma, flags);
	BUILD_BUG_ON(p4d_huge(*p4d));
	if (unlikely(p4d_bad(*p4d)))
		return no_page_table(vma, flags);

	return follow_pud_mask(vma, address, p4d, flags, ctx);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 *       pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 *
 * When getting an anonymous page and the caller has to trigger unsharing
 * of a shared anonymous page first, -EMLINK is returned. The caller should
 * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
 * relevant with FOLL_PIN and !FOLL_WRITE.
 *
 * On output, the @ctx->page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      struct follow_page_context *ctx)
{
	pgd_t *pgd;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	ctx->page_mask = 0;

	/*
	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
	 * special hugetlb page table walking code. This eliminates the
	 * need to check for hugetlb entries in the general walking code.
	 *
	 * hugetlb_follow_page_mask is only for follow_page() handling here.
	 * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
	 */
	if (is_vm_hugetlb_page(vma)) {
		page = hugetlb_follow_page_mask(vma, address, flags);
		if (!page)
			page = no_page_table(vma, flags);
		return page;
	}

	pgd = pgd_offset(mm, address);

	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		return no_page_table(vma, flags);

	return follow_p4d_mask(vma, address, pgd, flags, ctx);
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			 unsigned int foll_flags)
{
	struct follow_page_context ctx = { NULL };
	struct page *page;

	if (vma_is_secretmem(vma))
		return NULL;

	if (WARN_ON_ONCE(foll_flags & FOLL_PIN))
		return NULL;

	page = follow_page_mask(vma, address, foll_flags, &ctx);
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return page;
}

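/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * how a follow_page() caller typically looks. Callers such as the migration
 * code hold the mmap_lock, ask for FOLL_GET so the returned page reference is
 * stable, and drop it with put_page(). The function name is made up.
 */
static bool __maybe_unused addr_has_mapped_page_sketch(
		struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;

	mmap_assert_locked(vma->vm_mm);

	/* FOLL_GET: take a reference; FOLL_DUMP: reject the zero page. */
	page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
	if (IS_ERR_OR_NULL(page))
		return false;

	/* ... inspect the page here ... then drop the FOLL_GET reference. */
	put_page(page);
	return true;
}
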
static int get_gate_page(struct mm_struct *mm, unsigned long address,
		unsigned int gup_flags, struct vm_area_struct **vma,
		struct page **page)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret = -EFAULT;

	/* user gate pages are read-only */
	if (gup_flags & FOLL_WRITE)
		return -EFAULT;
	if (address > TASK_SIZE)
		pgd = pgd_offset_k(address);
	else
		pgd = pgd_offset_gate(mm, address);
	if (pgd_none(*pgd))
		return -EFAULT;
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return -EFAULT;
	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return -EFAULT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return -EFAULT;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	pte = pte_offset_map(pmd, address);
	if (pte_none(*pte))
		goto unmap;
	*vma = get_gate_vma(mm);
	if (!page)
		goto out;
	*page = vm_normal_page(*vma, address, *pte);
	if (!*page) {
		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
			goto unmap;
		*page = pte_page(*pte);
	}
	ret = try_grab_page(*page, gup_flags);
	if (unlikely(ret))
		goto unmap;
out:
	ret = 0;
unmap:
	pte_unmap(pte);
	return ret;
}

/*
 * mmap_lock must be held on entry. If @flags has FOLL_UNLOCKABLE but not
 * FOLL_NOWAIT, the mmap_lock may be released. If it is, *@locked will be set
 * to 0 and -EBUSY returned.
 */
static int faultin_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int *flags, bool unshare,
		int *locked)
{
	unsigned int fault_flags = 0;
	vm_fault_t ret;

	if (*flags & FOLL_NOFAULT)
		return -EFAULT;
	if (*flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (*flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (*flags & FOLL_UNLOCKABLE) {
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
		/*
		 * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
		 * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE.
		 * That's because some callers may not be prepared to
		 * handle early exits caused by non-fatal signals.
		 */
		if (*flags & FOLL_INTERRUPTIBLE)
			fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
	}
	if (*flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (*flags & FOLL_TRIED) {
		/*
		 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
		 * can co-exist
		 */
		fault_flags |= FAULT_FLAG_TRIED;
	}
	if (unshare) {
		fault_flags |= FAULT_FLAG_UNSHARE;
		/* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
		VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
	}

	ret = handle_mm_fault(vma, address, fault_flags, NULL);

	if (ret & VM_FAULT_COMPLETED) {
		/*
		 * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
		 * mmap lock in the page fault handler. Sanity check this.
		 */
		WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
		*locked = 0;

		/*
		 * We should do the same as VM_FAULT_RETRY, but let's not
		 * return -EBUSY since that's not reflecting the reality of
		 * what has happened - we've just fully completed a page
		 * fault, with the mmap lock released. Use -EAGAIN to show
		 * that we want to take the mmap lock _again_.
		 */
		return -EAGAIN;
	}

	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, *flags);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
			*locked = 0;
		return -EBUSY;
	}

	return 0;
}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
	vm_flags_t vm_flags = vma->vm_flags;
	int write = (gup_flags & FOLL_WRITE);
	int foreign = (gup_flags & FOLL_REMOTE);

	if (vm_flags & (VM_IO | VM_PFNMAP))
		return -EFAULT;

	if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
		return -EFAULT;

	if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
		return -EOPNOTSUPP;

	if (vma_is_secretmem(vma))
		return -EFAULT;

	if (write) {
		if (!(vm_flags & VM_WRITE)) {
			if (!(gup_flags & FOLL_FORCE))
				return -EFAULT;
			/* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
			if (is_vm_hugetlb_page(vma))
				return -EFAULT;
			/*
			 * We used to let the write,force case do COW in a
			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
			 * set a breakpoint in a read-only mapping of an
			 * executable, without corrupting the file (yet only
			 * when that file had been opened for writing!).
			 * Anon pages in shared mappings are surprising: now
			 * just reject it.
			 */
			if (!is_cow_mapping(vm_flags))
				return -EFAULT;
		}
	} else if (!(vm_flags & VM_READ)) {
		if (!(gup_flags & FOLL_FORCE))
			return -EFAULT;
		/*
		 * Is there actually any vma we can reach here which does not
		 * have VM_MAYREAD set?
		 */
		if (!(vm_flags & VM_MAYREAD))
			return -EFAULT;
	}
	/*
	 * gups are always data accesses, not instruction
	 * fetches, so execute=false here
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return -EFAULT;
	return 0;
}

/**
 * __get_user_pages() - pin user pages in memory
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @locked:	whether we're still with the mmap_lock held
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 * -- 0 return value is possible when the fault would need to be retried.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * @vmas are valid only as long as mmap_lock is held.
 *
 * Must be called with mmap_lock held. It may be released. See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may
 * be released. If this happens *@locked will be set to 0 on return.
 *
 * A caller using such a combination of @gup_flags must therefore hold the
 * mmap_lock for reading only, and recognize when it's been released. Otherwise,
 * it must be held for either reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
static long __get_user_pages(struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *locked)
{
	long ret = 0, i = 0;
	struct vm_area_struct *vma = NULL;
	struct follow_page_context ctx = { NULL };

	if (!nr_pages)
		return 0;

	start = untagged_addr_remote(mm, start);

	VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

	do {
		struct page *page;
		unsigned int foll_flags = gup_flags;
		unsigned int page_increm;

		/* first iteration or cross vma bound */
		if (!vma || start >= vma->vm_end) {
			vma = find_extend_vma(mm, start);
			if (!vma && in_gate_area(mm, start)) {
				ret = get_gate_page(mm, start & PAGE_MASK,
						gup_flags, &vma,
						pages ? &pages[i] : NULL);
				if (ret)
					goto out;
				ctx.page_mask = 0;
				goto next_page;
			}

			if (!vma) {
				ret = -EFAULT;
				goto out;
			}
			ret = check_vma_flags(vma, gup_flags);
			if (ret)
				goto out;

			if (is_vm_hugetlb_page(vma)) {
				i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &nr_pages, i,
						gup_flags, locked);
				if (!*locked) {
					/*
					 * We've got a VM_FAULT_RETRY
					 * and we've lost mmap_lock.
					 * We must stop here.
					 */
					BUG_ON(gup_flags & FOLL_NOWAIT);
					goto out;
				}
				continue;
			}
		}
retry:
		/*
		 * If we have a pending SIGKILL, don't keep faulting pages and
		 * potentially allocating memory.
		 */
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();

		page = follow_page_mask(vma, start, foll_flags, &ctx);
		if (!page || PTR_ERR(page) == -EMLINK) {
			ret = faultin_page(vma, start, &foll_flags,
					   PTR_ERR(page) == -EMLINK, locked);
			switch (ret) {
			case 0:
				goto retry;
			case -EBUSY:
			case -EAGAIN:
				ret = 0;
				fallthrough;
			case -EFAULT:
			case -ENOMEM:
			case -EHWPOISON:
				goto out;
			}
			BUG();
		} else if (PTR_ERR(page) == -EEXIST) {
			/*
			 * Proper page table entry exists, but no corresponding
			 * struct page. If the caller expects **pages to be
			 * filled in, bail out now, because that can't be done
			 * for this page.
			 */
			if (pages) {
				ret = PTR_ERR(page);
				goto out;
			}

			goto next_page;
		} else if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}
		if (pages) {
			pages[i] = page;
			flush_anon_page(vma, page, start);
			flush_dcache_page(page);
			ctx.page_mask = 0;
		}
next_page:
		if (vmas) {
			vmas[i] = vma;
			ctx.page_mask = 0;
		}
		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
		if (page_increm > nr_pages)
			page_increm = nr_pages;
		i += page_increm;
		start += page_increm * PAGE_SIZE;
		nr_pages -= page_increm;
	} while (nr_pages);
out:
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return i ? i : ret;
}
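
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * because __get_user_pages() and its public wrappers may return fewer pages
 * than requested, callers generally loop until the whole range is resolved or
 * an error is returned. The function name gup_all_or_error_sketch() is made
 * up; get_user_pages_unlocked() is a real public wrapper.
 */
static long __maybe_unused gup_all_or_error_sketch(unsigned long start,
						   unsigned long nr_pages,
						   struct page **pages)
{
	unsigned long got = 0;

	while (got < nr_pages) {
		long ret = get_user_pages_unlocked(start + got * PAGE_SIZE,
						   nr_pages - got,
						   pages + got, FOLL_WRITE);
		if (ret <= 0) {
			/* Release whatever was already grabbed. */
			while (got)
				put_page(pages[--got]);
			return ret ? ret : -EFAULT;
		}
		got += ret;
	}
	return got;
}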

static bool vma_permits_fault(struct vm_area_struct *vma,
			      unsigned int fault_flags)
{
	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;

	if (!(vm_flags & vma->vm_flags))
		return false;

	/*
	 * The architecture might have a hardware protection
	 * mechanism other than read/write that can deny access.
	 *
	 * gup always represents data access, not instruction
	 * fetches, so execute=false here:
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return false;

	return true;
}

/**
 * fixup_user_fault() - manually resolve a user page fault
 * @mm:		mm_struct of target mm
 * @address:	user address
 * @fault_flags:flags to pass down to handle_mm_fault()
 * @unlocked:	did we unlock the mmap_lock while retrying, maybe NULL if caller
 *		does not allow retry. If NULL, the caller must guarantee
 *		that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
 *
 * This is meant to be called in the specific scenario where for locking reasons
 * we try to access user memory in atomic context (within a pagefault_disable()
 * section), this returns -EFAULT, and we want to resolve the user fault before
 * trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software. On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This function will not return with an unlocked mmap_lock. So it does not
 * have the same semantics wrt the @mm->mmap_lock as does filemap_fault().
 */
int fixup_user_fault(struct mm_struct *mm,
		     unsigned long address, unsigned int fault_flags,
		     bool *unlocked)
{
	struct vm_area_struct *vma;
	vm_fault_t ret;

	address = untagged_addr_remote(mm, address);

	if (unlocked)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

retry:
	vma = find_extend_vma(mm, address);
	if (!vma || address < vma->vm_start)
		return -EFAULT;

	if (!vma_permits_fault(vma, fault_flags))
		return -EFAULT;

	if ((fault_flags & FAULT_FLAG_KILLABLE) &&
	    fatal_signal_pending(current))
		return -EINTR;

	ret = handle_mm_fault(vma, address, fault_flags, NULL);

	if (ret & VM_FAULT_COMPLETED) {
		/*
		 * NOTE: it's a pity that we need to retake the lock here
		 * to pair with the unlock() in the callers. Ideally we
		 * could tell the callers so they do not need to unlock.
		 */
		mmap_read_lock(mm);
		*unlocked = true;
		return 0;
	}

	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, 0);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		mmap_read_lock(mm);
		*unlocked = true;
		fault_flags |= FAULT_FLAG_TRIED;
		goto retry;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(fixup_user_fault);

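/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * the futex-style caller mentioned in the fixup_user_fault() comment above.
 * The value is accessed with page faults disabled; if that fails, the fault
 * is resolved explicitly and the access retried. The function name is made
 * up; it closely mirrors what fault_in_user_writeable() does (futex words
 * must be writable, hence FAULT_FLAG_WRITE even for a read).
 */
static int __maybe_unused get_futex_value_sketch(u32 __user *uaddr, u32 *val)
{
	struct mm_struct *mm = current->mm;
	int ret;

	for (;;) {
		pagefault_disable();
		ret = __get_user(*val, uaddr);
		pagefault_enable();
		if (!ret)
			return 0;

		/* Resolve the fault outside the atomic section, then retry. */
		mmap_read_lock(mm);
		ret = fixup_user_fault(mm, (unsigned long)uaddr,
				       FAULT_FLAG_WRITE, NULL);
		mmap_read_unlock(mm);
		if (ret)
			return ret;
	}
}
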
/*
 * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is
 * specified, it'll also respond to generic signals. The caller of GUP
 * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
 */
static bool gup_signal_pending(unsigned int flags)
{
	if (fatal_signal_pending(current))
		return true;

	if (!(flags & FOLL_INTERRUPTIBLE))
		return false;

	return signal_pending(current);
}

/*
 * Locking: (*locked == 1) means that the mmap_lock has already been acquired by
 * the caller. This function may drop the mmap_lock. If it does so, then it will
 * set (*locked = 0).
 *
 * (*locked == 0) means that the caller expects this function to acquire and
 * drop the mmap_lock. Therefore, the value of *locked will still be zero when
 * the function returns, even though it may have changed temporarily during
 * function execution.
 *
 * Please note that this function, unlike __get_user_pages(), will not return 0
 * for nr_pages > 0, unless FOLL_NOWAIT is used.
 */
static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
						unsigned long start,
						unsigned long nr_pages,
						struct page **pages,
						struct vm_area_struct **vmas,
						int *locked,
						unsigned int flags)
{
	long ret, pages_done;
	bool must_unlock = false;

	/*
	 * The internal caller expects GUP to manage the lock internally and the
	 * lock must be released when this returns.
	 */
	if (!*locked) {
		if (mmap_read_lock_killable(mm))
			return -EAGAIN;
		must_unlock = true;
		*locked = 1;
	}
	else
		mmap_assert_locked(mm);

	if (flags & FOLL_PIN)
		mm_set_has_pinned_flag(&mm->flags);

	/*
	 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
	 * is to set FOLL_GET if the caller wants pages[] filled in (but has
	 * carelessly failed to specify FOLL_GET), so keep doing that, but only
	 * for FOLL_GET, not for the newer FOLL_PIN.
	 *
	 * FOLL_PIN always expects pages to be non-null, but no need to assert
	 * that here, as any failures will be obvious enough.
	 */
	if (pages && !(flags & FOLL_PIN))
		flags |= FOLL_GET;

	pages_done = 0;
	for (;;) {
		ret = __get_user_pages(mm, start, nr_pages, flags, pages,
				       vmas, locked);
		if (!(flags & FOLL_UNLOCKABLE)) {
			/* VM_FAULT_RETRY couldn't trigger, bypass */
			pages_done = ret;
			break;
		}

		/* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
		if (!*locked) {
			BUG_ON(ret < 0);
			BUG_ON(ret >= nr_pages);
		}

		if (ret > 0) {
			nr_pages -= ret;
			pages_done += ret;
			if (!nr_pages)
				break;
		}
		if (*locked) {
			/*
			 * VM_FAULT_RETRY didn't trigger or it was a
			 * FOLL_NOWAIT.
			 */
			if (!pages_done)
				pages_done = ret;
			break;
		}
		/*
		 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
		 * For the prefault case (!pages) we only update counts.
		 */
		if (likely(pages))
			pages += ret;
		start += ret << PAGE_SHIFT;

		/* The lock was temporarily dropped, so we must unlock later */
		must_unlock = true;

retry:
		/*
		 * Repeat on the address that fired VM_FAULT_RETRY
		 * with both FAULT_FLAG_ALLOW_RETRY and
		 * FAULT_FLAG_TRIED. Note that GUP can be interrupted
		 * by fatal signals of even common signals, depending on
		 * the caller's request. So we need to check it before we
		 * start trying again otherwise it can loop forever.
		 */
		if (gup_signal_pending(flags)) {
			if (!pages_done)
				pages_done = -EINTR;
			break;
		}

		ret = mmap_read_lock_killable(mm);
		if (ret) {
			BUG_ON(ret > 0);
			if (!pages_done)
				pages_done = ret;
			break;
		}

		*locked = 1;
		ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
				       pages, NULL, locked);
		if (!*locked) {
			/* Continue to retry until we succeeded */
			BUG_ON(ret != 0);
			goto retry;
		}
		if (ret != 1) {
			BUG_ON(ret > 1);
			if (!pages_done)
				pages_done = ret;
			break;
		}
		nr_pages--;
		pages_done++;
		if (!nr_pages)
			break;
		if (likely(pages))
			pages++;
		start += PAGE_SIZE;
	}
	if (must_unlock && *locked) {
		/*
		 * We either temporarily dropped the lock, or the caller
		 * requested that we both acquire and drop the lock. Either way,
		 * we must now unlock, and notify the caller of that state.
		 */
		mmap_read_unlock(mm);
		*locked = 0;
	}
	return pages_done;
}

/**
 * populate_vma_page_range() -  populate a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @locked: whether the mmap_lock is still held
 *
 * This takes care of mlocking the pages too if VM_LOCKED is set.
 *
 * Return either number of pages pinned in the vma, or a negative error
 * code on error.
 *
 * vma->vm_mm->mmap_lock must be held.
 *
 * If @locked is NULL, it may be held for read or write and will
 * be unperturbed.
 *
 * If @locked is non-NULL, it must be held for read only and may be
 * released. If it's released, *@locked will be set to 0.
 */
long populate_vma_page_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end, int *locked)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long nr_pages = (end - start) / PAGE_SIZE;
	int local_locked = 1;
	int gup_flags;
	long ret;

	VM_BUG_ON(!PAGE_ALIGNED(start));
	VM_BUG_ON(!PAGE_ALIGNED(end));
	VM_BUG_ON_VMA(start < vma->vm_start, vma);
	VM_BUG_ON_VMA(end > vma->vm_end, vma);
	mmap_assert_locked(mm);

	/*
	 * Rightly or wrongly, the VM_LOCKONFAULT case has never used
	 * faultin_page() to break COW, so it has no work to do here.
	 */
	if (vma->vm_flags & VM_LOCKONFAULT)
		return nr_pages;

	gup_flags = FOLL_TOUCH;
	/*
	 * We want to touch writable mappings with a write fault in order
	 * to break COW, except for shared mappings because these don't COW
	 * and we would not want to dirty them for nothing.
	 */
	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
		gup_flags |= FOLL_WRITE;

	/*
	 * We want mlock to succeed for regions that have any permissions
	 * other than PROT_NONE.
	 */
	if (vma_is_accessible(vma))
		gup_flags |= FOLL_FORCE;

	if (locked)
		gup_flags |= FOLL_UNLOCKABLE;

	/*
	 * We made sure addr is within a VMA, so the following will
	 * not result in a stack expansion that recurses back here.
	 */
	ret = __get_user_pages(mm, start, nr_pages, gup_flags,
			       NULL, NULL, locked ? locked : &local_locked);
	lru_add_drain();
	return ret;
}

/*
 * faultin_vma_page_range() - populate (prefault) page tables inside the
 *			      given VMA range readable/writable
 *
 * This takes care of mlocking the pages, too, if VM_LOCKED is set.
 *
 * @vma: target vma
 * @start: start address
 * @end: end address
 * @write: whether to prefault readable or writable
 * @locked: whether the mmap_lock is still held
 *
 * Returns either number of processed pages in the vma, or a negative error
 * code on error (see __get_user_pages()).
 *
 * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
 * covered by the VMA. If it's released, *@locked will be set to 0.
 */
long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
			    unsigned long end, bool write, int *locked)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long nr_pages = (end - start) / PAGE_SIZE;
	int gup_flags;
	long ret;

	VM_BUG_ON(!PAGE_ALIGNED(start));
	VM_BUG_ON(!PAGE_ALIGNED(end));
	VM_BUG_ON_VMA(start < vma->vm_start, vma);
	VM_BUG_ON_VMA(end > vma->vm_end, vma);
	mmap_assert_locked(mm);

	/*
	 * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
	 *	       the page dirty with FOLL_WRITE -- which doesn't make a
	 *	       difference with !FOLL_FORCE, because the page is writable
	 *	       in the page table.
	 * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
	 *		  a poisoned page.
	 * !FOLL_FORCE: Require proper access permissions.
	 */
	gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE;
	if (write)
		gup_flags |= FOLL_WRITE;

	/*
	 * We want to report -EINVAL instead of -EFAULT for any permission
	 * problems or incompatible mappings.
	 */
	if (check_vma_flags(vma, gup_flags))
		return -EINVAL;

	ret = __get_user_pages(mm, start, nr_pages, gup_flags,
			       NULL, NULL, locked);
	lru_add_drain();
	return ret;
}

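/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * faultin_vma_page_range() is what a MADV_POPULATE_READ/MADV_POPULATE_WRITE
 * style caller uses to prefault one VMA's worth of a range. This simplified
 * loop ignores the lock-drop retry handling that the real madvise code
 * performs; the function name is made up.
 */
static long __maybe_unused prefault_vma_sketch(struct vm_area_struct *vma,
					       unsigned long start,
					       unsigned long end, bool write)
{
	int locked = 1;
	long faulted = 0;

	while (start < end) {
		long ret = faultin_vma_page_range(vma, start, end, write,
						  &locked);

		if (ret <= 0)
			return ret;
		start += ret * PAGE_SIZE;
		faulted += ret;
		if (!locked)	/* mmap_lock was dropped; caller must retake it */
			break;
	}
	return faulted;
}
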
/*
 * __mm_populate - populate and/or mlock pages within a range of address space.
 *
 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
 * flags. VMAs must be already marked with the desired vm_flags, and
 * mmap_lock must not be held.
 */
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
	struct mm_struct *mm = current->mm;
	unsigned long end, nstart, nend;
	struct vm_area_struct *vma = NULL;
	int locked = 0;
	long ret = 0;

	end = start + len;

	for (nstart = start; nstart < end; nstart = nend) {
		/*
		 * We want to fault in pages for [nstart; end) address range.
		 * Find first corresponding VMA.
		 */
		if (!locked) {
			locked = 1;
			mmap_read_lock(mm);
			vma = find_vma_intersection(mm, nstart, end);
		} else if (nstart >= vma->vm_end)
			vma = find_vma_intersection(mm, vma->vm_end, end);

		if (!vma)
			break;
		/*
		 * Set [nstart; nend) to intersection of desired address
		 * range with the first VMA. Also, skip undesirable VMA types.
		 */
		nend = min(end, vma->vm_end);
		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
			continue;
		if (nstart < vma->vm_start)
			nstart = vma->vm_start;
		/*
		 * Now fault in a range of pages. populate_vma_page_range()
		 * double checks the vma flags, so that it won't mlock pages
		 * if the vma was already munlocked.
		 */
		ret = populate_vma_page_range(vma, nstart, nend, &locked);
		if (ret < 0) {
			if (ignore_errors) {
				ret = 0;
				continue;	/* continue at next VMA */
			}
			break;
		}
		nend = nstart + ret * PAGE_SIZE;
		ret = 0;
	}
	if (locked)
		mmap_read_unlock(mm);
	return ret;	/* 0 or negative error code */
}
#else /* CONFIG_MMU */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
		unsigned long nr_pages, struct page **pages,
		struct vm_area_struct **vmas, int *locked,
		unsigned int foll_flags)
{
	struct vm_area_struct *vma;
	bool must_unlock = false;
	unsigned long vm_flags;
	long i;

	if (!nr_pages)
		return 0;

	/*
	 * The internal caller expects GUP to manage the lock internally and the
	 * lock must be released when this returns.
	 */
	if (!*locked) {
		if (mmap_read_lock_killable(mm))
			return -EAGAIN;
		must_unlock = true;
		*locked = 1;
	}

	/* calculate required read or write permissions.
	 * If FOLL_FORCE is set, we only require the "MAY" flags.
	 */
	vm_flags  = (foll_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (foll_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);

	for (i = 0; i < nr_pages; i++) {
		vma = find_vma(mm, start);
		if (!vma)
			break;

		/* protect what we can, including chardevs */
		if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
			break;

		if (pages) {
			pages[i] = virt_to_page((void *)start);
			if (pages[i])
				get_page(pages[i]);
		}
		if (vmas)
			vmas[i] = vma;
		start = (start + PAGE_SIZE) & PAGE_MASK;
	}

	if (must_unlock && *locked) {
		mmap_read_unlock(mm);
		*locked = 0;
	}

	return i ? : -EFAULT;
}
#endif /* !CONFIG_MMU */

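/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * how __mm_populate() above is reached. mm_populate() is the real inline
 * wrapper from <linux/mm.h>; the function below and its name are made up and
 * only paraphrase the tail of the mmap(MAP_POPULATE) path.
 */
static void __maybe_unused map_populate_tail_sketch(unsigned long mapped_addr,
						    unsigned long len)
{
	/*
	 * Once the new mapping is in place and the mmap_lock has been
	 * dropped, MAP_POPULATE prefaults it. Errors are ignored here
	 * (ignore_errors == 1 via mm_populate()); mlock() instead calls
	 * __mm_populate(start, len, 0) so that faulting errors are reported.
	 */
	mm_populate(mapped_addr, len);
}
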
/**
 * fault_in_writeable - fault in userspace address range for writing
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_writeable(char __user *uaddr, size_t size)
{
	char __user *start = uaddr, *end;

	if (unlikely(size == 0))
		return 0;
	if (!user_write_access_begin(uaddr, size))
		return size;
	if (!PAGE_ALIGNED(uaddr)) {
		unsafe_put_user(0, uaddr, out);
		uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
	}
	end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
	if (unlikely(end < start))
		end = NULL;
	while (uaddr != end) {
		unsafe_put_user(0, uaddr, out);
		uaddr += PAGE_SIZE;
	}

out:
	user_write_access_end();
	if (size > uaddr - start)
		return size - (uaddr - start);
	return 0;
}
EXPORT_SYMBOL(fault_in_writeable);

/**
 * fault_in_subpage_writeable - fault in an address range for writing
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Fault in a user address range for writing while checking for permissions at
 * sub-page granularity (e.g. arm64 MTE). This function should be used when
 * the caller cannot guarantee forward progress of a copy_to_user() loop.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
{
	size_t faulted_in;

	/*
	 * Attempt faulting in at page granularity first for page table
	 * permission checking. The arch-specific probe_subpage_writeable()
	 * functions may not check for this.
	 */
	faulted_in = size - fault_in_writeable(uaddr, size);
	if (faulted_in)
		faulted_in -= probe_subpage_writeable(uaddr, faulted_in);

	return size - faulted_in;
}
EXPORT_SYMBOL(fault_in_subpage_writeable);

/*
 * fault_in_safe_writeable - fault in an address range for writing
 * @uaddr: start of address range
 * @size: length of address range
 *
 * Faults in an address range for writing. This is primarily useful when we
 * already know that some or all of the pages in the address range aren't in
 * memory.
 *
 * Unlike fault_in_writeable(), this function is non-destructive.
 *
 * Note that we don't pin or otherwise hold the pages referenced that we fault
 * in. There's no guarantee that they'll stay in memory for any duration of
 * time.
 *
 * Returns the number of bytes not faulted in, like copy_to_user() and
 * copy_from_user().
 */
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
	unsigned long start = (unsigned long)uaddr, end;
	struct mm_struct *mm = current->mm;
	bool unlocked = false;

	if (unlikely(size == 0))
		return 0;
	end = PAGE_ALIGN(start + size);
	if (end < start)
		end = 0;

	mmap_read_lock(mm);
	do {
		if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
			break;
		start = (start + PAGE_SIZE) & PAGE_MASK;
	} while (start != end);
	mmap_read_unlock(mm);

	if (size > (unsigned long)uaddr - start)
		return size - ((unsigned long)uaddr - start);
	return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);

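/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * the copy/fault-in retry loop that the fault_in_*() helpers above exist for
 * (filesystems such as gfs2 and btrfs use this shape around their read/write
 * paths, usually via the iov_iter wrappers). The function name and the
 * kernel-buffer source are made up.
 */
static ssize_t __maybe_unused copy_out_with_faultin_sketch(char __user *ubuf,
							   const char *kbuf,
							   size_t len)
{
	size_t left;

	do {
		left = copy_to_user(ubuf, kbuf, len);
		if (!left)
			return len;
		/*
		 * Nothing could be copied and nothing could be faulted in:
		 * give up instead of spinning.
		 */
		if (left == len && fault_in_writeable(ubuf, len) == len)
			return -EFAULT;
	} while (left == len);

	return len - left;
}
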
1889 */ 1890 size_t fault_in_readable(const char __user *uaddr, size_t size) 1891 { 1892 const char __user *start = uaddr, *end; 1893 volatile char c; 1894 1895 if (unlikely(size == 0)) 1896 return 0; 1897 if (!user_read_access_begin(uaddr, size)) 1898 return size; 1899 if (!PAGE_ALIGNED(uaddr)) { 1900 unsafe_get_user(c, uaddr, out); 1901 uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); 1902 } 1903 end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); 1904 if (unlikely(end < start)) 1905 end = NULL; 1906 while (uaddr != end) { 1907 unsafe_get_user(c, uaddr, out); 1908 uaddr += PAGE_SIZE; 1909 } 1910 1911 out: 1912 user_read_access_end(); 1913 (void)c; 1914 if (size > uaddr - start) 1915 return size - (uaddr - start); 1916 return 0; 1917 } 1918 EXPORT_SYMBOL(fault_in_readable); 1919 1920 /** 1921 * get_dump_page() - pin user page in memory while writing it to core dump 1922 * @addr: user address 1923 * 1924 * Returns struct page pointer of user page pinned for dump, 1925 * to be freed afterwards by put_page(). 1926 * 1927 * Returns NULL on any kind of failure - a hole must then be inserted into 1928 * the corefile, to preserve alignment with its headers; and also returns 1929 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - 1930 * allowing a hole to be left in the corefile to save disk space. 1931 * 1932 * Called without mmap_lock (takes and releases the mmap_lock by itself). 1933 */ 1934 #ifdef CONFIG_ELF_CORE 1935 struct page *get_dump_page(unsigned long addr) 1936 { 1937 struct page *page; 1938 int locked = 0; 1939 int ret; 1940 1941 ret = __get_user_pages_locked(current->mm, addr, 1, &page, NULL, 1942 &locked, 1943 FOLL_FORCE | FOLL_DUMP | FOLL_GET); 1944 return (ret == 1) ? page : NULL; 1945 } 1946 #endif /* CONFIG_ELF_CORE */ 1947 1948 #ifdef CONFIG_MIGRATION 1949 /* 1950 * Returns the number of collected pages. Return value is always >= 0. 1951 */ 1952 static unsigned long collect_longterm_unpinnable_pages( 1953 struct list_head *movable_page_list, 1954 unsigned long nr_pages, 1955 struct page **pages) 1956 { 1957 unsigned long i, collected = 0; 1958 struct folio *prev_folio = NULL; 1959 bool drain_allow = true; 1960 1961 for (i = 0; i < nr_pages; i++) { 1962 struct folio *folio = page_folio(pages[i]); 1963 1964 if (folio == prev_folio) 1965 continue; 1966 prev_folio = folio; 1967 1968 if (folio_is_longterm_pinnable(folio)) 1969 continue; 1970 1971 collected++; 1972 1973 if (folio_is_device_coherent(folio)) 1974 continue; 1975 1976 if (folio_test_hugetlb(folio)) { 1977 isolate_hugetlb(folio, movable_page_list); 1978 continue; 1979 } 1980 1981 if (!folio_test_lru(folio) && drain_allow) { 1982 lru_add_drain_all(); 1983 drain_allow = false; 1984 } 1985 1986 if (!folio_isolate_lru(folio)) 1987 continue; 1988 1989 list_add_tail(&folio->lru, movable_page_list); 1990 node_stat_mod_folio(folio, 1991 NR_ISOLATED_ANON + folio_is_file_lru(folio), 1992 folio_nr_pages(folio)); 1993 } 1994 1995 return collected; 1996 } 1997 1998 /* 1999 * Unpins all pages and migrates device coherent pages and movable_page_list. 2000 * Returns -EAGAIN if all pages were successfully migrated or -errno for failure 2001 * (or partial success). 
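 *
 * The -EAGAIN return value drives the retry loop in __gup_longterm_locked():
 * after a successful migration the caller is expected to pin the whole range
 * again and re-check it.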
2002 */ 2003 static int migrate_longterm_unpinnable_pages( 2004 struct list_head *movable_page_list, 2005 unsigned long nr_pages, 2006 struct page **pages) 2007 { 2008 int ret; 2009 unsigned long i; 2010 2011 for (i = 0; i < nr_pages; i++) { 2012 struct folio *folio = page_folio(pages[i]); 2013 2014 if (folio_is_device_coherent(folio)) { 2015 /* 2016 * Migration will fail if the page is pinned, so convert 2017 * the pin on the source page to a normal reference. 2018 */ 2019 pages[i] = NULL; 2020 folio_get(folio); 2021 gup_put_folio(folio, 1, FOLL_PIN); 2022 2023 if (migrate_device_coherent_page(&folio->page)) { 2024 ret = -EBUSY; 2025 goto err; 2026 } 2027 2028 continue; 2029 } 2030 2031 /* 2032 * We can't migrate pages with unexpected references, so drop 2033 * the reference obtained by __get_user_pages_locked(). 2034 * Migrating pages have been added to movable_page_list after 2035 * calling folio_isolate_lru() which takes a reference so the 2036 * page won't be freed if it's migrating. 2037 */ 2038 unpin_user_page(pages[i]); 2039 pages[i] = NULL; 2040 } 2041 2042 if (!list_empty(movable_page_list)) { 2043 struct migration_target_control mtc = { 2044 .nid = NUMA_NO_NODE, 2045 .gfp_mask = GFP_USER | __GFP_NOWARN, 2046 }; 2047 2048 if (migrate_pages(movable_page_list, alloc_migration_target, 2049 NULL, (unsigned long)&mtc, MIGRATE_SYNC, 2050 MR_LONGTERM_PIN, NULL)) { 2051 ret = -ENOMEM; 2052 goto err; 2053 } 2054 } 2055 2056 putback_movable_pages(movable_page_list); 2057 2058 return -EAGAIN; 2059 2060 err: 2061 for (i = 0; i < nr_pages; i++) 2062 if (pages[i]) 2063 unpin_user_page(pages[i]); 2064 putback_movable_pages(movable_page_list); 2065 2066 return ret; 2067 } 2068 2069 /* 2070 * Check whether all pages are *allowed* to be pinned. Rather confusingly, all 2071 * pages in the range are required to be pinned via FOLL_PIN, before calling 2072 * this routine. 2073 * 2074 * If any pages in the range are not allowed to be pinned, then this routine 2075 * will migrate those pages away, unpin all the pages in the range and return 2076 * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then 2077 * call this routine again. 2078 * 2079 * If an error other than -EAGAIN occurs, this indicates a migration failure. 2080 * The caller should give up, and propagate the error back up the call stack. 2081 * 2082 * If everything is OK and all pages in the range are allowed to be pinned, then 2083 * this routine leaves all pages pinned and returns zero for success. 2084 */ 2085 static long check_and_migrate_movable_pages(unsigned long nr_pages, 2086 struct page **pages) 2087 { 2088 unsigned long collected; 2089 LIST_HEAD(movable_page_list); 2090 2091 collected = collect_longterm_unpinnable_pages(&movable_page_list, 2092 nr_pages, pages); 2093 if (!collected) 2094 return 0; 2095 2096 return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages, 2097 pages); 2098 } 2099 #else 2100 static long check_and_migrate_movable_pages(unsigned long nr_pages, 2101 struct page **pages) 2102 { 2103 return 0; 2104 } 2105 #endif /* CONFIG_MIGRATION */ 2106 2107 /* 2108 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which 2109 * allows us to process the FOLL_LONGTERM flag. 
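 *
 * For FOLL_LONGTERM requests the range is pinned, any pages that must not be
 * pinned long-term (for example pages in CMA areas or ZONE_MOVABLE) are
 * migrated out of the way, and the pin is then retried until the whole range
 * is long-term pinnable.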
2110 */ 2111 static long __gup_longterm_locked(struct mm_struct *mm, 2112 unsigned long start, 2113 unsigned long nr_pages, 2114 struct page **pages, 2115 struct vm_area_struct **vmas, 2116 int *locked, 2117 unsigned int gup_flags) 2118 { 2119 unsigned int flags; 2120 long rc, nr_pinned_pages; 2121 2122 if (!(gup_flags & FOLL_LONGTERM)) 2123 return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, 2124 locked, gup_flags); 2125 2126 flags = memalloc_pin_save(); 2127 do { 2128 nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, 2129 pages, vmas, locked, 2130 gup_flags); 2131 if (nr_pinned_pages <= 0) { 2132 rc = nr_pinned_pages; 2133 break; 2134 } 2135 2136 /* FOLL_LONGTERM implies FOLL_PIN */ 2137 rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); 2138 } while (rc == -EAGAIN); 2139 memalloc_pin_restore(flags); 2140 return rc ? rc : nr_pinned_pages; 2141 } 2142 2143 /* 2144 * Check that the given flags are valid for the exported gup/pup interface, and 2145 * update them with the required flags that the caller must have set. 2146 */ 2147 static bool is_valid_gup_args(struct page **pages, struct vm_area_struct **vmas, 2148 int *locked, unsigned int *gup_flags_p, 2149 unsigned int to_set) 2150 { 2151 unsigned int gup_flags = *gup_flags_p; 2152 2153 /* 2154 * These flags not allowed to be specified externally to the gup 2155 * interfaces: 2156 * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only 2157 * - FOLL_REMOTE is internal only and used on follow_page() 2158 * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL 2159 */ 2160 if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | FOLL_UNLOCKABLE | 2161 FOLL_REMOTE | FOLL_FAST_ONLY))) 2162 return false; 2163 2164 gup_flags |= to_set; 2165 if (locked) { 2166 /* At the external interface locked must be set */ 2167 if (WARN_ON_ONCE(*locked != 1)) 2168 return false; 2169 2170 gup_flags |= FOLL_UNLOCKABLE; 2171 } 2172 2173 /* FOLL_GET and FOLL_PIN are mutually exclusive. */ 2174 if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == 2175 (FOLL_PIN | FOLL_GET))) 2176 return false; 2177 2178 /* LONGTERM can only be specified when pinning */ 2179 if (WARN_ON_ONCE(!(gup_flags & FOLL_PIN) && (gup_flags & FOLL_LONGTERM))) 2180 return false; 2181 2182 /* Pages input must be given if using GET/PIN */ 2183 if (WARN_ON_ONCE((gup_flags & (FOLL_GET | FOLL_PIN)) && !pages)) 2184 return false; 2185 2186 /* We want to allow the pgmap to be hot-unplugged at all times */ 2187 if (WARN_ON_ONCE((gup_flags & FOLL_LONGTERM) && 2188 (gup_flags & FOLL_PCI_P2PDMA))) 2189 return false; 2190 2191 /* 2192 * Can't use VMAs with locked, as locked allows GUP to unlock 2193 * which invalidates the vmas array 2194 */ 2195 if (WARN_ON_ONCE(vmas && (gup_flags & FOLL_UNLOCKABLE))) 2196 return false; 2197 2198 *gup_flags_p = gup_flags; 2199 return true; 2200 } 2201 2202 #ifdef CONFIG_MMU 2203 /** 2204 * get_user_pages_remote() - pin user pages in memory 2205 * @mm: mm_struct of target mm 2206 * @start: starting user address 2207 * @nr_pages: number of pages from start to pin 2208 * @gup_flags: flags modifying lookup behaviour 2209 * @pages: array that receives pointers to the pages pinned. 2210 * Should be at least nr_pages long. Or NULL, if caller 2211 * only intends to ensure the pages are faulted in. 2212 * @vmas: array of pointers to vmas corresponding to each page. 2213 * Or NULL if the caller does not require them. 
2214 * @locked: pointer to lock flag indicating whether lock is held and 2215 * subsequently whether VM_FAULT_RETRY functionality can be 2216 * utilised. Lock must initially be held. 2217 * 2218 * Returns either number of pages pinned (which may be less than the 2219 * number requested), or an error. Details about the return value: 2220 * 2221 * -- If nr_pages is 0, returns 0. 2222 * -- If nr_pages is >0, but no pages were pinned, returns -errno. 2223 * -- If nr_pages is >0, and some pages were pinned, returns the number of 2224 * pages pinned. Again, this may be less than nr_pages. 2225 * 2226 * The caller is responsible for releasing returned @pages, via put_page(). 2227 * 2228 * @vmas are valid only as long as mmap_lock is held. 2229 * 2230 * Must be called with mmap_lock held for read or write. 2231 * 2232 * get_user_pages_remote walks a process's page tables and takes a reference 2233 * to each struct page that each user address corresponds to at a given 2234 * instant. That is, it takes the page that would be accessed if a user 2235 * thread accesses the given user virtual address at that instant. 2236 * 2237 * This does not guarantee that the page exists in the user mappings when 2238 * get_user_pages_remote returns, and there may even be a completely different 2239 * page there in some cases (eg. if mmapped pagecache has been invalidated 2240 * and subsequently re-faulted). However it does guarantee that the page 2241 * won't be freed completely. And mostly callers simply care that the page 2242 * contains data that was valid *at some point in time*. Typically, an IO 2243 * or similar operation cannot guarantee anything stronger anyway because 2244 * locks can't be held over the syscall boundary. 2245 * 2246 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page 2247 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must 2248 * be called after the page is finished with, and before put_page is called. 2249 * 2250 * get_user_pages_remote is typically used for fewer-copy IO operations, 2251 * to get a handle on the memory by some means other than accesses 2252 * via the user virtual addresses. The pages may be submitted for 2253 * DMA to devices or accessed via their kernel linear mapping (via the 2254 * kmap APIs). Care should be taken to use the correct cache flushing APIs. 2255 * 2256 * See also get_user_pages_fast, for performance critical applications. 2257 * 2258 * get_user_pages_remote should be phased out in favor of 2259 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing 2260 * should use get_user_pages_remote because it cannot pass 2261 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. 2262 */ 2263 long get_user_pages_remote(struct mm_struct *mm, 2264 unsigned long start, unsigned long nr_pages, 2265 unsigned int gup_flags, struct page **pages, 2266 struct vm_area_struct **vmas, int *locked) 2267 { 2268 int local_locked = 1; 2269 2270 if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, 2271 FOLL_TOUCH | FOLL_REMOTE)) 2272 return -EINVAL; 2273 2274 return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, 2275 locked ? 
locked : &local_locked, 2276 gup_flags); 2277 } 2278 EXPORT_SYMBOL(get_user_pages_remote); 2279 2280 #else /* CONFIG_MMU */ 2281 long get_user_pages_remote(struct mm_struct *mm, 2282 unsigned long start, unsigned long nr_pages, 2283 unsigned int gup_flags, struct page **pages, 2284 struct vm_area_struct **vmas, int *locked) 2285 { 2286 return 0; 2287 } 2288 #endif /* !CONFIG_MMU */ 2289 2290 /** 2291 * get_user_pages() - pin user pages in memory 2292 * @start: starting user address 2293 * @nr_pages: number of pages from start to pin 2294 * @gup_flags: flags modifying lookup behaviour 2295 * @pages: array that receives pointers to the pages pinned. 2296 * Should be at least nr_pages long. Or NULL, if caller 2297 * only intends to ensure the pages are faulted in. 2298 * @vmas: array of pointers to vmas corresponding to each page. 2299 * Or NULL if the caller does not require them. 2300 * 2301 * This is the same as get_user_pages_remote(), just with a less-flexible 2302 * calling convention where we assume that the mm being operated on belongs to 2303 * the current task, and doesn't allow passing of a locked parameter. We also 2304 * obviously don't pass FOLL_REMOTE in here. 2305 */ 2306 long get_user_pages(unsigned long start, unsigned long nr_pages, 2307 unsigned int gup_flags, struct page **pages, 2308 struct vm_area_struct **vmas) 2309 { 2310 int locked = 1; 2311 2312 if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_TOUCH)) 2313 return -EINVAL; 2314 2315 return __get_user_pages_locked(current->mm, start, nr_pages, pages, 2316 vmas, &locked, gup_flags); 2317 } 2318 EXPORT_SYMBOL(get_user_pages); 2319 2320 /* 2321 * get_user_pages_unlocked() is suitable to replace the form: 2322 * 2323 * mmap_read_lock(mm); 2324 * get_user_pages(mm, ..., pages, NULL); 2325 * mmap_read_unlock(mm); 2326 * 2327 * with: 2328 * 2329 * get_user_pages_unlocked(mm, ..., pages); 2330 * 2331 * It is functionally equivalent to get_user_pages_fast so 2332 * get_user_pages_fast should be used instead if specific gup_flags 2333 * (e.g. FOLL_FORCE) are not required. 2334 */ 2335 long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, 2336 struct page **pages, unsigned int gup_flags) 2337 { 2338 int locked = 0; 2339 2340 if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, 2341 FOLL_TOUCH | FOLL_UNLOCKABLE)) 2342 return -EINVAL; 2343 2344 return __get_user_pages_locked(current->mm, start, nr_pages, pages, 2345 NULL, &locked, gup_flags); 2346 } 2347 EXPORT_SYMBOL(get_user_pages_unlocked); 2348 2349 /* 2350 * Fast GUP 2351 * 2352 * get_user_pages_fast attempts to pin user pages by walking the page 2353 * tables directly and avoids taking locks. Thus the walker needs to be 2354 * protected from page table pages being freed from under it, and should 2355 * block any THP splits. 2356 * 2357 * One way to achieve this is to have the walker disable interrupts, and 2358 * rely on IPIs from the TLB flushing code blocking before the page table 2359 * pages are freed. This is unsuitable for architectures that do not need 2360 * to broadcast an IPI when invalidating TLBs. 2361 * 2362 * Another way to achieve this is to batch up page table containing pages 2363 * belonging to more than one mm_user, then rcu_sched a callback to free those 2364 * pages. Disabling interrupts will allow the fast_gup walker to both block 2365 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs 2366 * (which is a relatively rare event). The code below adopts this strategy. 
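 *
 * (This is why lockless_pages_from_mm() below walks the page tables under
 * local_irq_save() rather than just rcu_read_lock().)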
2367 * 2368 * Before activating this code, please be aware that the following assumptions 2369 * are currently made: 2370 * 2371 * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to 2372 * free pages containing page tables or TLB flushing requires IPI broadcast. 2373 * 2374 * *) ptes can be read atomically by the architecture. 2375 * 2376 * *) access_ok is sufficient to validate userspace address ranges. 2377 * 2378 * The last two assumptions can be relaxed by the addition of helper functions. 2379 * 2380 * This code is based heavily on the PowerPC implementation by Nick Piggin. 2381 */ 2382 #ifdef CONFIG_HAVE_FAST_GUP 2383 2384 static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, 2385 unsigned int flags, 2386 struct page **pages) 2387 { 2388 while ((*nr) - nr_start) { 2389 struct page *page = pages[--(*nr)]; 2390 2391 ClearPageReferenced(page); 2392 if (flags & FOLL_PIN) 2393 unpin_user_page(page); 2394 else 2395 put_page(page); 2396 } 2397 } 2398 2399 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL 2400 /* 2401 * Fast-gup relies on pte change detection to avoid concurrent pgtable 2402 * operations. 2403 * 2404 * To pin the page, fast-gup needs to do below in order: 2405 * (1) pin the page (by prefetching pte), then (2) check pte not changed. 2406 * 2407 * For the rest of pgtable operations where pgtable updates can be racy 2408 * with fast-gup, we need to do (1) clear pte, then (2) check whether page 2409 * is pinned. 2410 * 2411 * Above will work for all pte-level operations, including THP split. 2412 * 2413 * For THP collapse, it's a bit more complicated because fast-gup may be 2414 * walking a pgtable page that is being freed (pte is still valid but pmd 2415 * can be cleared already). To avoid race in such condition, we need to 2416 * also check pmd here to make sure pmd doesn't change (corresponds to 2417 * pmdp_collapse_flush() in the THP collapse code path). 
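 *
 * As a purely illustrative interleaving (not literal code), with fast-gup on
 * the left and a racing pgtable update on the right:
 *
 *	fast-gup				pgtable update
 *	--------				--------------
 *	pte = ptep_get_lockless(ptep);
 *	try_grab_folio(page, ...)		clear the pte and flush
 *	re-read *pmdp and *ptep, back		check whether the page is
 *	off if either has changed		pinned, back off if it is
 *
 * Whichever side comes second observes the other side's update and backs off.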
2418 */ 2419 static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, 2420 unsigned long end, unsigned int flags, 2421 struct page **pages, int *nr) 2422 { 2423 struct dev_pagemap *pgmap = NULL; 2424 int nr_start = *nr, ret = 0; 2425 pte_t *ptep, *ptem; 2426 2427 ptem = ptep = pte_offset_map(&pmd, addr); 2428 do { 2429 pte_t pte = ptep_get_lockless(ptep); 2430 struct page *page; 2431 struct folio *folio; 2432 2433 if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) 2434 goto pte_unmap; 2435 2436 if (!pte_access_permitted(pte, flags & FOLL_WRITE)) 2437 goto pte_unmap; 2438 2439 if (pte_devmap(pte)) { 2440 if (unlikely(flags & FOLL_LONGTERM)) 2441 goto pte_unmap; 2442 2443 pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); 2444 if (unlikely(!pgmap)) { 2445 undo_dev_pagemap(nr, nr_start, flags, pages); 2446 goto pte_unmap; 2447 } 2448 } else if (pte_special(pte)) 2449 goto pte_unmap; 2450 2451 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 2452 page = pte_page(pte); 2453 2454 folio = try_grab_folio(page, 1, flags); 2455 if (!folio) 2456 goto pte_unmap; 2457 2458 if (unlikely(page_is_secretmem(page))) { 2459 gup_put_folio(folio, 1, flags); 2460 goto pte_unmap; 2461 } 2462 2463 if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) || 2464 unlikely(pte_val(pte) != pte_val(*ptep))) { 2465 gup_put_folio(folio, 1, flags); 2466 goto pte_unmap; 2467 } 2468 2469 if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { 2470 gup_put_folio(folio, 1, flags); 2471 goto pte_unmap; 2472 } 2473 2474 /* 2475 * We need to make the page accessible if and only if we are 2476 * going to access its content (the FOLL_PIN case). Please 2477 * see Documentation/core-api/pin_user_pages.rst for 2478 * details. 2479 */ 2480 if (flags & FOLL_PIN) { 2481 ret = arch_make_page_accessible(page); 2482 if (ret) { 2483 gup_put_folio(folio, 1, flags); 2484 goto pte_unmap; 2485 } 2486 } 2487 folio_set_referenced(folio); 2488 pages[*nr] = page; 2489 (*nr)++; 2490 } while (ptep++, addr += PAGE_SIZE, addr != end); 2491 2492 ret = 1; 2493 2494 pte_unmap: 2495 if (pgmap) 2496 put_dev_pagemap(pgmap); 2497 pte_unmap(ptem); 2498 return ret; 2499 } 2500 #else 2501 2502 /* 2503 * If we can't determine whether or not a pte is special, then fail immediately 2504 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not 2505 * to be special. 2506 * 2507 * For a futex to be placed on a THP tail page, get_futex_key requires a 2508 * get_user_pages_fast_only implementation that can pin pages. Thus it's still 2509 * useful to have gup_huge_pmd even if we can't operate on ptes. 
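 *
 * Returning 0 here simply makes gup_pmd_range() bail out, so callers fall
 * back to the regular (slow) GUP path for pte-mapped memory.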
2510 */ 2511 static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, 2512 unsigned long end, unsigned int flags, 2513 struct page **pages, int *nr) 2514 { 2515 return 0; 2516 } 2517 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ 2518 2519 #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 2520 static int __gup_device_huge(unsigned long pfn, unsigned long addr, 2521 unsigned long end, unsigned int flags, 2522 struct page **pages, int *nr) 2523 { 2524 int nr_start = *nr; 2525 struct dev_pagemap *pgmap = NULL; 2526 2527 do { 2528 struct page *page = pfn_to_page(pfn); 2529 2530 pgmap = get_dev_pagemap(pfn, pgmap); 2531 if (unlikely(!pgmap)) { 2532 undo_dev_pagemap(nr, nr_start, flags, pages); 2533 break; 2534 } 2535 2536 if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { 2537 undo_dev_pagemap(nr, nr_start, flags, pages); 2538 break; 2539 } 2540 2541 SetPageReferenced(page); 2542 pages[*nr] = page; 2543 if (unlikely(try_grab_page(page, flags))) { 2544 undo_dev_pagemap(nr, nr_start, flags, pages); 2545 break; 2546 } 2547 (*nr)++; 2548 pfn++; 2549 } while (addr += PAGE_SIZE, addr != end); 2550 2551 put_dev_pagemap(pgmap); 2552 return addr == end; 2553 } 2554 2555 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, 2556 unsigned long end, unsigned int flags, 2557 struct page **pages, int *nr) 2558 { 2559 unsigned long fault_pfn; 2560 int nr_start = *nr; 2561 2562 fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 2563 if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) 2564 return 0; 2565 2566 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { 2567 undo_dev_pagemap(nr, nr_start, flags, pages); 2568 return 0; 2569 } 2570 return 1; 2571 } 2572 2573 static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, 2574 unsigned long end, unsigned int flags, 2575 struct page **pages, int *nr) 2576 { 2577 unsigned long fault_pfn; 2578 int nr_start = *nr; 2579 2580 fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 2581 if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr)) 2582 return 0; 2583 2584 if (unlikely(pud_val(orig) != pud_val(*pudp))) { 2585 undo_dev_pagemap(nr, nr_start, flags, pages); 2586 return 0; 2587 } 2588 return 1; 2589 } 2590 #else 2591 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, 2592 unsigned long end, unsigned int flags, 2593 struct page **pages, int *nr) 2594 { 2595 BUILD_BUG(); 2596 return 0; 2597 } 2598 2599 static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, 2600 unsigned long end, unsigned int flags, 2601 struct page **pages, int *nr) 2602 { 2603 BUILD_BUG(); 2604 return 0; 2605 } 2606 #endif 2607 2608 static int record_subpages(struct page *page, unsigned long addr, 2609 unsigned long end, struct page **pages) 2610 { 2611 int nr; 2612 2613 for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) 2614 pages[nr] = nth_page(page, nr); 2615 2616 return nr; 2617 } 2618 2619 #ifdef CONFIG_ARCH_HAS_HUGEPD 2620 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, 2621 unsigned long sz) 2622 { 2623 unsigned long __boundary = (addr + sz) & ~(sz-1); 2624 return (__boundary - 1 < end - 1) ? 
__boundary : end; 2625 } 2626 2627 static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, 2628 unsigned long end, unsigned int flags, 2629 struct page **pages, int *nr) 2630 { 2631 unsigned long pte_end; 2632 struct page *page; 2633 struct folio *folio; 2634 pte_t pte; 2635 int refs; 2636 2637 pte_end = (addr + sz) & ~(sz-1); 2638 if (pte_end < end) 2639 end = pte_end; 2640 2641 pte = huge_ptep_get(ptep); 2642 2643 if (!pte_access_permitted(pte, flags & FOLL_WRITE)) 2644 return 0; 2645 2646 /* hugepages are never "special" */ 2647 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 2648 2649 page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT); 2650 refs = record_subpages(page, addr, end, pages + *nr); 2651 2652 folio = try_grab_folio(page, refs, flags); 2653 if (!folio) 2654 return 0; 2655 2656 if (unlikely(pte_val(pte) != pte_val(*ptep))) { 2657 gup_put_folio(folio, refs, flags); 2658 return 0; 2659 } 2660 2661 if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) { 2662 gup_put_folio(folio, refs, flags); 2663 return 0; 2664 } 2665 2666 *nr += refs; 2667 folio_set_referenced(folio); 2668 return 1; 2669 } 2670 2671 static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, 2672 unsigned int pdshift, unsigned long end, unsigned int flags, 2673 struct page **pages, int *nr) 2674 { 2675 pte_t *ptep; 2676 unsigned long sz = 1UL << hugepd_shift(hugepd); 2677 unsigned long next; 2678 2679 ptep = hugepte_offset(hugepd, addr, pdshift); 2680 do { 2681 next = hugepte_addr_end(addr, end, sz); 2682 if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr)) 2683 return 0; 2684 } while (ptep++, addr = next, addr != end); 2685 2686 return 1; 2687 } 2688 #else 2689 static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, 2690 unsigned int pdshift, unsigned long end, unsigned int flags, 2691 struct page **pages, int *nr) 2692 { 2693 return 0; 2694 } 2695 #endif /* CONFIG_ARCH_HAS_HUGEPD */ 2696 2697 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, 2698 unsigned long end, unsigned int flags, 2699 struct page **pages, int *nr) 2700 { 2701 struct page *page; 2702 struct folio *folio; 2703 int refs; 2704 2705 if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) 2706 return 0; 2707 2708 if (pmd_devmap(orig)) { 2709 if (unlikely(flags & FOLL_LONGTERM)) 2710 return 0; 2711 return __gup_device_huge_pmd(orig, pmdp, addr, end, flags, 2712 pages, nr); 2713 } 2714 2715 page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT); 2716 refs = record_subpages(page, addr, end, pages + *nr); 2717 2718 folio = try_grab_folio(page, refs, flags); 2719 if (!folio) 2720 return 0; 2721 2722 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { 2723 gup_put_folio(folio, refs, flags); 2724 return 0; 2725 } 2726 2727 if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { 2728 gup_put_folio(folio, refs, flags); 2729 return 0; 2730 } 2731 2732 *nr += refs; 2733 folio_set_referenced(folio); 2734 return 1; 2735 } 2736 2737 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, 2738 unsigned long end, unsigned int flags, 2739 struct page **pages, int *nr) 2740 { 2741 struct page *page; 2742 struct folio *folio; 2743 int refs; 2744 2745 if (!pud_access_permitted(orig, flags & FOLL_WRITE)) 2746 return 0; 2747 2748 if (pud_devmap(orig)) { 2749 if (unlikely(flags & FOLL_LONGTERM)) 2750 return 0; 2751 return __gup_device_huge_pud(orig, pudp, addr, end, flags, 2752 pages, nr); 2753 } 2754 2755 page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> 
PAGE_SHIFT); 2756 refs = record_subpages(page, addr, end, pages + *nr); 2757 2758 folio = try_grab_folio(page, refs, flags); 2759 if (!folio) 2760 return 0; 2761 2762 if (unlikely(pud_val(orig) != pud_val(*pudp))) { 2763 gup_put_folio(folio, refs, flags); 2764 return 0; 2765 } 2766 2767 if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { 2768 gup_put_folio(folio, refs, flags); 2769 return 0; 2770 } 2771 2772 *nr += refs; 2773 folio_set_referenced(folio); 2774 return 1; 2775 } 2776 2777 static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, 2778 unsigned long end, unsigned int flags, 2779 struct page **pages, int *nr) 2780 { 2781 int refs; 2782 struct page *page; 2783 struct folio *folio; 2784 2785 if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) 2786 return 0; 2787 2788 BUILD_BUG_ON(pgd_devmap(orig)); 2789 2790 page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT); 2791 refs = record_subpages(page, addr, end, pages + *nr); 2792 2793 folio = try_grab_folio(page, refs, flags); 2794 if (!folio) 2795 return 0; 2796 2797 if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { 2798 gup_put_folio(folio, refs, flags); 2799 return 0; 2800 } 2801 2802 *nr += refs; 2803 folio_set_referenced(folio); 2804 return 1; 2805 } 2806 2807 static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, 2808 unsigned int flags, struct page **pages, int *nr) 2809 { 2810 unsigned long next; 2811 pmd_t *pmdp; 2812 2813 pmdp = pmd_offset_lockless(pudp, pud, addr); 2814 do { 2815 pmd_t pmd = pmdp_get_lockless(pmdp); 2816 2817 next = pmd_addr_end(addr, end); 2818 if (!pmd_present(pmd)) 2819 return 0; 2820 2821 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || 2822 pmd_devmap(pmd))) { 2823 if (pmd_protnone(pmd) && 2824 !gup_can_follow_protnone(flags)) 2825 return 0; 2826 2827 if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, 2828 pages, nr)) 2829 return 0; 2830 2831 } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) { 2832 /* 2833 * architecture have different format for hugetlbfs 2834 * pmd format and THP pmd format 2835 */ 2836 if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, 2837 PMD_SHIFT, next, flags, pages, nr)) 2838 return 0; 2839 } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr)) 2840 return 0; 2841 } while (pmdp++, addr = next, addr != end); 2842 2843 return 1; 2844 } 2845 2846 static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, 2847 unsigned int flags, struct page **pages, int *nr) 2848 { 2849 unsigned long next; 2850 pud_t *pudp; 2851 2852 pudp = pud_offset_lockless(p4dp, p4d, addr); 2853 do { 2854 pud_t pud = READ_ONCE(*pudp); 2855 2856 next = pud_addr_end(addr, end); 2857 if (unlikely(!pud_present(pud))) 2858 return 0; 2859 if (unlikely(pud_huge(pud) || pud_devmap(pud))) { 2860 if (!gup_huge_pud(pud, pudp, addr, next, flags, 2861 pages, nr)) 2862 return 0; 2863 } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { 2864 if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, 2865 PUD_SHIFT, next, flags, pages, nr)) 2866 return 0; 2867 } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr)) 2868 return 0; 2869 } while (pudp++, addr = next, addr != end); 2870 2871 return 1; 2872 } 2873 2874 static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, 2875 unsigned int flags, struct page **pages, int *nr) 2876 { 2877 unsigned long next; 2878 p4d_t *p4dp; 2879 2880 p4dp = p4d_offset_lockless(pgdp, pgd, addr); 2881 do { 2882 p4d_t p4d = READ_ONCE(*p4dp); 
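		/*
		 * Operate on a single READ_ONCE() snapshot of the entry; the
		 * lower-level helpers re-check the live entry against their
		 * own snapshot after grabbing a reference and back off if it
		 * changed.
		 */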
2883 2884 next = p4d_addr_end(addr, end); 2885 if (p4d_none(p4d)) 2886 return 0; 2887 BUILD_BUG_ON(p4d_huge(p4d)); 2888 if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { 2889 if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, 2890 P4D_SHIFT, next, flags, pages, nr)) 2891 return 0; 2892 } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) 2893 return 0; 2894 } while (p4dp++, addr = next, addr != end); 2895 2896 return 1; 2897 } 2898 2899 static void gup_pgd_range(unsigned long addr, unsigned long end, 2900 unsigned int flags, struct page **pages, int *nr) 2901 { 2902 unsigned long next; 2903 pgd_t *pgdp; 2904 2905 pgdp = pgd_offset(current->mm, addr); 2906 do { 2907 pgd_t pgd = READ_ONCE(*pgdp); 2908 2909 next = pgd_addr_end(addr, end); 2910 if (pgd_none(pgd)) 2911 return; 2912 if (unlikely(pgd_huge(pgd))) { 2913 if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, 2914 pages, nr)) 2915 return; 2916 } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { 2917 if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, 2918 PGDIR_SHIFT, next, flags, pages, nr)) 2919 return; 2920 } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) 2921 return; 2922 } while (pgdp++, addr = next, addr != end); 2923 } 2924 #else 2925 static inline void gup_pgd_range(unsigned long addr, unsigned long end, 2926 unsigned int flags, struct page **pages, int *nr) 2927 { 2928 } 2929 #endif /* CONFIG_HAVE_FAST_GUP */ 2930 2931 #ifndef gup_fast_permitted 2932 /* 2933 * Check if it's allowed to use get_user_pages_fast_only() for the range, or 2934 * we need to fall back to the slow version: 2935 */ 2936 static bool gup_fast_permitted(unsigned long start, unsigned long end) 2937 { 2938 return true; 2939 } 2940 #endif 2941 2942 static unsigned long lockless_pages_from_mm(unsigned long start, 2943 unsigned long end, 2944 unsigned int gup_flags, 2945 struct page **pages) 2946 { 2947 unsigned long flags; 2948 int nr_pinned = 0; 2949 unsigned seq; 2950 2951 if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || 2952 !gup_fast_permitted(start, end)) 2953 return 0; 2954 2955 if (gup_flags & FOLL_PIN) { 2956 seq = raw_read_seqcount(¤t->mm->write_protect_seq); 2957 if (seq & 1) 2958 return 0; 2959 } 2960 2961 /* 2962 * Disable interrupts. The nested form is used, in order to allow full, 2963 * general purpose use of this routine. 2964 * 2965 * With interrupts disabled, we block page table pages from being freed 2966 * from under us. See struct mmu_table_batch comments in 2967 * include/asm-generic/tlb.h for more details. 2968 * 2969 * We do not adopt an rcu_read_lock() here as we also want to block IPIs 2970 * that come from THPs splitting. 2971 */ 2972 local_irq_save(flags); 2973 gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); 2974 local_irq_restore(flags); 2975 2976 /* 2977 * When pinning pages for DMA there could be a concurrent write protect 2978 * from fork() via copy_page_range(), in this case always fail fast GUP. 
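 *
 * The write side of this sequence count is taken around the page table copy
 * in copy_page_range(); it was sampled before the walk above and is
 * re-checked just below for FOLL_PIN requests.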
2979 */ 2980 if (gup_flags & FOLL_PIN) { 2981 if (read_seqcount_retry(¤t->mm->write_protect_seq, seq)) { 2982 unpin_user_pages_lockless(pages, nr_pinned); 2983 return 0; 2984 } else { 2985 sanity_check_pinned_pages(pages, nr_pinned); 2986 } 2987 } 2988 return nr_pinned; 2989 } 2990 2991 static int internal_get_user_pages_fast(unsigned long start, 2992 unsigned long nr_pages, 2993 unsigned int gup_flags, 2994 struct page **pages) 2995 { 2996 unsigned long len, end; 2997 unsigned long nr_pinned; 2998 int locked = 0; 2999 int ret; 3000 3001 if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | 3002 FOLL_FORCE | FOLL_PIN | FOLL_GET | 3003 FOLL_FAST_ONLY | FOLL_NOFAULT | 3004 FOLL_PCI_P2PDMA))) 3005 return -EINVAL; 3006 3007 if (gup_flags & FOLL_PIN) 3008 mm_set_has_pinned_flag(¤t->mm->flags); 3009 3010 if (!(gup_flags & FOLL_FAST_ONLY)) 3011 might_lock_read(¤t->mm->mmap_lock); 3012 3013 start = untagged_addr(start) & PAGE_MASK; 3014 len = nr_pages << PAGE_SHIFT; 3015 if (check_add_overflow(start, len, &end)) 3016 return 0; 3017 if (end > TASK_SIZE_MAX) 3018 return -EFAULT; 3019 if (unlikely(!access_ok((void __user *)start, len))) 3020 return -EFAULT; 3021 3022 nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages); 3023 if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) 3024 return nr_pinned; 3025 3026 /* Slow path: try to get the remaining pages with get_user_pages */ 3027 start += nr_pinned << PAGE_SHIFT; 3028 pages += nr_pinned; 3029 ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned, 3030 pages, NULL, &locked, 3031 gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE); 3032 if (ret < 0) { 3033 /* 3034 * The caller has to unpin the pages we already pinned so 3035 * returning -errno is not an option 3036 */ 3037 if (nr_pinned) 3038 return nr_pinned; 3039 return ret; 3040 } 3041 return ret + nr_pinned; 3042 } 3043 3044 /** 3045 * get_user_pages_fast_only() - pin user pages in memory 3046 * @start: starting user address 3047 * @nr_pages: number of pages from start to pin 3048 * @gup_flags: flags modifying pin behaviour 3049 * @pages: array that receives pointers to the pages pinned. 3050 * Should be at least nr_pages long. 3051 * 3052 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to 3053 * the regular GUP. 3054 * 3055 * If the architecture does not support this function, simply return with no 3056 * pages pinned. 3057 * 3058 * Careful, careful! COW breaking can go either way, so a non-write 3059 * access can get ambiguous page results. If you call this function without 3060 * 'write' set, you'd better be sure that you're ok with that ambiguity. 3061 */ 3062 int get_user_pages_fast_only(unsigned long start, int nr_pages, 3063 unsigned int gup_flags, struct page **pages) 3064 { 3065 /* 3066 * Internally (within mm/gup.c), gup fast variants must set FOLL_GET, 3067 * because gup fast is always a "pin with a +1 page refcount" request. 3068 * 3069 * FOLL_FAST_ONLY is required in order to match the API description of 3070 * this routine: no fall back to regular ("slow") GUP. 
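 *
 * Never falling back is what makes this variant usable from contexts that
 * cannot take mmap_lock or sleep; the slow path may do both.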
3071 */ 3072 if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, 3073 FOLL_GET | FOLL_FAST_ONLY)) 3074 return -EINVAL; 3075 3076 return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); 3077 } 3078 EXPORT_SYMBOL_GPL(get_user_pages_fast_only); 3079 3080 /** 3081 * get_user_pages_fast() - pin user pages in memory 3082 * @start: starting user address 3083 * @nr_pages: number of pages from start to pin 3084 * @gup_flags: flags modifying pin behaviour 3085 * @pages: array that receives pointers to the pages pinned. 3086 * Should be at least nr_pages long. 3087 * 3088 * Attempt to pin user pages in memory without taking mm->mmap_lock. 3089 * If not successful, it will fall back to taking the lock and 3090 * calling get_user_pages(). 3091 * 3092 * Returns number of pages pinned. This may be fewer than the number requested. 3093 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns 3094 * -errno. 3095 */ 3096 int get_user_pages_fast(unsigned long start, int nr_pages, 3097 unsigned int gup_flags, struct page **pages) 3098 { 3099 /* 3100 * The caller may or may not have explicitly set FOLL_GET; either way is 3101 * OK. However, internally (within mm/gup.c), gup fast variants must set 3102 * FOLL_GET, because gup fast is always a "pin with a +1 page refcount" 3103 * request. 3104 */ 3105 if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_GET)) 3106 return -EINVAL; 3107 return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); 3108 } 3109 EXPORT_SYMBOL_GPL(get_user_pages_fast); 3110 3111 /** 3112 * pin_user_pages_fast() - pin user pages in memory without taking locks 3113 * 3114 * @start: starting user address 3115 * @nr_pages: number of pages from start to pin 3116 * @gup_flags: flags modifying pin behaviour 3117 * @pages: array that receives pointers to the pages pinned. 3118 * Should be at least nr_pages long. 3119 * 3120 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See 3121 * get_user_pages_fast() for documentation on the function arguments, because 3122 * the arguments here are identical. 3123 * 3124 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please 3125 * see Documentation/core-api/pin_user_pages.rst for further details. 3126 * 3127 * Note that if a zero_page is amongst the returned pages, it will not have 3128 * pins in it and unpin_user_page() will not remove pins from it. 3129 */ 3130 int pin_user_pages_fast(unsigned long start, int nr_pages, 3131 unsigned int gup_flags, struct page **pages) 3132 { 3133 if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, FOLL_PIN)) 3134 return -EINVAL; 3135 return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); 3136 } 3137 EXPORT_SYMBOL_GPL(pin_user_pages_fast); 3138 3139 /** 3140 * pin_user_pages_remote() - pin pages of a remote process 3141 * 3142 * @mm: mm_struct of target mm 3143 * @start: starting user address 3144 * @nr_pages: number of pages from start to pin 3145 * @gup_flags: flags modifying lookup behaviour 3146 * @pages: array that receives pointers to the pages pinned. 3147 * Should be at least nr_pages long. 3148 * @vmas: array of pointers to vmas corresponding to each page. 3149 * Or NULL if the caller does not require them. 3150 * @locked: pointer to lock flag indicating whether lock is held and 3151 * subsequently whether VM_FAULT_RETRY functionality can be 3152 * utilised. Lock must initially be held. 3153 * 3154 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. 
See 3155 * get_user_pages_remote() for documentation on the function arguments, because 3156 * the arguments here are identical. 3157 * 3158 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please 3159 * see Documentation/core-api/pin_user_pages.rst for details. 3160 * 3161 * Note that if a zero_page is amongst the returned pages, it will not have 3162 * pins in it and unpin_user_page*() will not remove pins from it. 3163 */ 3164 long pin_user_pages_remote(struct mm_struct *mm, 3165 unsigned long start, unsigned long nr_pages, 3166 unsigned int gup_flags, struct page **pages, 3167 struct vm_area_struct **vmas, int *locked) 3168 { 3169 int local_locked = 1; 3170 3171 if (!is_valid_gup_args(pages, vmas, locked, &gup_flags, 3172 FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE)) 3173 return 0; 3174 return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, 3175 locked ? locked : &local_locked, 3176 gup_flags); 3177 } 3178 EXPORT_SYMBOL(pin_user_pages_remote); 3179 3180 /** 3181 * pin_user_pages() - pin user pages in memory for use by other devices 3182 * 3183 * @start: starting user address 3184 * @nr_pages: number of pages from start to pin 3185 * @gup_flags: flags modifying lookup behaviour 3186 * @pages: array that receives pointers to the pages pinned. 3187 * Should be at least nr_pages long. 3188 * @vmas: array of pointers to vmas corresponding to each page. 3189 * Or NULL if the caller does not require them. 3190 * 3191 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and 3192 * FOLL_PIN is set. 3193 * 3194 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please 3195 * see Documentation/core-api/pin_user_pages.rst for details. 3196 * 3197 * Note that if a zero_page is amongst the returned pages, it will not have 3198 * pins in it and unpin_user_page*() will not remove pins from it. 3199 */ 3200 long pin_user_pages(unsigned long start, unsigned long nr_pages, 3201 unsigned int gup_flags, struct page **pages, 3202 struct vm_area_struct **vmas) 3203 { 3204 int locked = 1; 3205 3206 if (!is_valid_gup_args(pages, vmas, NULL, &gup_flags, FOLL_PIN)) 3207 return 0; 3208 return __gup_longterm_locked(current->mm, start, nr_pages, 3209 pages, vmas, &locked, gup_flags); 3210 } 3211 EXPORT_SYMBOL(pin_user_pages); 3212 3213 /* 3214 * pin_user_pages_unlocked() is the FOLL_PIN variant of 3215 * get_user_pages_unlocked(). Behavior is the same, except that this one sets 3216 * FOLL_PIN and rejects FOLL_GET. 3217 * 3218 * Note that if a zero_page is amongst the returned pages, it will not have 3219 * pins in it and unpin_user_page*() will not remove pins from it. 3220 */ 3221 long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, 3222 struct page **pages, unsigned int gup_flags) 3223 { 3224 int locked = 0; 3225 3226 if (!is_valid_gup_args(pages, NULL, NULL, &gup_flags, 3227 FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE)) 3228 return 0; 3229 3230 return __gup_longterm_locked(current->mm, start, nr_pages, pages, NULL, 3231 &locked, gup_flags); 3232 } 3233 EXPORT_SYMBOL(pin_user_pages_unlocked); 3234
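
/*
 * Illustrative sketch (not a caller that exists in the kernel): the usual
 * FOLL_PIN pattern for a driver that wants to DMA into user memory. All
 * identifiers other than the pin/unpin helpers are hypothetical, and handling
 * of a short pin is omitted for brevity.
 *
 *	struct page **pages;
 *	int pinned;
 *
 *	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
 *	if (!pages)
 *		return -ENOMEM;
 *
 *	pinned = pin_user_pages_fast(user_addr, nr_pages,
 *				     FOLL_WRITE | FOLL_LONGTERM, pages);
 *	if (pinned <= 0) {
 *		kvfree(pages);
 *		return pinned ? pinned : -EFAULT;
 *	}
 *
 *	... map the pinned pages for DMA and run the transfer ...
 *
 *	unpin_user_pages_dirty_lock(pages, pinned, true);
 *	kvfree(pages);
 *
 * Pages pinned this way must be released with the unpin_user_page*() family,
 * never with put_page(), so that the pin accounting stays balanced.
 */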