// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>

#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>

#include "internal.h"

struct follow_page_context {
	struct dev_pagemap *pgmap;
	unsigned int page_mask;
};

/*
 * Return the compound head page with ref appropriately incremented,
 * or NULL if that failed.
 */
static inline struct page *try_get_compound_head(struct page *page, int refs)
{
	struct page *head = compound_head(page);

	if (WARN_ON_ONCE(page_ref_count(head) < 0))
		return NULL;
	if (unlikely(!page_cache_add_speculative(head, refs)))
		return NULL;
	return head;
}

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if @make_dirty is true, and if the page was previously
 * listed as clean. In any case, releases all pages using unpin_user_page(),
 * possibly via unpin_user_pages(), for the non-dirty case.
 *
 * Please see the unpin_user_page() documentation for details.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty(), unpin_user_page().
 *
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
				 bool make_dirty)
{
	unsigned long index;

	/*
	 * TODO: this can be optimized for huge pages: if a series of pages is
	 * physically contiguous and part of the same compound page, then a
	 * single operation to the head page should suffice.
	 */

	if (!make_dirty) {
		unpin_user_pages(pages, npages);
		return;
	}

	for (index = 0; index < npages; index++) {
		struct page *page = compound_head(pages[index]);
		/*
		 * Checking PageDirty at this point may race with
		 * clear_page_dirty_for_io(), but that's OK. Two key
		 * cases:
		 *
		 * 1) This code sees the page as already dirty, so it
		 * skips the call to set_page_dirty(). That could happen
		 * because clear_page_dirty_for_io() called
		 * page_mkclean(), followed by set_page_dirty().
		 * However, now the page is going to get written back,
		 * which meets the original intention of setting it
		 * dirty, so all is well: clear_page_dirty_for_io() goes
		 * on to call TestClearPageDirty(), and write the page
		 * back.
		 *
		 * 2) This code sees the page as clean, so it calls
		 * set_page_dirty(). The page stays dirty, despite being
		 * written back, so it gets written back again in the
		 * next writeback cycle. This is harmless.
		 */
		if (!PageDirty(page))
			set_page_dirty_lock(page);
		unpin_user_page(page);
	}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be marked dirty and released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, release the page using unpin_user_page().
 *
 * Please see the unpin_user_page() documentation for details.
 */
void unpin_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long index;

	/*
	 * TODO: this can be optimized for huge pages: if a series of pages is
	 * physically contiguous and part of the same compound page, then a
	 * single operation to the head page should suffice.
	 */
	for (index = 0; index < npages; index++)
		unpin_user_page(pages[index]);
}
EXPORT_SYMBOL(unpin_user_pages);
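
/*
 * Illustrative sketch (not taken from an in-tree caller) of the calling
 * pattern that unpin_user_pages_dirty_lock() is meant for, assuming a
 * pages[] array filled by one of the pin_user_pages*() variants and used
 * as the target of read-style DMA (do_dma_into() is a hypothetical driver
 * helper):
 *
 *	pinned = pin_user_pages(user_addr, nr_pages, FOLL_WRITE, pages, NULL);
 *	if (pinned <= 0)
 *		return pinned;
 *	do_dma_into(pages, pinned);
 *	unpin_user_pages_dirty_lock(pages, pinned, true);
 *
 * Passing make_dirty == false degenerates to a plain unpin_user_pages()
 * call.
 */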

#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
		unsigned int flags)
{
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate unnecessary pages or
	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return NULL;
}

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	/* No page to get reference */
	if (flags & FOLL_GET)
		return -EFAULT;

	if (flags & FOLL_TOUCH) {
		pte_t entry = *pte;

		if (flags & FOLL_WRITE)
			entry = pte_mkdirty(entry);
		entry = pte_mkyoung(entry);

		if (!pte_same(*pte, entry)) {
			set_pte_at(vma->vm_mm, address, pte, entry);
			update_mmu_cache(vma, address, pte);
		}
	}

	/* Proper page table entry exists, but no corresponding struct page */
	return -EEXIST;
}

/*
 * FOLL_FORCE can write to even unwritable pte's, but only
 * after we've gone through a COW cycle and they are dirty.
 */
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
	return pte_write(pte) ||
		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
}

static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags,
		struct dev_pagemap **pgmap)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return ERR_PTR(-EINVAL);
retry:
	if (unlikely(pmd_bad(*pmd)))
		return no_page_table(vma, flags);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!pte_present(pte)) {
		swp_entry_t entry;
		/*
		 * KSM's break_ksm() relies upon recognizing a ksm page
		 * even while it is being migrated, so for that case we
		 * need migration_entry_wait().
		 */
		if (likely(!(flags & FOLL_MIGRATION)))
			goto no_page;
		if (pte_none(pte))
			goto no_page;
		entry = pte_to_swp_entry(pte);
		if (!is_migration_entry(entry))
			goto no_page;
		pte_unmap_unlock(ptep, ptl);
		migration_entry_wait(mm, pmd, address);
		goto retry;
	}
	if ((flags & FOLL_NUMA) && pte_protnone(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
		pte_unmap_unlock(ptep, ptl);
		return NULL;
	}

	page = vm_normal_page(vma, address, pte);
	if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
		/*
		 * Only return device mapping pages in the FOLL_GET case since
		 * they are only valid while holding the pgmap reference.
		 */
		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
		if (*pgmap)
			page = pte_page(pte);
		else
			goto no_page;
	} else if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			int ret;

			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}

	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
		int ret;
		get_page(page);
		pte_unmap_unlock(ptep, ptl);
		lock_page(page);
		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);
		if (ret)
			return ERR_PTR(ret);
		goto retry;
	}

	if (flags & FOLL_GET) {
		if (unlikely(!try_get_page(page))) {
			page = ERR_PTR(-ENOMEM);
			goto out;
		}
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/* Do not mlock pte-mapped THP */
		if (PageTransCompound(page))
			goto out;

		/*
		 * The preliminary mapping check is mainly to avoid the
		 * pointless overhead of lock_page on the ZERO_PAGE
		 * which might bounce very badly if there is contention.
		 *
		 * If the page is already locked, we don't need to
		 * handle it now - vmscan will handle it later if and
		 * when it attempts to reclaim the page.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();	/* push cached pages to LRU */
			/*
			 * Because we lock page here, and migration is
			 * blocked by the pte's page reference, and we
			 * know the page is still mapped, we don't even
			 * need to check for file-cache page truncation.
			 */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}

static struct page *follow_pmd_mask(struct vm_area_struct *vma,
				    unsigned long address, pud_t *pudp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pmd_t *pmd, pmdval;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pmd = pmd_offset(pudp, address);
	/*
	 * The READ_ONCE() will stabilize the pmdval in a register or
	 * on the stack so that it will stop changing under the code.
	 */
	pmdval = READ_ONCE(*pmd);
	if (pmd_none(pmdval))
		return no_page_table(vma, flags);
	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
		page = follow_huge_pmd(mm, address, pmd, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pmd_val(pmdval)), flags,
				      PMD_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
retry:
	if (!pmd_present(pmdval)) {
		if (likely(!(flags & FOLL_MIGRATION)))
			return no_page_table(vma, flags);
		VM_BUG_ON(thp_migration_supported() &&
			  !is_pmd_migration_entry(pmdval));
		if (is_pmd_migration_entry(pmdval))
			pmd_migration_entry_wait(mm, pmd);
		pmdval = READ_ONCE(*pmd);
		/*
		 * MADV_DONTNEED may convert the pmd to null because
		 * mmap_sem is held in read mode
		 */
		if (pmd_none(pmdval))
			return no_page_table(vma, flags);
		goto retry;
	}
	if (pmd_devmap(pmdval)) {
		ptl = pmd_lock(mm, pmd);
		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (likely(!pmd_trans_huge(pmdval)))
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

	if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
		return no_page_table(vma, flags);

retry_locked:
	ptl = pmd_lock(mm, pmd);
	if (unlikely(pmd_none(*pmd))) {
		spin_unlock(ptl);
		return no_page_table(vma, flags);
	}
	if (unlikely(!pmd_present(*pmd))) {
		spin_unlock(ptl);
		if (likely(!(flags & FOLL_MIGRATION)))
			return no_page_table(vma, flags);
		pmd_migration_entry_wait(mm, pmd);
		goto retry_locked;
	}
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
		int ret;
		page = pmd_page(*pmd);
		if (is_huge_zero_page(page)) {
			spin_unlock(ptl);
			ret = 0;
			split_huge_pmd(vma, pmd, address);
			if (pmd_trans_unstable(pmd))
				ret = -EBUSY;
		} else if (flags & FOLL_SPLIT) {
			if (unlikely(!try_get_page(page))) {
				spin_unlock(ptl);
				return ERR_PTR(-ENOMEM);
			}
			spin_unlock(ptl);
			lock_page(page);
			ret = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (pmd_none(*pmd))
				return no_page_table(vma, flags);
		} else {	/* flags & FOLL_SPLIT_PMD */
			spin_unlock(ptl);
			split_huge_pmd(vma, pmd, address);
			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
		}

		return ret ? ERR_PTR(ret) :
			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	page = follow_trans_huge_pmd(vma, address, pmd, flags);
	spin_unlock(ptl);
	ctx->page_mask = HPAGE_PMD_NR - 1;
	return page;
}

static struct page *follow_pud_mask(struct vm_area_struct *vma,
				    unsigned long address, p4d_t *p4dp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pud_t *pud;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pud = pud_offset(p4dp, address);
	if (pud_none(*pud))
		return no_page_table(vma, flags);
	if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
		page = follow_huge_pud(mm, address, pud, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pud_val(*pud)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pud_val(*pud)), flags,
				      PUD_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (pud_devmap(*pud)) {
		ptl = pud_lock(mm, pud);
		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (unlikely(pud_bad(*pud)))
		return no_page_table(vma, flags);

	return follow_pmd_mask(vma, address, pud, flags, ctx);
}

static struct page *follow_p4d_mask(struct vm_area_struct *vma,
				    unsigned long address, pgd_t *pgdp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	p4d_t *p4d;
	struct page *page;

	p4d = p4d_offset(pgdp, address);
	if (p4d_none(*p4d))
		return no_page_table(vma, flags);
	BUILD_BUG_ON(p4d_huge(*p4d));
	if (unlikely(p4d_bad(*p4d)))
		return no_page_table(vma, flags);

	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(p4d_val(*p4d)), flags,
				      P4D_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	return follow_pud_mask(vma, address, p4d, flags, ctx);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 *       pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 *
 * On output, the @ctx->page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      struct follow_page_context *ctx)
{
	pgd_t *pgd;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	ctx->page_mask = 0;

	/* make this handle hugepd */
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		BUG_ON(flags & FOLL_GET);
		return page;
	}

	pgd = pgd_offset(mm, address);

	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		return no_page_table(vma, flags);

	if (pgd_huge(*pgd)) {
		page = follow_huge_pgd(mm, address, pgd, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pgd_val(*pgd)), flags,
				      PGDIR_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}

	return follow_p4d_mask(vma, address, pgd, flags, ctx);
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			 unsigned int foll_flags)
{
	struct follow_page_context ctx = { NULL };
	struct page *page;

	page = follow_page_mask(vma, address, foll_flags, &ctx);
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return page;
}

static int get_gate_page(struct mm_struct *mm, unsigned long address,
		unsigned int gup_flags, struct vm_area_struct **vma,
		struct page **page)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret = -EFAULT;

	/* user gate pages are read-only */
	if (gup_flags & FOLL_WRITE)
		return -EFAULT;
	if (address > TASK_SIZE)
		pgd = pgd_offset_k(address);
	else
		pgd = pgd_offset_gate(mm, address);
	if (pgd_none(*pgd))
		return -EFAULT;
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return -EFAULT;
	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return -EFAULT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return -EFAULT;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	pte = pte_offset_map(pmd, address);
	if (pte_none(*pte))
		goto unmap;
	*vma = get_gate_vma(mm);
	if (!page)
		goto out;
	*page = vm_normal_page(*vma, address, *pte);
	if (!*page) {
		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
			goto unmap;
		*page = pte_page(*pte);
	}
	if (unlikely(!try_get_page(*page))) {
		ret = -ENOMEM;
		goto unmap;
	}
out:
	ret = 0;
unmap:
	pte_unmap(pte);
	return ret;
}

/*
 * mmap_sem must be held on entry.  If @nonblocking != NULL and
 * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
 * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
 */
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
		unsigned long address, unsigned int *flags, int *nonblocking)
{
	unsigned int fault_flags = 0;
	vm_fault_t ret;

	/* mlock all present pages, but do not fault in new pages */
	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
		return -ENOENT;
	if (*flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (*flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (nonblocking)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
	if (*flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (*flags & FOLL_TRIED) {
		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
		fault_flags |= FAULT_FLAG_TRIED;
	}

	ret = handle_mm_fault(vma, address, fault_flags);
	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, *flags);

		if (err)
			return err;
		BUG();
	}

	if (tsk) {
		if (ret & VM_FAULT_MAJOR)
			tsk->maj_flt++;
		else
			tsk->min_flt++;
	}

	if (ret & VM_FAULT_RETRY) {
		if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
			*nonblocking = 0;
		return -EBUSY;
	}

	/*
	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
	 * can thus safely do subsequent page lookups as if they were reads.
	 * But only do so when looping for pte_write is futile: in some cases
	 * userspace may also be wanting to write to the gotten user page,
	 * which a read fault here might prevent (a readonly page might get
	 * reCOWed by userspace write).
	 */
	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
		*flags |= FOLL_COW;
	return 0;
}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
	vm_flags_t vm_flags = vma->vm_flags;
	int write = (gup_flags & FOLL_WRITE);
	int foreign = (gup_flags & FOLL_REMOTE);

	if (vm_flags & (VM_IO | VM_PFNMAP))
		return -EFAULT;

	if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
		return -EFAULT;

	if (write) {
		if (!(vm_flags & VM_WRITE)) {
			if (!(gup_flags & FOLL_FORCE))
				return -EFAULT;
			/*
			 * We used to let the write,force case do COW in a
			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
			 * set a breakpoint in a read-only mapping of an
			 * executable, without corrupting the file (yet only
			 * when that file had been opened for writing!).
			 * Anon pages in shared mappings are surprising: now
			 * just reject it.
			 */
			if (!is_cow_mapping(vm_flags))
				return -EFAULT;
		}
	} else if (!(vm_flags & VM_READ)) {
		if (!(gup_flags & FOLL_FORCE))
			return -EFAULT;
		/*
		 * Is there actually any vma we can reach here which does not
		 * have VM_MAYREAD set?
		 */
		if (!(vm_flags & VM_MAYREAD))
			return -EFAULT;
	}
	/*
	 * gups are always data accesses, not instruction
	 * fetches, so execute=false here
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return -EFAULT;
	return 0;
}

/**
 * __get_user_pages() - pin user pages in memory
 * @tsk:	task_struct of target task
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @nonblocking: whether waiting for disk IO or mmap_sem contention
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * @vmas are valid only as long as mmap_sem is held.
 *
 * Must be called with mmap_sem held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 * or mmap_sem contention, and if waiting is needed to pin all pages,
 * *@nonblocking will be set to 0.  Further, if @gup_flags does not
 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
 * this case.
 *
 * A caller using such a combination of @nonblocking and @gup_flags
 * must therefore hold the mmap_sem for reading only, and recognize
 * when it's been released.  Otherwise, it must be held for either
 * reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *nonblocking)
{
	long ret = 0, i = 0;
	struct vm_area_struct *vma = NULL;
	struct follow_page_context ctx = { NULL };

	if (!nr_pages)
		return 0;

	start = untagged_addr(start);

	VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

	/*
	 * If FOLL_FORCE is set then do not force a full fault as the hinting
	 * fault information is unrelated to the reference behaviour of a task
	 * using the address space
	 */
	if (!(gup_flags & FOLL_FORCE))
		gup_flags |= FOLL_NUMA;

	do {
		struct page *page;
		unsigned int foll_flags = gup_flags;
		unsigned int page_increm;

		/* first iteration or cross vma bound */
		if (!vma || start >= vma->vm_end) {
			vma = find_extend_vma(mm, start);
			if (!vma && in_gate_area(mm, start)) {
				ret = get_gate_page(mm, start & PAGE_MASK,
						gup_flags, &vma,
						pages ? &pages[i] : NULL);
				if (ret)
					goto out;
				ctx.page_mask = 0;
				goto next_page;
			}

			if (!vma || check_vma_flags(vma, gup_flags)) {
				ret = -EFAULT;
				goto out;
			}
			if (is_vm_hugetlb_page(vma)) {
				i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &nr_pages, i,
						gup_flags, nonblocking);
				continue;
			}
		}
retry:
		/*
		 * If we have a pending SIGKILL, don't keep faulting pages and
		 * potentially allocating memory.
		 */
		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			goto out;
		}
		cond_resched();

		page = follow_page_mask(vma, start, foll_flags, &ctx);
		if (!page) {
			ret = faultin_page(tsk, vma, start, &foll_flags,
					nonblocking);
			switch (ret) {
			case 0:
				goto retry;
			case -EBUSY:
				ret = 0;
				/* FALLTHRU */
			case -EFAULT:
			case -ENOMEM:
			case -EHWPOISON:
				goto out;
			case -ENOENT:
				goto next_page;
			}
			BUG();
		} else if (PTR_ERR(page) == -EEXIST) {
			/*
			 * Proper page table entry exists, but no corresponding
			 * struct page.
			 */
			goto next_page;
		} else if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}
		if (pages) {
			pages[i] = page;
			flush_anon_page(vma, page, start);
			flush_dcache_page(page);
			ctx.page_mask = 0;
		}
next_page:
		if (vmas) {
			vmas[i] = vma;
			ctx.page_mask = 0;
		}
		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
		if (page_increm > nr_pages)
			page_increm = nr_pages;
		i += page_increm;
		start += page_increm * PAGE_SIZE;
		nr_pages -= page_increm;
	} while (nr_pages);
out:
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return i ? i : ret;
}

static bool vma_permits_fault(struct vm_area_struct *vma,
			      unsigned int fault_flags)
{
	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;

	if (!(vm_flags & vma->vm_flags))
		return false;

	/*
	 * The architecture might have a hardware protection
	 * mechanism other than read/write that can deny access.
	 *
	 * gup always represents data access, not instruction
	 * fetches, so execute=false here:
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return false;

	return true;
}

/*
 * fixup_user_fault() - manually resolve a user page fault
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @address:	user address
 * @fault_flags:flags to pass down to handle_mm_fault()
 * @unlocked:	did we unlock the mmap_sem while retrying, maybe NULL if caller
 *		does not allow retry
 *
 * This is meant to be called in the specific scenario where for locking reasons
 * we try to access user memory in atomic context (within a pagefault_disable()
 * section), this returns -EFAULT, and we want to resolve the user fault before
 * trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software.  On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This function will not return with an unlocked mmap_sem. So it has not the
 * same semantics wrt the @mm->mmap_sem as does filemap_fault().
 */
int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long address, unsigned int fault_flags,
		     bool *unlocked)
{
	struct vm_area_struct *vma;
	vm_fault_t ret, major = 0;

	address = untagged_addr(address);

	if (unlocked)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY;

retry:
	vma = find_extend_vma(mm, address);
	if (!vma || address < vma->vm_start)
		return -EFAULT;

	if (!vma_permits_fault(vma, fault_flags))
		return -EFAULT;

	ret = handle_mm_fault(vma, address, fault_flags);
	major |= ret & VM_FAULT_MAJOR;
	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, 0);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		down_read(&mm->mmap_sem);
		if (!(fault_flags & FAULT_FLAG_TRIED)) {
			*unlocked = true;
			fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
			fault_flags |= FAULT_FLAG_TRIED;
			goto retry;
		}
	}

	if (tsk) {
		if (major)
			tsk->maj_flt++;
		else
			tsk->min_flt++;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(fixup_user_fault);
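
/*
 * Illustrative sketch of the retry pattern described above, loosely modeled
 * on the futex fault-in path; the surrounding label and variables are
 * placeholders, not a real caller:
 *
 * retry:
 *	pagefault_disable();
 *	ret = __put_user(val, uaddr);
 *	pagefault_enable();
 *	if (ret) {
 *		down_read(&mm->mmap_sem);
 *		ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
 *				       FAULT_FLAG_WRITE, NULL);
 *		up_read(&mm->mmap_sem);
 *		if (!ret)
 *			goto retry;
 *	}
 */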

static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
						struct mm_struct *mm,
						unsigned long start,
						unsigned long nr_pages,
						struct page **pages,
						struct vm_area_struct **vmas,
						int *locked,
						unsigned int flags)
{
	long ret, pages_done;
	bool lock_dropped;

	if (locked) {
		/* if VM_FAULT_RETRY can be returned, vmas become invalid */
		BUG_ON(vmas);
		/* check caller initialized locked */
		BUG_ON(*locked != 1);
	}

	/*
	 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
	 * is to set FOLL_GET if the caller wants pages[] filled in (but has
	 * carelessly failed to specify FOLL_GET), so keep doing that, but only
	 * for FOLL_GET, not for the newer FOLL_PIN.
	 *
	 * FOLL_PIN always expects pages to be non-null, but no need to assert
	 * that here, as any failures will be obvious enough.
	 */
	if (pages && !(flags & FOLL_PIN))
		flags |= FOLL_GET;

	pages_done = 0;
	lock_dropped = false;
	for (;;) {
		ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
				       vmas, locked);
		if (!locked)
			/* VM_FAULT_RETRY couldn't trigger, bypass */
			return ret;

		/* VM_FAULT_RETRY cannot return errors */
		if (!*locked) {
			BUG_ON(ret < 0);
			BUG_ON(ret >= nr_pages);
		}

		if (ret > 0) {
			nr_pages -= ret;
			pages_done += ret;
			if (!nr_pages)
				break;
		}
		if (*locked) {
			/*
			 * VM_FAULT_RETRY didn't trigger or it was a
			 * FOLL_NOWAIT.
			 */
			if (!pages_done)
				pages_done = ret;
			break;
		}
		/*
		 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
		 * For the prefault case (!pages) we only update counts.
		 */
		if (likely(pages))
			pages += ret;
		start += ret << PAGE_SHIFT;

		/*
		 * Repeat on the address that fired VM_FAULT_RETRY
		 * without FAULT_FLAG_ALLOW_RETRY but with
		 * FAULT_FLAG_TRIED.
		 */
		*locked = 1;
		lock_dropped = true;
		down_read(&mm->mmap_sem);
		ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
				       pages, NULL, NULL);
		if (ret != 1) {
			BUG_ON(ret > 1);
			if (!pages_done)
				pages_done = ret;
			break;
		}
		nr_pages--;
		pages_done++;
		if (!nr_pages)
			break;
		if (likely(pages))
			pages++;
		start += PAGE_SIZE;
	}
	if (lock_dropped && *locked) {
		/*
		 * We must let the caller know we temporarily dropped the lock
		 * and so the critical section protected by it was lost.
		 */
		up_read(&mm->mmap_sem);
		*locked = 0;
	}
	return pages_done;
}

/**
 * populate_vma_page_range() -  populate a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @nonblocking:
 *
 * This takes care of mlocking the pages too if VM_LOCKED is set.
 *
 * return 0 on success, negative error code on error.
 *
 * vma->vm_mm->mmap_sem must be held.
 *
 * If @nonblocking is NULL, it may be held for read or write and will
 * be unperturbed.
 *
 * If @nonblocking is non-NULL, it must be held for read only and may be
 * released.  If it's released, *@nonblocking will be set to 0.
 */
long populate_vma_page_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end, int *nonblocking)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long nr_pages = (end - start) / PAGE_SIZE;
	int gup_flags;

	VM_BUG_ON(start & ~PAGE_MASK);
	VM_BUG_ON(end   & ~PAGE_MASK);
	VM_BUG_ON_VMA(start < vma->vm_start, vma);
	VM_BUG_ON_VMA(end   > vma->vm_end, vma);
	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);

	gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
	if (vma->vm_flags & VM_LOCKONFAULT)
		gup_flags &= ~FOLL_POPULATE;
	/*
	 * We want to touch writable mappings with a write fault in order
	 * to break COW, except for shared mappings because these don't COW
	 * and we would not want to dirty them for nothing.
	 */
	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
		gup_flags |= FOLL_WRITE;

	/*
	 * We want mlock to succeed for regions that have any permissions
	 * other than PROT_NONE.
	 */
	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
		gup_flags |= FOLL_FORCE;

	/*
	 * We made sure addr is within a VMA, so the following will
	 * not result in a stack expansion that recurses back here.
	 */
	return __get_user_pages(current, mm, start, nr_pages, gup_flags,
				NULL, NULL, nonblocking);
}

/*
 * __mm_populate - populate and/or mlock pages within a range of address space.
 *
 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
 * flags. VMAs must be already marked with the desired vm_flags, and
 * mmap_sem must not be held.
 */
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
	struct mm_struct *mm = current->mm;
	unsigned long end, nstart, nend;
	struct vm_area_struct *vma = NULL;
	int locked = 0;
	long ret = 0;

	end = start + len;

	for (nstart = start; nstart < end; nstart = nend) {
		/*
		 * We want to fault in pages for [nstart; end) address range.
		 * Find first corresponding VMA.
		 */
		if (!locked) {
			locked = 1;
			down_read(&mm->mmap_sem);
			vma = find_vma(mm, nstart);
		} else if (nstart >= vma->vm_end)
			vma = vma->vm_next;
		if (!vma || vma->vm_start >= end)
			break;
		/*
		 * Set [nstart; nend) to intersection of desired address
		 * range with the first VMA. Also, skip undesirable VMA types.
		 */
		nend = min(end, vma->vm_end);
		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
			continue;
		if (nstart < vma->vm_start)
			nstart = vma->vm_start;
		/*
		 * Now fault in a range of pages. populate_vma_page_range()
		 * double checks the vma flags, so that it won't mlock pages
		 * if the vma was already munlocked.
		 */
		ret = populate_vma_page_range(vma, nstart, nend, &locked);
		if (ret < 0) {
			if (ignore_errors) {
				ret = 0;
				continue;	/* continue at next VMA */
			}
			break;
		}
		nend = nstart + ret * PAGE_SIZE;
		ret = 0;
	}
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;	/* 0 or negative error code */
}
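
/*
 * Rough sketch of how the mlock() path uses __mm_populate() (argument
 * checking and VMA flag updates trimmed; see do_mlock() in mm/mlock.c for
 * the real thing):
 *
 *	len = PAGE_ALIGN(len + (offset_in_page(start)));
 *	start &= PAGE_MASK;
 *	... mark the VMAs covering [start, start + len) with VM_LOCKED ...
 *	error = __mm_populate(start, len, 0);
 */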

/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted into
 * the corefile, to preserve alignment with its headers; and also returns
 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 * allowing a hole to be left in the corefile to save diskspace.
 *
 * Called without mmap_sem, but after all other threads have been killed.
 */
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page;

	if (__get_user_pages(current, current->mm, addr, 1,
			     FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
			     NULL) < 1)
		return NULL;
	flush_cache_page(vma, addr, page_to_pfn(page));
	return page;
}
#endif /* CONFIG_ELF_CORE */
#else /* CONFIG_MMU */
static long __get_user_pages_locked(struct task_struct *tsk,
		struct mm_struct *mm, unsigned long start,
		unsigned long nr_pages, struct page **pages,
		struct vm_area_struct **vmas, int *locked,
		unsigned int foll_flags)
{
	struct vm_area_struct *vma;
	unsigned long vm_flags;
	int i;

	/* calculate required read or write permissions.
	 * If FOLL_FORCE is set, we only require the "MAY" flags.
	 */
	vm_flags  = (foll_flags & FOLL_WRITE) ?
			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= (foll_flags & FOLL_FORCE) ?
			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);

	for (i = 0; i < nr_pages; i++) {
		vma = find_vma(mm, start);
		if (!vma)
			goto finish_or_fault;

		/* protect what we can, including chardevs */
		if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    !(vm_flags & vma->vm_flags))
			goto finish_or_fault;

		if (pages) {
			pages[i] = virt_to_page(start);
			if (pages[i])
				get_page(pages[i]);
		}
		if (vmas)
			vmas[i] = vma;
		start = (start + PAGE_SIZE) & PAGE_MASK;
	}

	return i;

finish_or_fault:
	return i ? : -EFAULT;
}
#endif /* !CONFIG_MMU */

#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
{
	long i;
	struct vm_area_struct *vma_prev = NULL;

	for (i = 0; i < nr_pages; i++) {
		struct vm_area_struct *vma = vmas[i];

		if (vma == vma_prev)
			continue;

		vma_prev = vma;

		if (vma_is_fsdax(vma))
			return true;
	}
	return false;
}

#ifdef CONFIG_CMA
static struct page *new_non_cma_page(struct page *page, unsigned long private)
{
	/*
	 * We want to make sure we allocate the new page from the same node
	 * as the source page.
	 */
	int nid = page_to_nid(page);
	/*
	 * Trying to allocate a page for migration. Ignore allocation
	 * failure warnings. We don't force __GFP_THISNODE here because
	 * this node here is the node where we have the CMA reservation and
	 * in some cases these nodes will have very little non-movable
	 * allocation memory.
	 */
	gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;

	if (PageHighMem(page))
		gfp_mask |= __GFP_HIGHMEM;

#ifdef CONFIG_HUGETLB_PAGE
	if (PageHuge(page)) {
		struct hstate *h = page_hstate(page);
		/*
		 * We don't want to dequeue from the pool because pool pages will
		 * mostly be from the CMA region.
		 */
		return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
	}
#endif
	if (PageTransHuge(page)) {
		struct page *thp;
		/*
		 * ignore allocation failure warnings
		 */
		gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;

		/*
		 * Remove the movable mask so that we don't allocate from
		 * CMA area again.
		 */
		thp_gfpmask &= ~__GFP_MOVABLE;
		thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
		if (!thp)
			return NULL;
		prep_transhuge_page(thp);
		return thp;
	}

	return __alloc_pages_node(nid, gfp_mask, 0);
}

static long check_and_migrate_cma_pages(struct task_struct *tsk,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long nr_pages,
					struct page **pages,
					struct vm_area_struct **vmas,
					unsigned int gup_flags)
{
	unsigned long i;
	unsigned long step;
	bool drain_allow = true;
	bool migrate_allow = true;
	LIST_HEAD(cma_page_list);
	long ret = nr_pages;

check_again:
	for (i = 0; i < nr_pages;) {

		struct page *head = compound_head(pages[i]);

		/*
		 * gup may start from a tail page. Advance step by the left
		 * part.
		 */
		step = compound_nr(head) - (pages[i] - head);
		/*
		 * If we get a page from the CMA zone, since we are going to
		 * be pinning these entries, we might as well move them out
		 * of the CMA zone if possible.
		 */
		if (is_migrate_cma_page(head)) {
			if (PageHuge(head))
				isolate_huge_page(head, &cma_page_list);
			else {
				if (!PageLRU(head) && drain_allow) {
					lru_add_drain_all();
					drain_allow = false;
				}

				if (!isolate_lru_page(head)) {
					list_add_tail(&head->lru, &cma_page_list);
					mod_node_page_state(page_pgdat(head),
							    NR_ISOLATED_ANON +
							    page_is_file_cache(head),
							    hpage_nr_pages(head));
				}
			}
		}

		i += step;
	}

	if (!list_empty(&cma_page_list)) {
		/*
		 * drop the above get_user_pages reference.
		 */
		for (i = 0; i < nr_pages; i++)
			put_page(pages[i]);

		if (migrate_pages(&cma_page_list, new_non_cma_page,
				  NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
			/*
			 * some of the pages failed migration. Do get_user_pages
			 * without migration.
			 */
			migrate_allow = false;

			if (!list_empty(&cma_page_list))
				putback_movable_pages(&cma_page_list);
		}
		/*
		 * We did migrate all the pages; try to get the page references
		 * again, migrating any new CMA pages which we failed to isolate
		 * earlier.
		 */
		ret = __get_user_pages_locked(tsk, mm, start, nr_pages,
						   pages, vmas, NULL,
						   gup_flags);

		if ((ret > 0) && migrate_allow) {
			nr_pages = ret;
			drain_allow = true;
			goto check_again;
		}
	}

	return ret;
}
#else
static long check_and_migrate_cma_pages(struct task_struct *tsk,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long nr_pages,
					struct page **pages,
					struct vm_area_struct **vmas,
					unsigned int gup_flags)
{
	return nr_pages;
}
#endif /* CONFIG_CMA */
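
/*
 * Hedged example of the kind of long-term pin that the FOLL_LONGTERM
 * handling below exists for: a driver that keeps user pages mapped for DMA
 * well beyond a single call might do something along these lines (names
 * are placeholders, not an in-tree caller):
 *
 *	pinned = get_user_pages(user_addr, npages,
 *				FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
 *	if (pinned < 0)
 *		return pinned;
 *	... program the device with the page addresses ...
 *
 * and release each page with put_page() once the device is done with it.
 */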

/*
 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
 * allows us to process the FOLL_LONGTERM flag.
 */
static long __gup_longterm_locked(struct task_struct *tsk,
				  struct mm_struct *mm,
				  unsigned long start,
				  unsigned long nr_pages,
				  struct page **pages,
				  struct vm_area_struct **vmas,
				  unsigned int gup_flags)
{
	struct vm_area_struct **vmas_tmp = vmas;
	unsigned long flags = 0;
	long rc, i;

	if (gup_flags & FOLL_LONGTERM) {
		if (!pages)
			return -EINVAL;

		if (!vmas_tmp) {
			vmas_tmp = kcalloc(nr_pages,
					   sizeof(struct vm_area_struct *),
					   GFP_KERNEL);
			if (!vmas_tmp)
				return -ENOMEM;
		}
		flags = memalloc_nocma_save();
	}

	rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
				     vmas_tmp, NULL, gup_flags);

	if (gup_flags & FOLL_LONGTERM) {
		memalloc_nocma_restore(flags);
		if (rc < 0)
			goto out;

		if (check_dax_vmas(vmas_tmp, rc)) {
			for (i = 0; i < rc; i++)
				put_page(pages[i]);
			rc = -EOPNOTSUPP;
			goto out;
		}

		rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
						 vmas_tmp, gup_flags);
	}

out:
	if (vmas_tmp != vmas)
		kfree(vmas_tmp);
	return rc;
}
#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long nr_pages,
						  struct page **pages,
						  struct vm_area_struct **vmas,
						  unsigned int flags)
{
	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
				       NULL, flags);
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */

/*
 * get_user_pages_remote() - pin user pages in memory
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @locked:	pointer to lock flag indicating whether lock is held and
 *		subsequently whether VM_FAULT_RETRY functionality can be
 *		utilised. Lock must initially be held.
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * @vmas are valid only as long as mmap_sem is held.
 *
 * Must be called with mmap_sem held for read or write.
 *
 * get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
 * be called after the page is finished with, and before put_page is called.
 *
 * get_user_pages is typically used for fewer-copy IO operations, to get a
 * handle on the memory by some means other than accesses via the user virtual
 * addresses. The pages may be submitted for DMA to devices or accessed via
 * their kernel linear mapping (via the kmap APIs). Care should be taken to
 * use the correct cache flushing APIs.
 *
 * See also get_user_pages_fast, for performance critical applications.
 *
 * get_user_pages should be phased out in favor of
 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
 * should use get_user_pages because it cannot pass
 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
 */
#ifdef CONFIG_MMU
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *locked)
{
	/*
	 * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
	 * never directly by the caller, so enforce that with an assertion:
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
		return -EINVAL;

	/*
	 * Parts of FOLL_LONGTERM behavior are incompatible with
	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
	 * vmas. However, this only comes up if locked is set, and there are
	 * callers that do request FOLL_LONGTERM, but do not set locked. So,
	 * allow what we can.
	 */
	if (gup_flags & FOLL_LONGTERM) {
		if (WARN_ON_ONCE(locked))
			return -EINVAL;
		/*
		 * This will check the vmas (even if our vmas arg is NULL)
		 * and return -ENOTSUPP if DAX isn't allowed in this case:
		 */
		return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
					     vmas, gup_flags | FOLL_TOUCH |
					     FOLL_REMOTE);
	}

	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
				       locked,
				       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);

#else /* CONFIG_MMU */
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   struct vm_area_struct **vmas, int *locked)
{
	return 0;
}
#endif /* !CONFIG_MMU */
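
/*
 * Sketch of a typical get_user_pages_remote() call, in the spirit of the
 * ptrace/access_remote_vm() style of use (error handling elided; the
 * surrounding variables are assumed, not shown here):
 *
 *	down_read(&mm->mmap_sem);
 *	pinned = get_user_pages_remote(tsk, mm, addr, 1,
 *				       write ? (FOLL_WRITE | FOLL_FORCE) : FOLL_FORCE,
 *				       &page, &vma, NULL);
 *	up_read(&mm->mmap_sem);
 *	if (pinned == 1) {
 *		... kmap() the page, copy the data, mark it dirty if written ...
 *		put_page(page);
 *	}
 */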

/*
 * This is the same as get_user_pages_remote(), just with a
 * less-flexible calling convention where we assume that the task
 * and mm being operated on are the current task's and don't allow
 * passing of a locked parameter.  We also obviously don't pass
 * FOLL_REMOTE in here.
 */
long get_user_pages(unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas)
{
	/*
	 * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
	 * never directly by the caller, so enforce that with an assertion:
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
		return -EINVAL;

	return __gup_longterm_locked(current, current->mm, start, nr_pages,
				     pages, vmas, gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);

/*
 * We can leverage the VM_FAULT_RETRY functionality in the page fault
 * paths better by using either get_user_pages_locked() or
 * get_user_pages_unlocked().
 *
 * get_user_pages_locked() is suitable to replace the form:
 *
 *      down_read(&mm->mmap_sem);
 *      do_something()
 *      get_user_pages(tsk, mm, ..., pages, NULL);
 *      up_read(&mm->mmap_sem);
 *
 *  to:
 *
 *      int locked = 1;
 *      down_read(&mm->mmap_sem);
 *      do_something()
 *      get_user_pages_locked(tsk, mm, ..., pages, &locked);
 *      if (locked)
 *          up_read(&mm->mmap_sem);
 */
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	/*
	 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
	 * vmas.  As there are no users of this flag in this call we simply
	 * disallow this option for now.
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
		return -EINVAL;

	return __get_user_pages_locked(current, current->mm, start, nr_pages,
				       pages, NULL, locked,
				       gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_locked);

/*
 * get_user_pages_unlocked() is suitable to replace the form:
 *
 *      down_read(&mm->mmap_sem);
 *      get_user_pages(tsk, mm, ..., pages, NULL);
 *      up_read(&mm->mmap_sem);
 *
 *  with:
 *
 *      get_user_pages_unlocked(tsk, mm, ..., pages);
 *
 * It is functionally equivalent to get_user_pages_fast so
 * get_user_pages_fast should be used instead if specific gup_flags
 * (e.g. FOLL_FORCE) are not required.
 */
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
			     struct page **pages, unsigned int gup_flags)
{
	struct mm_struct *mm = current->mm;
	int locked = 1;
	long ret;

	/*
	 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
	 * vmas.  As there are no users of this flag in this call we simply
	 * disallow this option for now.
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
		return -EINVAL;

	down_read(&mm->mmap_sem);
	ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
				      &locked, gup_flags | FOLL_TOUCH);
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;
}
EXPORT_SYMBOL(get_user_pages_unlocked);
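
/*
 * A minimal usage sketch for the helper above, assuming a caller that just
 * wants a user buffer resident and pinned without managing mmap_sem itself:
 *
 *	npinned = get_user_pages_unlocked(user_addr, nr_pages, pages,
 *					  write ? FOLL_WRITE : 0);
 *
 * As with the other variants, each successfully pinned page must eventually
 * be released with put_page().
 */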

/*
 * Fast GUP
 *
 * get_user_pages_fast attempts to pin user pages by walking the page
 * tables directly and avoids taking locks. Thus the walker needs to be
 * protected from page table pages being freed from under it, and should
 * block any THP splits.
 *
 * One way to achieve this is to have the walker disable interrupts, and
 * rely on IPIs from the TLB flushing code blocking before the page table
 * pages are freed. This is unsuitable for architectures that do not need
 * to broadcast an IPI when invalidating TLBs.
 *
 * Another way to achieve this is to batch up page table containing pages
 * belonging to more than one mm_user, then rcu_sched a callback to free those
 * pages. Disabling interrupts will allow the fast_gup walker to both block
 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
 * (which is a relatively rare event). The code below adopts this strategy.
 *
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
 *      free pages containing page tables or TLB flushing requires IPI broadcast.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) access_ok is sufficient to validate userspace address ranges.
 *
 * The last two assumptions can be relaxed by the addition of helper functions.
 *
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 */
#ifdef CONFIG_HAVE_FAST_GUP
#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
/*
 * WARNING: only to be used in the get_user_pages_fast() implementation.
 *
 * With get_user_pages_fast(), we walk down the pagetables without taking any
 * locks.  For this we would like to load the pointers atomically, but sometimes
 * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE).  What
 * we do have is the guarantee that a PTE will only either go from not present
 * to present, or present to not present or both -- it will not switch to a
 * completely different present page without a TLB flush in between; something
 * that we are blocking by holding interrupts off.
 *
 * Setting ptes from not present to present goes:
 *
 *   ptep->pte_high = h;
 *   smp_wmb();
 *   ptep->pte_low = l;
 *
 * And present to not present goes:
 *
 *   ptep->pte_low = 0;
 *   smp_wmb();
 *   ptep->pte_high = 0;
 *
 * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
 * We load pte_high *after* loading pte_low, which ensures we don't see an older
 * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
 * picked up a changed pte high. We might have gotten rubbish values from
 * pte_low and pte_high, but we are guaranteed that pte_low will not have the
 * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
 * operates on present ptes we're safe.
 */
static inline pte_t gup_get_pte(pte_t *ptep)
{
	pte_t pte;

	do {
		pte.pte_low = ptep->pte_low;
		smp_rmb();
		pte.pte_high = ptep->pte_high;
		smp_rmb();
	} while (unlikely(pte.pte_low != ptep->pte_low));

	return pte;
}
#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
/*
 * We require that the PTE can be read atomically.
1855 */ 1856 static inline pte_t gup_get_pte(pte_t *ptep) 1857 { 1858 return READ_ONCE(*ptep); 1859 } 1860 #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ 1861 1862 static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, 1863 struct page **pages) 1864 { 1865 while ((*nr) - nr_start) { 1866 struct page *page = pages[--(*nr)]; 1867 1868 ClearPageReferenced(page); 1869 put_page(page); 1870 } 1871 } 1872 1873 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL 1874 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, 1875 unsigned int flags, struct page **pages, int *nr) 1876 { 1877 struct dev_pagemap *pgmap = NULL; 1878 int nr_start = *nr, ret = 0; 1879 pte_t *ptep, *ptem; 1880 1881 ptem = ptep = pte_offset_map(&pmd, addr); 1882 do { 1883 pte_t pte = gup_get_pte(ptep); 1884 struct page *head, *page; 1885 1886 /* 1887 * Similar to the PMD case below, NUMA hinting must take slow 1888 * path using the pte_protnone check. 1889 */ 1890 if (pte_protnone(pte)) 1891 goto pte_unmap; 1892 1893 if (!pte_access_permitted(pte, flags & FOLL_WRITE)) 1894 goto pte_unmap; 1895 1896 if (pte_devmap(pte)) { 1897 if (unlikely(flags & FOLL_LONGTERM)) 1898 goto pte_unmap; 1899 1900 pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); 1901 if (unlikely(!pgmap)) { 1902 undo_dev_pagemap(nr, nr_start, pages); 1903 goto pte_unmap; 1904 } 1905 } else if (pte_special(pte)) 1906 goto pte_unmap; 1907 1908 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 1909 page = pte_page(pte); 1910 1911 head = try_get_compound_head(page, 1); 1912 if (!head) 1913 goto pte_unmap; 1914 1915 if (unlikely(pte_val(pte) != pte_val(*ptep))) { 1916 put_page(head); 1917 goto pte_unmap; 1918 } 1919 1920 VM_BUG_ON_PAGE(compound_head(page) != head, page); 1921 1922 SetPageReferenced(page); 1923 pages[*nr] = page; 1924 (*nr)++; 1925 1926 } while (ptep++, addr += PAGE_SIZE, addr != end); 1927 1928 ret = 1; 1929 1930 pte_unmap: 1931 if (pgmap) 1932 put_dev_pagemap(pgmap); 1933 pte_unmap(ptem); 1934 return ret; 1935 } 1936 #else 1937 1938 /* 1939 * If we can't determine whether or not a pte is special, then fail immediately 1940 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not 1941 * to be special. 1942 * 1943 * For a futex to be placed on a THP tail page, get_futex_key requires a 1944 * __get_user_pages_fast implementation that can pin pages. Thus it's still 1945 * useful to have gup_huge_pmd even if we can't operate on ptes. 
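 * Returning 0 from the stub below makes gup_pmd_range() bail out for
 * pte-mapped ranges, leaving those pages to the slow GUP path, while huge
 * pages are still handled by gup_huge_pmd() and the other huge-page helpers.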
1946 */ 1947 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, 1948 unsigned int flags, struct page **pages, int *nr) 1949 { 1950 return 0; 1951 } 1952 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ 1953 1954 #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 1955 static int __gup_device_huge(unsigned long pfn, unsigned long addr, 1956 unsigned long end, struct page **pages, int *nr) 1957 { 1958 int nr_start = *nr; 1959 struct dev_pagemap *pgmap = NULL; 1960 1961 do { 1962 struct page *page = pfn_to_page(pfn); 1963 1964 pgmap = get_dev_pagemap(pfn, pgmap); 1965 if (unlikely(!pgmap)) { 1966 undo_dev_pagemap(nr, nr_start, pages); 1967 return 0; 1968 } 1969 SetPageReferenced(page); 1970 pages[*nr] = page; 1971 get_page(page); 1972 (*nr)++; 1973 pfn++; 1974 } while (addr += PAGE_SIZE, addr != end); 1975 1976 if (pgmap) 1977 put_dev_pagemap(pgmap); 1978 return 1; 1979 } 1980 1981 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, 1982 unsigned long end, struct page **pages, int *nr) 1983 { 1984 unsigned long fault_pfn; 1985 int nr_start = *nr; 1986 1987 fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 1988 if (!__gup_device_huge(fault_pfn, addr, end, pages, nr)) 1989 return 0; 1990 1991 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { 1992 undo_dev_pagemap(nr, nr_start, pages); 1993 return 0; 1994 } 1995 return 1; 1996 } 1997 1998 static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, 1999 unsigned long end, struct page **pages, int *nr) 2000 { 2001 unsigned long fault_pfn; 2002 int nr_start = *nr; 2003 2004 fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 2005 if (!__gup_device_huge(fault_pfn, addr, end, pages, nr)) 2006 return 0; 2007 2008 if (unlikely(pud_val(orig) != pud_val(*pudp))) { 2009 undo_dev_pagemap(nr, nr_start, pages); 2010 return 0; 2011 } 2012 return 1; 2013 } 2014 #else 2015 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, 2016 unsigned long end, struct page **pages, int *nr) 2017 { 2018 BUILD_BUG(); 2019 return 0; 2020 } 2021 2022 static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, 2023 unsigned long end, struct page **pages, int *nr) 2024 { 2025 BUILD_BUG(); 2026 return 0; 2027 } 2028 #endif 2029 2030 static int record_subpages(struct page *page, unsigned long addr, 2031 unsigned long end, struct page **pages) 2032 { 2033 int nr; 2034 2035 for (nr = 0; addr != end; addr += PAGE_SIZE) 2036 pages[nr++] = page++; 2037 2038 return nr; 2039 } 2040 2041 static void put_compound_head(struct page *page, int refs) 2042 { 2043 VM_BUG_ON_PAGE(page_ref_count(page) < refs, page); 2044 /* 2045 * Calling put_page() for each ref is unnecessarily slow. Only the last 2046 * ref needs a put_page(). 2047 */ 2048 if (refs > 1) 2049 page_ref_sub(page, refs - 1); 2050 put_page(page); 2051 } 2052 2053 #ifdef CONFIG_ARCH_HAS_HUGEPD 2054 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, 2055 unsigned long sz) 2056 { 2057 unsigned long __boundary = (addr + sz) & ~(sz-1); 2058 return (__boundary - 1 < end - 1) ? 
__boundary : end; 2059 } 2060 2061 static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, 2062 unsigned long end, unsigned int flags, 2063 struct page **pages, int *nr) 2064 { 2065 unsigned long pte_end; 2066 struct page *head, *page; 2067 pte_t pte; 2068 int refs; 2069 2070 pte_end = (addr + sz) & ~(sz-1); 2071 if (pte_end < end) 2072 end = pte_end; 2073 2074 pte = READ_ONCE(*ptep); 2075 2076 if (!pte_access_permitted(pte, flags & FOLL_WRITE)) 2077 return 0; 2078 2079 /* hugepages are never "special" */ 2080 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 2081 2082 head = pte_page(pte); 2083 page = head + ((addr & (sz-1)) >> PAGE_SHIFT); 2084 refs = record_subpages(page, addr, end, pages + *nr); 2085 2086 head = try_get_compound_head(head, refs); 2087 if (!head) 2088 return 0; 2089 2090 if (unlikely(pte_val(pte) != pte_val(*ptep))) { 2091 put_compound_head(head, refs); 2092 return 0; 2093 } 2094 2095 *nr += refs; 2096 SetPageReferenced(head); 2097 return 1; 2098 } 2099 2100 static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, 2101 unsigned int pdshift, unsigned long end, unsigned int flags, 2102 struct page **pages, int *nr) 2103 { 2104 pte_t *ptep; 2105 unsigned long sz = 1UL << hugepd_shift(hugepd); 2106 unsigned long next; 2107 2108 ptep = hugepte_offset(hugepd, addr, pdshift); 2109 do { 2110 next = hugepte_addr_end(addr, end, sz); 2111 if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr)) 2112 return 0; 2113 } while (ptep++, addr = next, addr != end); 2114 2115 return 1; 2116 } 2117 #else 2118 static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, 2119 unsigned int pdshift, unsigned long end, unsigned int flags, 2120 struct page **pages, int *nr) 2121 { 2122 return 0; 2123 } 2124 #endif /* CONFIG_ARCH_HAS_HUGEPD */ 2125 2126 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, 2127 unsigned long end, unsigned int flags, 2128 struct page **pages, int *nr) 2129 { 2130 struct page *head, *page; 2131 int refs; 2132 2133 if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) 2134 return 0; 2135 2136 if (pmd_devmap(orig)) { 2137 if (unlikely(flags & FOLL_LONGTERM)) 2138 return 0; 2139 return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr); 2140 } 2141 2142 page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 2143 refs = record_subpages(page, addr, end, pages + *nr); 2144 2145 head = try_get_compound_head(pmd_page(orig), refs); 2146 if (!head) 2147 return 0; 2148 2149 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { 2150 put_compound_head(head, refs); 2151 return 0; 2152 } 2153 2154 *nr += refs; 2155 SetPageReferenced(head); 2156 return 1; 2157 } 2158 2159 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, 2160 unsigned long end, unsigned int flags, struct page **pages, int *nr) 2161 { 2162 struct page *head, *page; 2163 int refs; 2164 2165 if (!pud_access_permitted(orig, flags & FOLL_WRITE)) 2166 return 0; 2167 2168 if (pud_devmap(orig)) { 2169 if (unlikely(flags & FOLL_LONGTERM)) 2170 return 0; 2171 return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr); 2172 } 2173 2174 page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 2175 refs = record_subpages(page, addr, end, pages + *nr); 2176 2177 head = try_get_compound_head(pud_page(orig), refs); 2178 if (!head) 2179 return 0; 2180 2181 if (unlikely(pud_val(orig) != pud_val(*pudp))) { 2182 put_compound_head(head, refs); 2183 return 0; 2184 } 2185 2186 *nr += refs; 2187 SetPageReferenced(head); 2188 return 1; 2189 } 2190 2191 static int gup_huge_pgd(pgd_t 
orig, pgd_t *pgdp, unsigned long addr, 2192 unsigned long end, unsigned int flags, 2193 struct page **pages, int *nr) 2194 { 2195 int refs; 2196 struct page *head, *page; 2197 2198 if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) 2199 return 0; 2200 2201 BUILD_BUG_ON(pgd_devmap(orig)); 2202 2203 page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); 2204 refs = record_subpages(page, addr, end, pages + *nr); 2205 2206 head = try_get_compound_head(pgd_page(orig), refs); 2207 if (!head) 2208 return 0; 2209 2210 if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { 2211 put_compound_head(head, refs); 2212 return 0; 2213 } 2214 2215 *nr += refs; 2216 SetPageReferenced(head); 2217 return 1; 2218 } 2219
2220 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, 2221 unsigned int flags, struct page **pages, int *nr) 2222 { 2223 unsigned long next; 2224 pmd_t *pmdp; 2225 2226 pmdp = pmd_offset(&pud, addr); 2227 do { 2228 pmd_t pmd = READ_ONCE(*pmdp); 2229 2230 next = pmd_addr_end(addr, end); 2231 if (!pmd_present(pmd)) 2232 return 0; 2233 2234 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || 2235 pmd_devmap(pmd))) { 2236 /* 2237 * NUMA hinting faults need to be handled in the GUP 2238 * slowpath for accounting purposes and so that they 2239 * can be serialised against THP migration. 2240 */ 2241 if (pmd_protnone(pmd)) 2242 return 0; 2243 2244 if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, 2245 pages, nr)) 2246 return 0; 2247 2248 } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) { 2249 /* 2250 * architectures may use a different pmd format for 2251 * hugetlbfs than for THP 2252 */ 2253 if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, 2254 PMD_SHIFT, next, flags, pages, nr)) 2255 return 0; 2256 } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr)) 2257 return 0; 2258 } while (pmdp++, addr = next, addr != end); 2259 2260 return 1; 2261 } 2262
2263 static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, 2264 unsigned int flags, struct page **pages, int *nr) 2265 { 2266 unsigned long next; 2267 pud_t *pudp; 2268 2269 pudp = pud_offset(&p4d, addr); 2270 do { 2271 pud_t pud = READ_ONCE(*pudp); 2272 2273 next = pud_addr_end(addr, end); 2274 if (unlikely(!pud_present(pud))) 2275 return 0; 2276 if (unlikely(pud_huge(pud))) { 2277 if (!gup_huge_pud(pud, pudp, addr, next, flags, 2278 pages, nr)) 2279 return 0; 2280 } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { 2281 if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, 2282 PUD_SHIFT, next, flags, pages, nr)) 2283 return 0; 2284 } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr)) 2285 return 0; 2286 } while (pudp++, addr = next, addr != end); 2287 2288 return 1; 2289 } 2290
2291 static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, 2292 unsigned int flags, struct page **pages, int *nr) 2293 { 2294 unsigned long next; 2295 p4d_t *p4dp; 2296 2297 p4dp = p4d_offset(&pgd, addr); 2298 do { 2299 p4d_t p4d = READ_ONCE(*p4dp); 2300 2301 next = p4d_addr_end(addr, end); 2302 if (p4d_none(p4d)) 2303 return 0; 2304 BUILD_BUG_ON(p4d_huge(p4d)); 2305 if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { 2306 if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, 2307 P4D_SHIFT, next, flags, pages, nr)) 2308 return 0; 2309 } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr)) 2310 return 0; 2311 } while (p4dp++, addr = next, addr != end); 2312 2313 return 1; 2314 } 2315
2316 static void gup_pgd_range(unsigned long addr, unsigned long end, 2317 unsigned int flags, struct
page **pages, int *nr) 2318 { 2319 unsigned long next; 2320 pgd_t *pgdp; 2321 2322 pgdp = pgd_offset(current->mm, addr); 2323 do { 2324 pgd_t pgd = READ_ONCE(*pgdp); 2325 2326 next = pgd_addr_end(addr, end); 2327 if (pgd_none(pgd)) 2328 return; 2329 if (unlikely(pgd_huge(pgd))) { 2330 if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, 2331 pages, nr)) 2332 return; 2333 } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { 2334 if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, 2335 PGDIR_SHIFT, next, flags, pages, nr)) 2336 return; 2337 } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr)) 2338 return; 2339 } while (pgdp++, addr = next, addr != end); 2340 } 2341 #else 2342 static inline void gup_pgd_range(unsigned long addr, unsigned long end, 2343 unsigned int flags, struct page **pages, int *nr) 2344 { 2345 } 2346 #endif /* CONFIG_HAVE_FAST_GUP */ 2347 2348 #ifndef gup_fast_permitted 2349 /* 2350 * Check if it's allowed to use __get_user_pages_fast() for the range, or 2351 * we need to fall back to the slow version: 2352 */ 2353 static bool gup_fast_permitted(unsigned long start, unsigned long end) 2354 { 2355 return true; 2356 } 2357 #endif 2358 2359 /* 2360 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to 2361 * the regular GUP. 2362 * Note a difference with get_user_pages_fast: this always returns the 2363 * number of pages pinned, 0 if no pages were pinned. 2364 * 2365 * If the architecture does not support this function, simply return with no 2366 * pages pinned. 2367 */ 2368 int __get_user_pages_fast(unsigned long start, int nr_pages, int write, 2369 struct page **pages) 2370 { 2371 unsigned long len, end; 2372 unsigned long flags; 2373 int nr = 0; 2374 2375 start = untagged_addr(start) & PAGE_MASK; 2376 len = (unsigned long) nr_pages << PAGE_SHIFT; 2377 end = start + len; 2378 2379 if (end <= start) 2380 return 0; 2381 if (unlikely(!access_ok((void __user *)start, len))) 2382 return 0; 2383 2384 /* 2385 * Disable interrupts. We use the nested form as we can already have 2386 * interrupts disabled by get_futex_key. 2387 * 2388 * With interrupts disabled, we block page table pages from being 2389 * freed from under us. See struct mmu_table_batch comments in 2390 * include/asm-generic/tlb.h for more details. 2391 * 2392 * We do not adopt an rcu_read_lock(.) here as we also want to 2393 * block IPIs that come from THPs splitting. 2394 */ 2395 2396 if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && 2397 gup_fast_permitted(start, end)) { 2398 local_irq_save(flags); 2399 gup_pgd_range(start, end, write ? 
FOLL_WRITE : 0, pages, &nr); 2400 local_irq_restore(flags); 2401 } 2402 2403 return nr; 2404 } 2405 EXPORT_SYMBOL_GPL(__get_user_pages_fast); 2406
2407 static int __gup_longterm_unlocked(unsigned long start, int nr_pages, 2408 unsigned int gup_flags, struct page **pages) 2409 { 2410 int ret; 2411 2412 /* 2413 * FIXME: FOLL_LONGTERM does not work with 2414 * get_user_pages_unlocked() (see comments in that function) 2415 */ 2416 if (gup_flags & FOLL_LONGTERM) { 2417 down_read(&current->mm->mmap_sem); 2418 ret = __gup_longterm_locked(current, current->mm, 2419 start, nr_pages, 2420 pages, NULL, gup_flags); 2421 up_read(&current->mm->mmap_sem); 2422 } else { 2423 ret = get_user_pages_unlocked(start, nr_pages, 2424 pages, gup_flags); 2425 } 2426 2427 return ret; 2428 } 2429
2430 static int internal_get_user_pages_fast(unsigned long start, int nr_pages, 2431 unsigned int gup_flags, 2432 struct page **pages) 2433 { 2434 unsigned long addr, len, end; 2435 int nr = 0, ret = 0; 2436 2437 if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | 2438 FOLL_FORCE | FOLL_PIN))) 2439 return -EINVAL; 2440 2441 start = untagged_addr(start) & PAGE_MASK; 2442 addr = start; 2443 len = (unsigned long) nr_pages << PAGE_SHIFT; 2444 end = start + len; 2445 2446 if (end <= start) 2447 return 0; 2448 if (unlikely(!access_ok((void __user *)start, len))) 2449 return -EFAULT; 2450 2451 if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && 2452 gup_fast_permitted(start, end)) { 2453 local_irq_disable(); 2454 gup_pgd_range(addr, end, gup_flags, pages, &nr); 2455 local_irq_enable(); 2456 ret = nr; 2457 } 2458 2459 if (nr < nr_pages) { 2460 /* Try to get the remaining pages with get_user_pages */ 2461 start += nr << PAGE_SHIFT; 2462 pages += nr; 2463 2464 ret = __gup_longterm_unlocked(start, nr_pages - nr, 2465 gup_flags, pages); 2466 2467 /* Have to be a bit careful with return values */ 2468 if (nr > 0) { 2469 if (ret < 0) 2470 ret = nr; 2471 else 2472 ret += nr; 2473 } 2474 } 2475 2476 return ret; 2477 } 2478
2479 /** 2480 * get_user_pages_fast() - pin user pages in memory 2481 * @start: starting user address 2482 * @nr_pages: number of pages from start to pin 2483 * @gup_flags: flags modifying pin behaviour 2484 * @pages: array that receives pointers to the pages pinned. 2485 * Should be at least nr_pages long. 2486 * 2487 * Attempt to pin user pages in memory without taking mm->mmap_sem. 2488 * If not successful, it will fall back to taking the lock and 2489 * calling get_user_pages(). 2490 * 2491 * Returns number of pages pinned. This may be fewer than the number requested. 2492 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns 2493 * -errno. 2494 */ 2495 int get_user_pages_fast(unsigned long start, int nr_pages, 2496 unsigned int gup_flags, struct page **pages) 2497 { 2498 /* 2499 * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, 2500 * never directly by the caller, so enforce that: 2501 */ 2502 if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) 2503 return -EINVAL; 2504 2505 return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); 2506 } 2507 EXPORT_SYMBOL_GPL(get_user_pages_fast); 2508
2509 /** 2510 * pin_user_pages_fast() - pin user pages in memory without taking locks 2511 * 2512 * For now, this is a placeholder function, until various call sites are 2513 * converted to use the correct get_user_pages*() or pin_user_pages*() API. So, 2514 * this is identical to get_user_pages_fast(). 2515 * 2516 * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst.
It 2517 * is NOT intended for Case 2 (RDMA: long-term pins). 2518 */ 2519 int pin_user_pages_fast(unsigned long start, int nr_pages, 2520 unsigned int gup_flags, struct page **pages) 2521 { 2522 /* 2523 * This is a placeholder, until the pin functionality is activated. 2524 * Until then, just behave like the corresponding get_user_pages*() 2525 * routine. 2526 */ 2527 return get_user_pages_fast(start, nr_pages, gup_flags, pages); 2528 } 2529 EXPORT_SYMBOL_GPL(pin_user_pages_fast); 2530 2531 /** 2532 * pin_user_pages_remote() - pin pages of a remote process (task != current) 2533 * 2534 * For now, this is a placeholder function, until various call sites are 2535 * converted to use the correct get_user_pages*() or pin_user_pages*() API. So, 2536 * this is identical to get_user_pages_remote(). 2537 * 2538 * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It 2539 * is NOT intended for Case 2 (RDMA: long-term pins). 2540 */ 2541 long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, 2542 unsigned long start, unsigned long nr_pages, 2543 unsigned int gup_flags, struct page **pages, 2544 struct vm_area_struct **vmas, int *locked) 2545 { 2546 /* 2547 * This is a placeholder, until the pin functionality is activated. 2548 * Until then, just behave like the corresponding get_user_pages*() 2549 * routine. 2550 */ 2551 return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, pages, 2552 vmas, locked); 2553 } 2554 EXPORT_SYMBOL(pin_user_pages_remote); 2555 2556 /** 2557 * pin_user_pages() - pin user pages in memory for use by other devices 2558 * 2559 * For now, this is a placeholder function, until various call sites are 2560 * converted to use the correct get_user_pages*() or pin_user_pages*() API. So, 2561 * this is identical to get_user_pages(). 2562 * 2563 * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It 2564 * is NOT intended for Case 2 (RDMA: long-term pins). 2565 */ 2566 long pin_user_pages(unsigned long start, unsigned long nr_pages, 2567 unsigned int gup_flags, struct page **pages, 2568 struct vm_area_struct **vmas) 2569 { 2570 /* 2571 * This is a placeholder, until the pin functionality is activated. 2572 * Until then, just behave like the corresponding get_user_pages*() 2573 * routine. 2574 */ 2575 return get_user_pages(start, nr_pages, gup_flags, pages, vmas); 2576 } 2577 EXPORT_SYMBOL(pin_user_pages); 2578
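
/*
 * Example (illustrative sketch only, not used anywhere in the kernel): a
 * minimal Case 1 (DIO-style) caller of the pin/unpin API above. The function
 * name, the fixed batch size and the elided I/O step are hypothetical; the
 * point is the pairing of pin_user_pages_fast() with
 * unpin_user_pages_dirty_lock() once data has been written into the pages.
 */
static int __maybe_unused example_pin_for_dio(unsigned long user_addr)
{
	struct page *pages[16];
	int pinned;

	/* Pin up to 16 pages that a device will write into. */
	pinned = pin_user_pages_fast(user_addr, ARRAY_SIZE(pages),
				     FOLL_WRITE, pages);
	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... hand pages[0..pinned-1] to the actual I/O here ... */

	/* The pages now hold new data, so mark them dirty while unpinning. */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return pinned;
}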