// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>

#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>

#include "internal.h"

struct follow_page_context {
	struct dev_pagemap *pgmap;
	unsigned int page_mask;
};

typedef int (*set_dirty_func_t)(struct page *page);

static void __put_user_pages_dirty(struct page **pages,
				   unsigned long npages,
				   set_dirty_func_t sdf)
{
	unsigned long index;

	for (index = 0; index < npages; index++) {
		struct page *page = compound_head(pages[index]);

		/*
		 * Checking PageDirty at this point may race with
		 * clear_page_dirty_for_io(), but that's OK. Two key cases:
		 *
		 * 1) This code sees the page as already dirty, so it skips
		 * the call to sdf(). That could happen because
		 * clear_page_dirty_for_io() called page_mkclean(),
		 * followed by set_page_dirty(). However, now the page is
		 * going to get written back, which meets the original
		 * intention of setting it dirty, so all is well:
		 * clear_page_dirty_for_io() goes on to call
		 * TestClearPageDirty(), and write the page back.
		 *
		 * 2) This code sees the page as clean, so it calls sdf().
		 * The page stays dirty, despite being written back, so it
		 * gets written back again in the next writeback cycle.
		 * This is harmless.
		 */
		if (!PageDirty(page))
			sdf(page);

		put_user_page(page);
	}
}

/**
 * put_user_pages_dirty() - release and dirty an array of gup-pinned pages
 * @pages:  array of pages to be marked dirty and released.
 * @npages: number of pages in the @pages array.
 *
 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if it was previously listed as clean. Then, release
 * the page using put_user_page().
 *
 * Please see the put_user_page() documentation for details.
 *
 * set_page_dirty(), which does not lock the page, is used here.
 * Therefore, it is the caller's responsibility to ensure that this is
 * safe. If not, then put_user_pages_dirty_lock() should be called instead.
 *
 */
void put_user_pages_dirty(struct page **pages, unsigned long npages)
{
	__put_user_pages_dirty(pages, npages, set_page_dirty);
}
EXPORT_SYMBOL(put_user_pages_dirty);

/**
 * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages
 * @pages:  array of pages to be marked dirty and released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if it was previously listed as clean. Then, release
 * the page using put_user_page().
 *
 * Please see the put_user_page() documentation for details.
 *
 * This is just like put_user_pages_dirty(), except that it invokes
 * set_page_dirty_lock(), instead of set_page_dirty().
 *
 */
void put_user_pages_dirty_lock(struct page **pages, unsigned long npages)
{
	__put_user_pages_dirty(pages, npages, set_page_dirty_lock);
}
EXPORT_SYMBOL(put_user_pages_dirty_lock);

/**
 * put_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, release the page using put_user_page().
 *
 * Please see the put_user_page() documentation for details.
 */
void put_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long index;

	/*
	 * TODO: this can be optimized for huge pages: if a series of pages is
	 * physically contiguous and part of the same compound page, then a
	 * single operation to the head page should suffice.
	 */
	for (index = 0; index < npages; index++)
		put_user_page(pages[index]);
}
EXPORT_SYMBOL(put_user_pages);
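
/*
 * Illustrative sketch (not part of gup.c): a typical caller pairs a
 * get_user_pages*() call with put_user_pages_dirty_lock() once the data
 * transfer into the pinned pages has completed. The function and variable
 * names below are hypothetical and only demonstrate the intended pairing.
 */
#if 0	/* example only, not compiled */
static int example_read_into_user_buffer(unsigned long uaddr, int nr_pages,
					 struct page **pages)
{
	int pinned;

	/* Pin the user pages for writing; they may then be filled by DMA. */
	pinned = get_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... hardware or kernel code fills the pinned pages here ... */

	/*
	 * The pages were written to and page-lock safety is not otherwise
	 * guaranteed, so use the _lock variant to dirty and release them.
	 */
	put_user_pages_dirty_lock(pages, pinned);
	return pinned;
}
#endif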

#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
		unsigned int flags)
{
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate unnecessary pages or
	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return NULL;
}

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	/* No page to get reference */
	if (flags & FOLL_GET)
		return -EFAULT;

	if (flags & FOLL_TOUCH) {
		pte_t entry = *pte;

		if (flags & FOLL_WRITE)
			entry = pte_mkdirty(entry);
		entry = pte_mkyoung(entry);

		if (!pte_same(*pte, entry)) {
			set_pte_at(vma->vm_mm, address, pte, entry);
			update_mmu_cache(vma, address, pte);
		}
	}

	/* Proper page table entry exists, but no corresponding struct page */
	return -EEXIST;
}

/*
 * FOLL_FORCE can write to even unwritable pte's, but only
 * after we've gone through a COW cycle and they are dirty.
 */
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
	return pte_write(pte) ||
		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
}

static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags,
		struct dev_pagemap **pgmap)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;

retry:
	if (unlikely(pmd_bad(*pmd)))
		return no_page_table(vma, flags);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!pte_present(pte)) {
		swp_entry_t entry;
		/*
		 * KSM's break_ksm() relies upon recognizing a ksm page
		 * even while it is being migrated, so for that case we
		 * need migration_entry_wait().
209 */ 210 if (likely(!(flags & FOLL_MIGRATION))) 211 goto no_page; 212 if (pte_none(pte)) 213 goto no_page; 214 entry = pte_to_swp_entry(pte); 215 if (!is_migration_entry(entry)) 216 goto no_page; 217 pte_unmap_unlock(ptep, ptl); 218 migration_entry_wait(mm, pmd, address); 219 goto retry; 220 } 221 if ((flags & FOLL_NUMA) && pte_protnone(pte)) 222 goto no_page; 223 if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { 224 pte_unmap_unlock(ptep, ptl); 225 return NULL; 226 } 227 228 page = vm_normal_page(vma, address, pte); 229 if (!page && pte_devmap(pte) && (flags & FOLL_GET)) { 230 /* 231 * Only return device mapping pages in the FOLL_GET case since 232 * they are only valid while holding the pgmap reference. 233 */ 234 *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap); 235 if (*pgmap) 236 page = pte_page(pte); 237 else 238 goto no_page; 239 } else if (unlikely(!page)) { 240 if (flags & FOLL_DUMP) { 241 /* Avoid special (like zero) pages in core dumps */ 242 page = ERR_PTR(-EFAULT); 243 goto out; 244 } 245 246 if (is_zero_pfn(pte_pfn(pte))) { 247 page = pte_page(pte); 248 } else { 249 int ret; 250 251 ret = follow_pfn_pte(vma, address, ptep, flags); 252 page = ERR_PTR(ret); 253 goto out; 254 } 255 } 256 257 if (flags & FOLL_SPLIT && PageTransCompound(page)) { 258 int ret; 259 get_page(page); 260 pte_unmap_unlock(ptep, ptl); 261 lock_page(page); 262 ret = split_huge_page(page); 263 unlock_page(page); 264 put_page(page); 265 if (ret) 266 return ERR_PTR(ret); 267 goto retry; 268 } 269 270 if (flags & FOLL_GET) { 271 if (unlikely(!try_get_page(page))) { 272 page = ERR_PTR(-ENOMEM); 273 goto out; 274 } 275 } 276 if (flags & FOLL_TOUCH) { 277 if ((flags & FOLL_WRITE) && 278 !pte_dirty(pte) && !PageDirty(page)) 279 set_page_dirty(page); 280 /* 281 * pte_mkyoung() would be more correct here, but atomic care 282 * is needed to avoid losing the dirty bit: it is easier to use 283 * mark_page_accessed(). 284 */ 285 mark_page_accessed(page); 286 } 287 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 288 /* Do not mlock pte-mapped THP */ 289 if (PageTransCompound(page)) 290 goto out; 291 292 /* 293 * The preliminary mapping check is mainly to avoid the 294 * pointless overhead of lock_page on the ZERO_PAGE 295 * which might bounce very badly if there is contention. 296 * 297 * If the page is already locked, we don't need to 298 * handle it now - vmscan will handle it later if and 299 * when it attempts to reclaim the page. 300 */ 301 if (page->mapping && trylock_page(page)) { 302 lru_add_drain(); /* push cached pages to LRU */ 303 /* 304 * Because we lock page here, and migration is 305 * blocked by the pte's page reference, and we 306 * know the page is still mapped, we don't even 307 * need to check for file-cache page truncation. 308 */ 309 mlock_vma_page(page); 310 unlock_page(page); 311 } 312 } 313 out: 314 pte_unmap_unlock(ptep, ptl); 315 return page; 316 no_page: 317 pte_unmap_unlock(ptep, ptl); 318 if (!pte_none(pte)) 319 return NULL; 320 return no_page_table(vma, flags); 321 } 322 323 static struct page *follow_pmd_mask(struct vm_area_struct *vma, 324 unsigned long address, pud_t *pudp, 325 unsigned int flags, 326 struct follow_page_context *ctx) 327 { 328 pmd_t *pmd, pmdval; 329 spinlock_t *ptl; 330 struct page *page; 331 struct mm_struct *mm = vma->vm_mm; 332 333 pmd = pmd_offset(pudp, address); 334 /* 335 * The READ_ONCE() will stabilize the pmdval in a register or 336 * on the stack so that it will stop changing under the code. 
337 */ 338 pmdval = READ_ONCE(*pmd); 339 if (pmd_none(pmdval)) 340 return no_page_table(vma, flags); 341 if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) { 342 page = follow_huge_pmd(mm, address, pmd, flags); 343 if (page) 344 return page; 345 return no_page_table(vma, flags); 346 } 347 if (is_hugepd(__hugepd(pmd_val(pmdval)))) { 348 page = follow_huge_pd(vma, address, 349 __hugepd(pmd_val(pmdval)), flags, 350 PMD_SHIFT); 351 if (page) 352 return page; 353 return no_page_table(vma, flags); 354 } 355 retry: 356 if (!pmd_present(pmdval)) { 357 if (likely(!(flags & FOLL_MIGRATION))) 358 return no_page_table(vma, flags); 359 VM_BUG_ON(thp_migration_supported() && 360 !is_pmd_migration_entry(pmdval)); 361 if (is_pmd_migration_entry(pmdval)) 362 pmd_migration_entry_wait(mm, pmd); 363 pmdval = READ_ONCE(*pmd); 364 /* 365 * MADV_DONTNEED may convert the pmd to null because 366 * mmap_sem is held in read mode 367 */ 368 if (pmd_none(pmdval)) 369 return no_page_table(vma, flags); 370 goto retry; 371 } 372 if (pmd_devmap(pmdval)) { 373 ptl = pmd_lock(mm, pmd); 374 page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); 375 spin_unlock(ptl); 376 if (page) 377 return page; 378 } 379 if (likely(!pmd_trans_huge(pmdval))) 380 return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); 381 382 if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) 383 return no_page_table(vma, flags); 384 385 retry_locked: 386 ptl = pmd_lock(mm, pmd); 387 if (unlikely(pmd_none(*pmd))) { 388 spin_unlock(ptl); 389 return no_page_table(vma, flags); 390 } 391 if (unlikely(!pmd_present(*pmd))) { 392 spin_unlock(ptl); 393 if (likely(!(flags & FOLL_MIGRATION))) 394 return no_page_table(vma, flags); 395 pmd_migration_entry_wait(mm, pmd); 396 goto retry_locked; 397 } 398 if (unlikely(!pmd_trans_huge(*pmd))) { 399 spin_unlock(ptl); 400 return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); 401 } 402 if (flags & FOLL_SPLIT) { 403 int ret; 404 page = pmd_page(*pmd); 405 if (is_huge_zero_page(page)) { 406 spin_unlock(ptl); 407 ret = 0; 408 split_huge_pmd(vma, pmd, address); 409 if (pmd_trans_unstable(pmd)) 410 ret = -EBUSY; 411 } else { 412 if (unlikely(!try_get_page(page))) { 413 spin_unlock(ptl); 414 return ERR_PTR(-ENOMEM); 415 } 416 spin_unlock(ptl); 417 lock_page(page); 418 ret = split_huge_page(page); 419 unlock_page(page); 420 put_page(page); 421 if (pmd_none(*pmd)) 422 return no_page_table(vma, flags); 423 } 424 425 return ret ? 
ERR_PTR(ret) : 426 follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); 427 } 428 page = follow_trans_huge_pmd(vma, address, pmd, flags); 429 spin_unlock(ptl); 430 ctx->page_mask = HPAGE_PMD_NR - 1; 431 return page; 432 } 433 434 static struct page *follow_pud_mask(struct vm_area_struct *vma, 435 unsigned long address, p4d_t *p4dp, 436 unsigned int flags, 437 struct follow_page_context *ctx) 438 { 439 pud_t *pud; 440 spinlock_t *ptl; 441 struct page *page; 442 struct mm_struct *mm = vma->vm_mm; 443 444 pud = pud_offset(p4dp, address); 445 if (pud_none(*pud)) 446 return no_page_table(vma, flags); 447 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { 448 page = follow_huge_pud(mm, address, pud, flags); 449 if (page) 450 return page; 451 return no_page_table(vma, flags); 452 } 453 if (is_hugepd(__hugepd(pud_val(*pud)))) { 454 page = follow_huge_pd(vma, address, 455 __hugepd(pud_val(*pud)), flags, 456 PUD_SHIFT); 457 if (page) 458 return page; 459 return no_page_table(vma, flags); 460 } 461 if (pud_devmap(*pud)) { 462 ptl = pud_lock(mm, pud); 463 page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); 464 spin_unlock(ptl); 465 if (page) 466 return page; 467 } 468 if (unlikely(pud_bad(*pud))) 469 return no_page_table(vma, flags); 470 471 return follow_pmd_mask(vma, address, pud, flags, ctx); 472 } 473 474 static struct page *follow_p4d_mask(struct vm_area_struct *vma, 475 unsigned long address, pgd_t *pgdp, 476 unsigned int flags, 477 struct follow_page_context *ctx) 478 { 479 p4d_t *p4d; 480 struct page *page; 481 482 p4d = p4d_offset(pgdp, address); 483 if (p4d_none(*p4d)) 484 return no_page_table(vma, flags); 485 BUILD_BUG_ON(p4d_huge(*p4d)); 486 if (unlikely(p4d_bad(*p4d))) 487 return no_page_table(vma, flags); 488 489 if (is_hugepd(__hugepd(p4d_val(*p4d)))) { 490 page = follow_huge_pd(vma, address, 491 __hugepd(p4d_val(*p4d)), flags, 492 P4D_SHIFT); 493 if (page) 494 return page; 495 return no_page_table(vma, flags); 496 } 497 return follow_pud_mask(vma, address, p4d, flags, ctx); 498 } 499 500 /** 501 * follow_page_mask - look up a page descriptor from a user-virtual address 502 * @vma: vm_area_struct mapping @address 503 * @address: virtual address to look up 504 * @flags: flags modifying lookup behaviour 505 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a 506 * pointer to output page_mask 507 * 508 * @flags can have FOLL_ flags set, defined in <linux/mm.h> 509 * 510 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches 511 * the device's dev_pagemap metadata to avoid repeating expensive lookups. 512 * 513 * On output, the @ctx->page_mask is set according to the size of the page. 514 * 515 * Return: the mapped (struct page *), %NULL if no mapping exists, or 516 * an error pointer if there is a mapping to something not represented 517 * by a page descriptor (see also vm_normal_page()). 
518 */ 519 static struct page *follow_page_mask(struct vm_area_struct *vma, 520 unsigned long address, unsigned int flags, 521 struct follow_page_context *ctx) 522 { 523 pgd_t *pgd; 524 struct page *page; 525 struct mm_struct *mm = vma->vm_mm; 526 527 ctx->page_mask = 0; 528 529 /* make this handle hugepd */ 530 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 531 if (!IS_ERR(page)) { 532 BUG_ON(flags & FOLL_GET); 533 return page; 534 } 535 536 pgd = pgd_offset(mm, address); 537 538 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 539 return no_page_table(vma, flags); 540 541 if (pgd_huge(*pgd)) { 542 page = follow_huge_pgd(mm, address, pgd, flags); 543 if (page) 544 return page; 545 return no_page_table(vma, flags); 546 } 547 if (is_hugepd(__hugepd(pgd_val(*pgd)))) { 548 page = follow_huge_pd(vma, address, 549 __hugepd(pgd_val(*pgd)), flags, 550 PGDIR_SHIFT); 551 if (page) 552 return page; 553 return no_page_table(vma, flags); 554 } 555 556 return follow_p4d_mask(vma, address, pgd, flags, ctx); 557 } 558 559 struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 560 unsigned int foll_flags) 561 { 562 struct follow_page_context ctx = { NULL }; 563 struct page *page; 564 565 page = follow_page_mask(vma, address, foll_flags, &ctx); 566 if (ctx.pgmap) 567 put_dev_pagemap(ctx.pgmap); 568 return page; 569 } 570 571 static int get_gate_page(struct mm_struct *mm, unsigned long address, 572 unsigned int gup_flags, struct vm_area_struct **vma, 573 struct page **page) 574 { 575 pgd_t *pgd; 576 p4d_t *p4d; 577 pud_t *pud; 578 pmd_t *pmd; 579 pte_t *pte; 580 int ret = -EFAULT; 581 582 /* user gate pages are read-only */ 583 if (gup_flags & FOLL_WRITE) 584 return -EFAULT; 585 if (address > TASK_SIZE) 586 pgd = pgd_offset_k(address); 587 else 588 pgd = pgd_offset_gate(mm, address); 589 if (pgd_none(*pgd)) 590 return -EFAULT; 591 p4d = p4d_offset(pgd, address); 592 if (p4d_none(*p4d)) 593 return -EFAULT; 594 pud = pud_offset(p4d, address); 595 if (pud_none(*pud)) 596 return -EFAULT; 597 pmd = pmd_offset(pud, address); 598 if (!pmd_present(*pmd)) 599 return -EFAULT; 600 VM_BUG_ON(pmd_trans_huge(*pmd)); 601 pte = pte_offset_map(pmd, address); 602 if (pte_none(*pte)) 603 goto unmap; 604 *vma = get_gate_vma(mm); 605 if (!page) 606 goto out; 607 *page = vm_normal_page(*vma, address, *pte); 608 if (!*page) { 609 if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) 610 goto unmap; 611 *page = pte_page(*pte); 612 } 613 if (unlikely(!try_get_page(*page))) { 614 ret = -ENOMEM; 615 goto unmap; 616 } 617 out: 618 ret = 0; 619 unmap: 620 pte_unmap(pte); 621 return ret; 622 } 623 624 /* 625 * mmap_sem must be held on entry. If @nonblocking != NULL and 626 * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released. 627 * If it is, *@nonblocking will be set to 0 and -EBUSY returned. 
628 */ 629 static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, 630 unsigned long address, unsigned int *flags, int *nonblocking) 631 { 632 unsigned int fault_flags = 0; 633 vm_fault_t ret; 634 635 /* mlock all present pages, but do not fault in new pages */ 636 if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) 637 return -ENOENT; 638 if (*flags & FOLL_WRITE) 639 fault_flags |= FAULT_FLAG_WRITE; 640 if (*flags & FOLL_REMOTE) 641 fault_flags |= FAULT_FLAG_REMOTE; 642 if (nonblocking) 643 fault_flags |= FAULT_FLAG_ALLOW_RETRY; 644 if (*flags & FOLL_NOWAIT) 645 fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; 646 if (*flags & FOLL_TRIED) { 647 VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY); 648 fault_flags |= FAULT_FLAG_TRIED; 649 } 650 651 ret = handle_mm_fault(vma, address, fault_flags); 652 if (ret & VM_FAULT_ERROR) { 653 int err = vm_fault_to_errno(ret, *flags); 654 655 if (err) 656 return err; 657 BUG(); 658 } 659 660 if (tsk) { 661 if (ret & VM_FAULT_MAJOR) 662 tsk->maj_flt++; 663 else 664 tsk->min_flt++; 665 } 666 667 if (ret & VM_FAULT_RETRY) { 668 if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) 669 *nonblocking = 0; 670 return -EBUSY; 671 } 672 673 /* 674 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when 675 * necessary, even if maybe_mkwrite decided not to set pte_write. We 676 * can thus safely do subsequent page lookups as if they were reads. 677 * But only do so when looping for pte_write is futile: in some cases 678 * userspace may also be wanting to write to the gotten user page, 679 * which a read fault here might prevent (a readonly page might get 680 * reCOWed by userspace write). 681 */ 682 if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) 683 *flags |= FOLL_COW; 684 return 0; 685 } 686 687 static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) 688 { 689 vm_flags_t vm_flags = vma->vm_flags; 690 int write = (gup_flags & FOLL_WRITE); 691 int foreign = (gup_flags & FOLL_REMOTE); 692 693 if (vm_flags & (VM_IO | VM_PFNMAP)) 694 return -EFAULT; 695 696 if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma)) 697 return -EFAULT; 698 699 if (write) { 700 if (!(vm_flags & VM_WRITE)) { 701 if (!(gup_flags & FOLL_FORCE)) 702 return -EFAULT; 703 /* 704 * We used to let the write,force case do COW in a 705 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could 706 * set a breakpoint in a read-only mapping of an 707 * executable, without corrupting the file (yet only 708 * when that file had been opened for writing!). 709 * Anon pages in shared mappings are surprising: now 710 * just reject it. 711 */ 712 if (!is_cow_mapping(vm_flags)) 713 return -EFAULT; 714 } 715 } else if (!(vm_flags & VM_READ)) { 716 if (!(gup_flags & FOLL_FORCE)) 717 return -EFAULT; 718 /* 719 * Is there actually any vma we can reach here which does not 720 * have VM_MAYREAD set? 721 */ 722 if (!(vm_flags & VM_MAYREAD)) 723 return -EFAULT; 724 } 725 /* 726 * gups are always data accesses, not instruction 727 * fetches, so execute=false here 728 */ 729 if (!arch_vma_access_permitted(vma, write, false, foreign)) 730 return -EFAULT; 731 return 0; 732 } 733 734 /** 735 * __get_user_pages() - pin user pages in memory 736 * @tsk: task_struct of target task 737 * @mm: mm_struct of target mm 738 * @start: starting user address 739 * @nr_pages: number of pages from start to pin 740 * @gup_flags: flags modifying pin behaviour 741 * @pages: array that receives pointers to the pages pinned. 
742 * Should be at least nr_pages long. Or NULL, if caller 743 * only intends to ensure the pages are faulted in. 744 * @vmas: array of pointers to vmas corresponding to each page. 745 * Or NULL if the caller does not require them. 746 * @nonblocking: whether waiting for disk IO or mmap_sem contention 747 * 748 * Returns number of pages pinned. This may be fewer than the number 749 * requested. If nr_pages is 0 or negative, returns 0. If no pages 750 * were pinned, returns -errno. Each page returned must be released 751 * with a put_page() call when it is finished with. vmas will only 752 * remain valid while mmap_sem is held. 753 * 754 * Must be called with mmap_sem held. It may be released. See below. 755 * 756 * __get_user_pages walks a process's page tables and takes a reference to 757 * each struct page that each user address corresponds to at a given 758 * instant. That is, it takes the page that would be accessed if a user 759 * thread accesses the given user virtual address at that instant. 760 * 761 * This does not guarantee that the page exists in the user mappings when 762 * __get_user_pages returns, and there may even be a completely different 763 * page there in some cases (eg. if mmapped pagecache has been invalidated 764 * and subsequently re faulted). However it does guarantee that the page 765 * won't be freed completely. And mostly callers simply care that the page 766 * contains data that was valid *at some point in time*. Typically, an IO 767 * or similar operation cannot guarantee anything stronger anyway because 768 * locks can't be held over the syscall boundary. 769 * 770 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If 771 * the page is written to, set_page_dirty (or set_page_dirty_lock, as 772 * appropriate) must be called after the page is finished with, and 773 * before put_page is called. 774 * 775 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO 776 * or mmap_sem contention, and if waiting is needed to pin all pages, 777 * *@nonblocking will be set to 0. Further, if @gup_flags does not 778 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in 779 * this case. 780 * 781 * A caller using such a combination of @nonblocking and @gup_flags 782 * must therefore hold the mmap_sem for reading only, and recognize 783 * when it's been released. Otherwise, it must be held for either 784 * reading or writing and will not be released. 785 * 786 * In most cases, get_user_pages or get_user_pages_fast should be used 787 * instead of __get_user_pages. __get_user_pages should be used only if 788 * you need some special @gup_flags. 
789 */ 790 static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 791 unsigned long start, unsigned long nr_pages, 792 unsigned int gup_flags, struct page **pages, 793 struct vm_area_struct **vmas, int *nonblocking) 794 { 795 long ret = 0, i = 0; 796 struct vm_area_struct *vma = NULL; 797 struct follow_page_context ctx = { NULL }; 798 799 if (!nr_pages) 800 return 0; 801 802 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); 803 804 /* 805 * If FOLL_FORCE is set then do not force a full fault as the hinting 806 * fault information is unrelated to the reference behaviour of a task 807 * using the address space 808 */ 809 if (!(gup_flags & FOLL_FORCE)) 810 gup_flags |= FOLL_NUMA; 811 812 do { 813 struct page *page; 814 unsigned int foll_flags = gup_flags; 815 unsigned int page_increm; 816 817 /* first iteration or cross vma bound */ 818 if (!vma || start >= vma->vm_end) { 819 vma = find_extend_vma(mm, start); 820 if (!vma && in_gate_area(mm, start)) { 821 ret = get_gate_page(mm, start & PAGE_MASK, 822 gup_flags, &vma, 823 pages ? &pages[i] : NULL); 824 if (ret) 825 goto out; 826 ctx.page_mask = 0; 827 goto next_page; 828 } 829 830 if (!vma || check_vma_flags(vma, gup_flags)) { 831 ret = -EFAULT; 832 goto out; 833 } 834 if (is_vm_hugetlb_page(vma)) { 835 i = follow_hugetlb_page(mm, vma, pages, vmas, 836 &start, &nr_pages, i, 837 gup_flags, nonblocking); 838 continue; 839 } 840 } 841 retry: 842 /* 843 * If we have a pending SIGKILL, don't keep faulting pages and 844 * potentially allocating memory. 845 */ 846 if (fatal_signal_pending(current)) { 847 ret = -ERESTARTSYS; 848 goto out; 849 } 850 cond_resched(); 851 852 page = follow_page_mask(vma, start, foll_flags, &ctx); 853 if (!page) { 854 ret = faultin_page(tsk, vma, start, &foll_flags, 855 nonblocking); 856 switch (ret) { 857 case 0: 858 goto retry; 859 case -EBUSY: 860 ret = 0; 861 /* FALLTHRU */ 862 case -EFAULT: 863 case -ENOMEM: 864 case -EHWPOISON: 865 goto out; 866 case -ENOENT: 867 goto next_page; 868 } 869 BUG(); 870 } else if (PTR_ERR(page) == -EEXIST) { 871 /* 872 * Proper page table entry exists, but no corresponding 873 * struct page. 874 */ 875 goto next_page; 876 } else if (IS_ERR(page)) { 877 ret = PTR_ERR(page); 878 goto out; 879 } 880 if (pages) { 881 pages[i] = page; 882 flush_anon_page(vma, page, start); 883 flush_dcache_page(page); 884 ctx.page_mask = 0; 885 } 886 next_page: 887 if (vmas) { 888 vmas[i] = vma; 889 ctx.page_mask = 0; 890 } 891 page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); 892 if (page_increm > nr_pages) 893 page_increm = nr_pages; 894 i += page_increm; 895 start += page_increm * PAGE_SIZE; 896 nr_pages -= page_increm; 897 } while (nr_pages); 898 out: 899 if (ctx.pgmap) 900 put_dev_pagemap(ctx.pgmap); 901 return i ? i : ret; 902 } 903 904 static bool vma_permits_fault(struct vm_area_struct *vma, 905 unsigned int fault_flags) 906 { 907 bool write = !!(fault_flags & FAULT_FLAG_WRITE); 908 bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); 909 vm_flags_t vm_flags = write ? VM_WRITE : VM_READ; 910 911 if (!(vm_flags & vma->vm_flags)) 912 return false; 913 914 /* 915 * The architecture might have a hardware protection 916 * mechanism other than read/write that can deny access. 
917 * 918 * gup always represents data access, not instruction 919 * fetches, so execute=false here: 920 */ 921 if (!arch_vma_access_permitted(vma, write, false, foreign)) 922 return false; 923 924 return true; 925 } 926 927 /* 928 * fixup_user_fault() - manually resolve a user page fault 929 * @tsk: the task_struct to use for page fault accounting, or 930 * NULL if faults are not to be recorded. 931 * @mm: mm_struct of target mm 932 * @address: user address 933 * @fault_flags:flags to pass down to handle_mm_fault() 934 * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller 935 * does not allow retry 936 * 937 * This is meant to be called in the specific scenario where for locking reasons 938 * we try to access user memory in atomic context (within a pagefault_disable() 939 * section), this returns -EFAULT, and we want to resolve the user fault before 940 * trying again. 941 * 942 * Typically this is meant to be used by the futex code. 943 * 944 * The main difference with get_user_pages() is that this function will 945 * unconditionally call handle_mm_fault() which will in turn perform all the 946 * necessary SW fixup of the dirty and young bits in the PTE, while 947 * get_user_pages() only guarantees to update these in the struct page. 948 * 949 * This is important for some architectures where those bits also gate the 950 * access permission to the page because they are maintained in software. On 951 * such architectures, gup() will not be enough to make a subsequent access 952 * succeed. 953 * 954 * This function will not return with an unlocked mmap_sem. So it has not the 955 * same semantics wrt the @mm->mmap_sem as does filemap_fault(). 956 */ 957 int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, 958 unsigned long address, unsigned int fault_flags, 959 bool *unlocked) 960 { 961 struct vm_area_struct *vma; 962 vm_fault_t ret, major = 0; 963 964 if (unlocked) 965 fault_flags |= FAULT_FLAG_ALLOW_RETRY; 966 967 retry: 968 vma = find_extend_vma(mm, address); 969 if (!vma || address < vma->vm_start) 970 return -EFAULT; 971 972 if (!vma_permits_fault(vma, fault_flags)) 973 return -EFAULT; 974 975 ret = handle_mm_fault(vma, address, fault_flags); 976 major |= ret & VM_FAULT_MAJOR; 977 if (ret & VM_FAULT_ERROR) { 978 int err = vm_fault_to_errno(ret, 0); 979 980 if (err) 981 return err; 982 BUG(); 983 } 984 985 if (ret & VM_FAULT_RETRY) { 986 down_read(&mm->mmap_sem); 987 if (!(fault_flags & FAULT_FLAG_TRIED)) { 988 *unlocked = true; 989 fault_flags &= ~FAULT_FLAG_ALLOW_RETRY; 990 fault_flags |= FAULT_FLAG_TRIED; 991 goto retry; 992 } 993 } 994 995 if (tsk) { 996 if (major) 997 tsk->maj_flt++; 998 else 999 tsk->min_flt++; 1000 } 1001 return 0; 1002 } 1003 EXPORT_SYMBOL_GPL(fixup_user_fault); 1004 1005 static __always_inline long __get_user_pages_locked(struct task_struct *tsk, 1006 struct mm_struct *mm, 1007 unsigned long start, 1008 unsigned long nr_pages, 1009 struct page **pages, 1010 struct vm_area_struct **vmas, 1011 int *locked, 1012 unsigned int flags) 1013 { 1014 long ret, pages_done; 1015 bool lock_dropped; 1016 1017 if (locked) { 1018 /* if VM_FAULT_RETRY can be returned, vmas become invalid */ 1019 BUG_ON(vmas); 1020 /* check caller initialized locked */ 1021 BUG_ON(*locked != 1); 1022 } 1023 1024 if (pages) 1025 flags |= FOLL_GET; 1026 1027 pages_done = 0; 1028 lock_dropped = false; 1029 for (;;) { 1030 ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages, 1031 vmas, locked); 1032 if (!locked) 1033 /* VM_FAULT_RETRY couldn't trigger, 
			bypass */
			return ret;

		/* VM_FAULT_RETRY cannot return errors */
		if (!*locked) {
			BUG_ON(ret < 0);
			BUG_ON(ret >= nr_pages);
		}

		if (ret > 0) {
			nr_pages -= ret;
			pages_done += ret;
			if (!nr_pages)
				break;
		}
		if (*locked) {
			/*
			 * VM_FAULT_RETRY didn't trigger or it was a
			 * FOLL_NOWAIT.
			 */
			if (!pages_done)
				pages_done = ret;
			break;
		}
		/*
		 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
		 * For the prefault case (!pages) we only update counts.
		 */
		if (likely(pages))
			pages += ret;
		start += ret << PAGE_SHIFT;

		/*
		 * Repeat on the address that fired VM_FAULT_RETRY
		 * without FAULT_FLAG_ALLOW_RETRY but with
		 * FAULT_FLAG_TRIED.
		 */
		*locked = 1;
		lock_dropped = true;
		down_read(&mm->mmap_sem);
		ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
				       pages, NULL, NULL);
		if (ret != 1) {
			BUG_ON(ret > 1);
			if (!pages_done)
				pages_done = ret;
			break;
		}
		nr_pages--;
		pages_done++;
		if (!nr_pages)
			break;
		if (likely(pages))
			pages++;
		start += PAGE_SIZE;
	}
	if (lock_dropped && *locked) {
		/*
		 * We must let the caller know we temporarily dropped the lock
		 * and so the critical section protected by it was lost.
		 */
		up_read(&mm->mmap_sem);
		*locked = 0;
	}
	return pages_done;
}

/*
 * get_user_pages_remote() - pin user pages in memory
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @locked:	pointer to lock flag indicating whether lock is held and
 *		subsequently whether VM_FAULT_RETRY functionality can be
 *		utilised. Lock must initially be held.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held for read or write.
 *
 * get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
 * be called after the page is finished with, and before put_page is called.
 *
 * get_user_pages is typically used for fewer-copy IO operations, to get a
 * handle on the memory by some means other than accesses via the user virtual
 * addresses. The pages may be submitted for DMA to devices or accessed via
 * their kernel linear mapping (via the kmap APIs). Care should be taken to
 * use the correct cache flushing APIs.
 *
 * See also get_user_pages_fast, for performance critical applications.
 *
 * get_user_pages should be phased out in favor of
 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
 * should use get_user_pages because it cannot pass
 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
 */
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *locked)
{
	/*
	 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
	 * vmas. As there are no users of this flag in this call we simply
	 * disallow this option for now.
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
		return -EINVAL;

	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
				       locked,
				       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
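
/*
 * Illustrative sketch (not part of gup.c): accessing one page of another
 * process's address space with get_user_pages_remote(). The helper name is
 * hypothetical and error handling is reduced to the essentials.
 */
#if 0	/* example only, not compiled */
static int example_peek_remote_page(struct task_struct *tsk,
				    struct mm_struct *mm,
				    unsigned long addr, void *buf)
{
	struct page *page;
	int locked = 1;
	long pinned;
	void *kaddr;

	down_read(&mm->mmap_sem);
	pinned = get_user_pages_remote(tsk, mm, addr & PAGE_MASK, 1,
				       FOLL_FORCE, &page, NULL, &locked);
	/* If VM_FAULT_RETRY fired, the lock was already dropped for us. */
	if (locked)
		up_read(&mm->mmap_sem);
	if (pinned != 1)
		return pinned < 0 ? pinned : -EFAULT;

	/* Copy a snapshot out via the kernel mapping of the page. */
	kaddr = kmap(page);
	memcpy(buf, kaddr, PAGE_SIZE);
	kunmap(page);

	put_page(page);
	return 0;
}
#endif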

/**
 * populate_vma_page_range() - populate a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @nonblocking: if non-NULL, the mmap_sem may be released; *@nonblocking is
 *               then cleared to 0 (see below).
 *
 * This takes care of mlocking the pages too if VM_LOCKED is set.
 *
 * Returns the number of pages faulted in, or a negative error code on error.
 *
 * vma->vm_mm->mmap_sem must be held.
 *
 * If @nonblocking is NULL, it may be held for read or write and will
 * be unperturbed.
 *
 * If @nonblocking is non-NULL, it must be held for read only and may be
 * released.  If it's released, *@nonblocking will be set to 0.
 */
long populate_vma_page_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end, int *nonblocking)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long nr_pages = (end - start) / PAGE_SIZE;
	int gup_flags;

	VM_BUG_ON(start & ~PAGE_MASK);
	VM_BUG_ON(end & ~PAGE_MASK);
	VM_BUG_ON_VMA(start < vma->vm_start, vma);
	VM_BUG_ON_VMA(end > vma->vm_end, vma);
	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);

	gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
	if (vma->vm_flags & VM_LOCKONFAULT)
		gup_flags &= ~FOLL_POPULATE;
	/*
	 * We want to touch writable mappings with a write fault in order
	 * to break COW, except for shared mappings because these don't COW
	 * and we would not want to dirty them for nothing.
	 */
	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
		gup_flags |= FOLL_WRITE;

	/*
	 * We want mlock to succeed for regions that have any permissions
	 * other than PROT_NONE.
1222 */ 1223 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) 1224 gup_flags |= FOLL_FORCE; 1225 1226 /* 1227 * We made sure addr is within a VMA, so the following will 1228 * not result in a stack expansion that recurses back here. 1229 */ 1230 return __get_user_pages(current, mm, start, nr_pages, gup_flags, 1231 NULL, NULL, nonblocking); 1232 } 1233 1234 /* 1235 * __mm_populate - populate and/or mlock pages within a range of address space. 1236 * 1237 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap 1238 * flags. VMAs must be already marked with the desired vm_flags, and 1239 * mmap_sem must not be held. 1240 */ 1241 int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) 1242 { 1243 struct mm_struct *mm = current->mm; 1244 unsigned long end, nstart, nend; 1245 struct vm_area_struct *vma = NULL; 1246 int locked = 0; 1247 long ret = 0; 1248 1249 end = start + len; 1250 1251 for (nstart = start; nstart < end; nstart = nend) { 1252 /* 1253 * We want to fault in pages for [nstart; end) address range. 1254 * Find first corresponding VMA. 1255 */ 1256 if (!locked) { 1257 locked = 1; 1258 down_read(&mm->mmap_sem); 1259 vma = find_vma(mm, nstart); 1260 } else if (nstart >= vma->vm_end) 1261 vma = vma->vm_next; 1262 if (!vma || vma->vm_start >= end) 1263 break; 1264 /* 1265 * Set [nstart; nend) to intersection of desired address 1266 * range with the first VMA. Also, skip undesirable VMA types. 1267 */ 1268 nend = min(end, vma->vm_end); 1269 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 1270 continue; 1271 if (nstart < vma->vm_start) 1272 nstart = vma->vm_start; 1273 /* 1274 * Now fault in a range of pages. populate_vma_page_range() 1275 * double checks the vma flags, so that it won't mlock pages 1276 * if the vma was already munlocked. 1277 */ 1278 ret = populate_vma_page_range(vma, nstart, nend, &locked); 1279 if (ret < 0) { 1280 if (ignore_errors) { 1281 ret = 0; 1282 continue; /* continue at next VMA */ 1283 } 1284 break; 1285 } 1286 nend = nstart + ret * PAGE_SIZE; 1287 ret = 0; 1288 } 1289 if (locked) 1290 up_read(&mm->mmap_sem); 1291 return ret; /* 0 or negative error code */ 1292 } 1293 1294 /** 1295 * get_dump_page() - pin user page in memory while writing it to core dump 1296 * @addr: user address 1297 * 1298 * Returns struct page pointer of user page pinned for dump, 1299 * to be freed afterwards by put_page(). 1300 * 1301 * Returns NULL on any kind of failure - a hole must then be inserted into 1302 * the corefile, to preserve alignment with its headers; and also returns 1303 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - 1304 * allowing a hole to be left in the corefile to save diskspace. 1305 * 1306 * Called without mmap_sem, but after all other threads have been killed. 
1307 */ 1308 #ifdef CONFIG_ELF_CORE 1309 struct page *get_dump_page(unsigned long addr) 1310 { 1311 struct vm_area_struct *vma; 1312 struct page *page; 1313 1314 if (__get_user_pages(current, current->mm, addr, 1, 1315 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, 1316 NULL) < 1) 1317 return NULL; 1318 flush_cache_page(vma, addr, page_to_pfn(page)); 1319 return page; 1320 } 1321 #endif /* CONFIG_ELF_CORE */ 1322 #else /* CONFIG_MMU */ 1323 static long __get_user_pages_locked(struct task_struct *tsk, 1324 struct mm_struct *mm, unsigned long start, 1325 unsigned long nr_pages, struct page **pages, 1326 struct vm_area_struct **vmas, int *locked, 1327 unsigned int foll_flags) 1328 { 1329 struct vm_area_struct *vma; 1330 unsigned long vm_flags; 1331 int i; 1332 1333 /* calculate required read or write permissions. 1334 * If FOLL_FORCE is set, we only require the "MAY" flags. 1335 */ 1336 vm_flags = (foll_flags & FOLL_WRITE) ? 1337 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1338 vm_flags &= (foll_flags & FOLL_FORCE) ? 1339 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1340 1341 for (i = 0; i < nr_pages; i++) { 1342 vma = find_vma(mm, start); 1343 if (!vma) 1344 goto finish_or_fault; 1345 1346 /* protect what we can, including chardevs */ 1347 if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || 1348 !(vm_flags & vma->vm_flags)) 1349 goto finish_or_fault; 1350 1351 if (pages) { 1352 pages[i] = virt_to_page(start); 1353 if (pages[i]) 1354 get_page(pages[i]); 1355 } 1356 if (vmas) 1357 vmas[i] = vma; 1358 start = (start + PAGE_SIZE) & PAGE_MASK; 1359 } 1360 1361 return i; 1362 1363 finish_or_fault: 1364 return i ? : -EFAULT; 1365 } 1366 #endif /* !CONFIG_MMU */ 1367 1368 #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) 1369 static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) 1370 { 1371 long i; 1372 struct vm_area_struct *vma_prev = NULL; 1373 1374 for (i = 0; i < nr_pages; i++) { 1375 struct vm_area_struct *vma = vmas[i]; 1376 1377 if (vma == vma_prev) 1378 continue; 1379 1380 vma_prev = vma; 1381 1382 if (vma_is_fsdax(vma)) 1383 return true; 1384 } 1385 return false; 1386 } 1387 1388 #ifdef CONFIG_CMA 1389 static struct page *new_non_cma_page(struct page *page, unsigned long private) 1390 { 1391 /* 1392 * We want to make sure we allocate the new page from the same node 1393 * as the source page. 1394 */ 1395 int nid = page_to_nid(page); 1396 /* 1397 * Trying to allocate a page for migration. Ignore allocation 1398 * failure warnings. We don't force __GFP_THISNODE here because 1399 * this node here is the node where we have CMA reservation and 1400 * in some case these nodes will have really less non movable 1401 * allocation memory. 1402 */ 1403 gfp_t gfp_mask = GFP_USER | __GFP_NOWARN; 1404 1405 if (PageHighMem(page)) 1406 gfp_mask |= __GFP_HIGHMEM; 1407 1408 #ifdef CONFIG_HUGETLB_PAGE 1409 if (PageHuge(page)) { 1410 struct hstate *h = page_hstate(page); 1411 /* 1412 * We don't want to dequeue from the pool because pool pages will 1413 * mostly be from the CMA region. 1414 */ 1415 return alloc_migrate_huge_page(h, gfp_mask, nid, NULL); 1416 } 1417 #endif 1418 if (PageTransHuge(page)) { 1419 struct page *thp; 1420 /* 1421 * ignore allocation failure warnings 1422 */ 1423 gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN; 1424 1425 /* 1426 * Remove the movable mask so that we don't allocate from 1427 * CMA area again. 
1428 */ 1429 thp_gfpmask &= ~__GFP_MOVABLE; 1430 thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER); 1431 if (!thp) 1432 return NULL; 1433 prep_transhuge_page(thp); 1434 return thp; 1435 } 1436 1437 return __alloc_pages_node(nid, gfp_mask, 0); 1438 } 1439 1440 static long check_and_migrate_cma_pages(struct task_struct *tsk, 1441 struct mm_struct *mm, 1442 unsigned long start, 1443 unsigned long nr_pages, 1444 struct page **pages, 1445 struct vm_area_struct **vmas, 1446 unsigned int gup_flags) 1447 { 1448 unsigned long i; 1449 unsigned long step; 1450 bool drain_allow = true; 1451 bool migrate_allow = true; 1452 LIST_HEAD(cma_page_list); 1453 1454 check_again: 1455 for (i = 0; i < nr_pages;) { 1456 1457 struct page *head = compound_head(pages[i]); 1458 1459 /* 1460 * gup may start from a tail page. Advance step by the left 1461 * part. 1462 */ 1463 step = (1 << compound_order(head)) - (pages[i] - head); 1464 /* 1465 * If we get a page from the CMA zone, since we are going to 1466 * be pinning these entries, we might as well move them out 1467 * of the CMA zone if possible. 1468 */ 1469 if (is_migrate_cma_page(head)) { 1470 if (PageHuge(head)) 1471 isolate_huge_page(head, &cma_page_list); 1472 else { 1473 if (!PageLRU(head) && drain_allow) { 1474 lru_add_drain_all(); 1475 drain_allow = false; 1476 } 1477 1478 if (!isolate_lru_page(head)) { 1479 list_add_tail(&head->lru, &cma_page_list); 1480 mod_node_page_state(page_pgdat(head), 1481 NR_ISOLATED_ANON + 1482 page_is_file_cache(head), 1483 hpage_nr_pages(head)); 1484 } 1485 } 1486 } 1487 1488 i += step; 1489 } 1490 1491 if (!list_empty(&cma_page_list)) { 1492 /* 1493 * drop the above get_user_pages reference. 1494 */ 1495 for (i = 0; i < nr_pages; i++) 1496 put_page(pages[i]); 1497 1498 if (migrate_pages(&cma_page_list, new_non_cma_page, 1499 NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) { 1500 /* 1501 * some of the pages failed migration. Do get_user_pages 1502 * without migration. 1503 */ 1504 migrate_allow = false; 1505 1506 if (!list_empty(&cma_page_list)) 1507 putback_movable_pages(&cma_page_list); 1508 } 1509 /* 1510 * We did migrate all the pages, Try to get the page references 1511 * again migrating any new CMA pages which we failed to isolate 1512 * earlier. 1513 */ 1514 nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, 1515 pages, vmas, NULL, 1516 gup_flags); 1517 1518 if ((nr_pages > 0) && migrate_allow) { 1519 drain_allow = true; 1520 goto check_again; 1521 } 1522 } 1523 1524 return nr_pages; 1525 } 1526 #else 1527 static long check_and_migrate_cma_pages(struct task_struct *tsk, 1528 struct mm_struct *mm, 1529 unsigned long start, 1530 unsigned long nr_pages, 1531 struct page **pages, 1532 struct vm_area_struct **vmas, 1533 unsigned int gup_flags) 1534 { 1535 return nr_pages; 1536 } 1537 #endif /* CONFIG_CMA */ 1538 1539 /* 1540 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which 1541 * allows us to process the FOLL_LONGTERM flag. 
 */
static long __gup_longterm_locked(struct task_struct *tsk,
				  struct mm_struct *mm,
				  unsigned long start,
				  unsigned long nr_pages,
				  struct page **pages,
				  struct vm_area_struct **vmas,
				  unsigned int gup_flags)
{
	struct vm_area_struct **vmas_tmp = vmas;
	unsigned long flags = 0;
	long rc, i;

	if (gup_flags & FOLL_LONGTERM) {
		if (!pages)
			return -EINVAL;

		if (!vmas_tmp) {
			vmas_tmp = kcalloc(nr_pages,
					   sizeof(struct vm_area_struct *),
					   GFP_KERNEL);
			if (!vmas_tmp)
				return -ENOMEM;
		}
		flags = memalloc_nocma_save();
	}

	rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
				     vmas_tmp, NULL, gup_flags);

	if (gup_flags & FOLL_LONGTERM) {
		memalloc_nocma_restore(flags);
		if (rc < 0)
			goto out;

		if (check_dax_vmas(vmas_tmp, rc)) {
			for (i = 0; i < rc; i++)
				put_page(pages[i]);
			rc = -EOPNOTSUPP;
			goto out;
		}

		rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
						 vmas_tmp, gup_flags);
	}

out:
	if (vmas_tmp != vmas)
		kfree(vmas_tmp);
	return rc;
}
#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long nr_pages,
						  struct page **pages,
						  struct vm_area_struct **vmas,
						  unsigned int flags)
{
	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
				       NULL, flags);
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */

/*
 * This is the same as get_user_pages_remote(), just with a
 * less-flexible calling convention where we assume that the task
 * and mm being operated on are the current task's and don't allow
 * passing of a locked parameter.  We also obviously don't pass
 * FOLL_REMOTE in here.
 */
long get_user_pages(unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas)
{
	return __gup_longterm_locked(current, current->mm, start, nr_pages,
				     pages, vmas, gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
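
/*
 * Illustrative sketch (not part of gup.c): long-term pinning of a user
 * buffer, e.g. for a driver that registers memory for DMA. Names are
 * hypothetical; the caller is assumed to hold no mm locks on entry.
 */
#if 0	/* example only, not compiled */
static long example_pin_user_buffer(unsigned long uaddr, unsigned long npages,
				    struct page **pages)
{
	long pinned;

	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages(uaddr & PAGE_MASK, npages,
				FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (pinned < 0)
		return pinned;	/* -errno */
	if ((unsigned long)pinned != npages) {
		/* Partial pin: release what we got and report failure. */
		put_user_pages(pages, pinned);
		return -EFAULT;
	}
	return pinned;
}
#endif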

/*
 * We can leverage the VM_FAULT_RETRY functionality in the page fault
 * paths better by using either get_user_pages_locked() or
 * get_user_pages_unlocked().
 *
 * get_user_pages_locked() is suitable to replace the form:
 *
 *      down_read(&mm->mmap_sem);
 *      do_something()
 *      get_user_pages(tsk, mm, ..., pages, NULL);
 *      up_read(&mm->mmap_sem);
 *
 *  to:
 *
 *      int locked = 1;
 *      down_read(&mm->mmap_sem);
 *      do_something()
 *      get_user_pages_locked(tsk, mm, ..., pages, &locked);
 *      if (locked)
 *          up_read(&mm->mmap_sem);
 */
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	/*
	 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
	 * vmas. As there are no users of this flag in this call we simply
	 * disallow this option for now.
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
		return -EINVAL;

	return __get_user_pages_locked(current, current->mm, start, nr_pages,
				       pages, NULL, locked,
				       gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_locked);

/*
 * get_user_pages_unlocked() is suitable to replace the form:
 *
 *      down_read(&mm->mmap_sem);
 *      get_user_pages(tsk, mm, ..., pages, NULL);
 *      up_read(&mm->mmap_sem);
 *
 *  with:
 *
 *      get_user_pages_unlocked(tsk, mm, ..., pages);
 *
 * It is functionally equivalent to get_user_pages_fast so
 * get_user_pages_fast should be used instead if specific gup_flags
 * (e.g. FOLL_FORCE) are not required.
 */
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
			     struct page **pages, unsigned int gup_flags)
{
	struct mm_struct *mm = current->mm;
	int locked = 1;
	long ret;

	/*
	 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
	 * vmas. As there are no users of this flag in this call we simply
	 * disallow this option for now.
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
		return -EINVAL;

	down_read(&mm->mmap_sem);
	ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
				      &locked, gup_flags | FOLL_TOUCH);
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;
}
EXPORT_SYMBOL(get_user_pages_unlocked);

/*
 * Fast GUP
 *
 * get_user_pages_fast attempts to pin user pages by walking the page
 * tables directly and avoids taking locks. Thus the walker needs to be
 * protected from page table pages being freed from under it, and should
 * block any THP splits.
 *
 * One way to achieve this is to have the walker disable interrupts, and
 * rely on IPIs from the TLB flushing code blocking before the page table
 * pages are freed. This is unsuitable for architectures that do not need
 * to broadcast an IPI when invalidating TLBs.
 *
 * Another way to achieve this is to batch up page table containing pages
 * belonging to more than one mm_user, then rcu_sched a callback to free those
 * pages. Disabling interrupts will allow the fast_gup walker to both block
 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
 * (which is a relatively rare event). The code below adopts this strategy.
 *
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
 *  free pages containing page tables or TLB flushing requires IPI broadcast.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) access_ok is sufficient to validate userspace address ranges.
 *
 * The last two assumptions can be relaxed by the addition of helper functions.
 *
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 */
#ifdef CONFIG_HAVE_FAST_GUP
#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
/*
 * WARNING: only to be used in the get_user_pages_fast() implementation.
 *
 * With get_user_pages_fast(), we walk down the pagetables without taking any
 * locks.  For this we would like to load the pointers atomically, but sometimes
 * that is not possible (e.g.
without expensive cmpxchg8b on x86_32 PAE). What 1744 * we do have is the guarantee that a PTE will only either go from not present 1745 * to present, or present to not present or both -- it will not switch to a 1746 * completely different present page without a TLB flush in between; something 1747 * that we are blocking by holding interrupts off. 1748 * 1749 * Setting ptes from not present to present goes: 1750 * 1751 * ptep->pte_high = h; 1752 * smp_wmb(); 1753 * ptep->pte_low = l; 1754 * 1755 * And present to not present goes: 1756 * 1757 * ptep->pte_low = 0; 1758 * smp_wmb(); 1759 * ptep->pte_high = 0; 1760 * 1761 * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. 1762 * We load pte_high *after* loading pte_low, which ensures we don't see an older 1763 * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't 1764 * picked up a changed pte high. We might have gotten rubbish values from 1765 * pte_low and pte_high, but we are guaranteed that pte_low will not have the 1766 * present bit set *unless* it is 'l'. Because get_user_pages_fast() only 1767 * operates on present ptes we're safe. 1768 */ 1769 static inline pte_t gup_get_pte(pte_t *ptep) 1770 { 1771 pte_t pte; 1772 1773 do { 1774 pte.pte_low = ptep->pte_low; 1775 smp_rmb(); 1776 pte.pte_high = ptep->pte_high; 1777 smp_rmb(); 1778 } while (unlikely(pte.pte_low != ptep->pte_low)); 1779 1780 return pte; 1781 } 1782 #else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ 1783 /* 1784 * We require that the PTE can be read atomically. 1785 */ 1786 static inline pte_t gup_get_pte(pte_t *ptep) 1787 { 1788 return READ_ONCE(*ptep); 1789 } 1790 #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ 1791 1792 static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, 1793 struct page **pages) 1794 { 1795 while ((*nr) - nr_start) { 1796 struct page *page = pages[--(*nr)]; 1797 1798 ClearPageReferenced(page); 1799 put_page(page); 1800 } 1801 } 1802 1803 /* 1804 * Return the compund head page with ref appropriately incremented, 1805 * or NULL if that failed. 1806 */ 1807 static inline struct page *try_get_compound_head(struct page *page, int refs) 1808 { 1809 struct page *head = compound_head(page); 1810 if (WARN_ON_ONCE(page_ref_count(head) < 0)) 1811 return NULL; 1812 if (unlikely(!page_cache_add_speculative(head, refs))) 1813 return NULL; 1814 return head; 1815 } 1816 1817 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL 1818 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, 1819 unsigned int flags, struct page **pages, int *nr) 1820 { 1821 struct dev_pagemap *pgmap = NULL; 1822 int nr_start = *nr, ret = 0; 1823 pte_t *ptep, *ptem; 1824 1825 ptem = ptep = pte_offset_map(&pmd, addr); 1826 do { 1827 pte_t pte = gup_get_pte(ptep); 1828 struct page *head, *page; 1829 1830 /* 1831 * Similar to the PMD case below, NUMA hinting must take slow 1832 * path using the pte_protnone check. 
1833 */ 1834 if (pte_protnone(pte)) 1835 goto pte_unmap; 1836 1837 if (!pte_access_permitted(pte, flags & FOLL_WRITE)) 1838 goto pte_unmap; 1839 1840 if (pte_devmap(pte)) { 1841 if (unlikely(flags & FOLL_LONGTERM)) 1842 goto pte_unmap; 1843 1844 pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); 1845 if (unlikely(!pgmap)) { 1846 undo_dev_pagemap(nr, nr_start, pages); 1847 goto pte_unmap; 1848 } 1849 } else if (pte_special(pte)) 1850 goto pte_unmap; 1851 1852 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 1853 page = pte_page(pte); 1854 1855 head = try_get_compound_head(page, 1); 1856 if (!head) 1857 goto pte_unmap; 1858 1859 if (unlikely(pte_val(pte) != pte_val(*ptep))) { 1860 put_page(head); 1861 goto pte_unmap; 1862 } 1863 1864 VM_BUG_ON_PAGE(compound_head(page) != head, page); 1865 1866 SetPageReferenced(page); 1867 pages[*nr] = page; 1868 (*nr)++; 1869 1870 } while (ptep++, addr += PAGE_SIZE, addr != end); 1871 1872 ret = 1; 1873 1874 pte_unmap: 1875 if (pgmap) 1876 put_dev_pagemap(pgmap); 1877 pte_unmap(ptem); 1878 return ret; 1879 } 1880 #else 1881 1882 /* 1883 * If we can't determine whether or not a pte is special, then fail immediately 1884 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not 1885 * to be special. 1886 * 1887 * For a futex to be placed on a THP tail page, get_futex_key requires a 1888 * __get_user_pages_fast implementation that can pin pages. Thus it's still 1889 * useful to have gup_huge_pmd even if we can't operate on ptes. 1890 */ 1891 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, 1892 unsigned int flags, struct page **pages, int *nr) 1893 { 1894 return 0; 1895 } 1896 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ 1897 1898 #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 1899 static int __gup_device_huge(unsigned long pfn, unsigned long addr, 1900 unsigned long end, struct page **pages, int *nr) 1901 { 1902 int nr_start = *nr; 1903 struct dev_pagemap *pgmap = NULL; 1904 1905 do { 1906 struct page *page = pfn_to_page(pfn); 1907 1908 pgmap = get_dev_pagemap(pfn, pgmap); 1909 if (unlikely(!pgmap)) { 1910 undo_dev_pagemap(nr, nr_start, pages); 1911 return 0; 1912 } 1913 SetPageReferenced(page); 1914 pages[*nr] = page; 1915 get_page(page); 1916 (*nr)++; 1917 pfn++; 1918 } while (addr += PAGE_SIZE, addr != end); 1919 1920 if (pgmap) 1921 put_dev_pagemap(pgmap); 1922 return 1; 1923 } 1924 1925 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, 1926 unsigned long end, struct page **pages, int *nr) 1927 { 1928 unsigned long fault_pfn; 1929 int nr_start = *nr; 1930 1931 fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 1932 if (!__gup_device_huge(fault_pfn, addr, end, pages, nr)) 1933 return 0; 1934 1935 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { 1936 undo_dev_pagemap(nr, nr_start, pages); 1937 return 0; 1938 } 1939 return 1; 1940 } 1941 1942 static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, 1943 unsigned long end, struct page **pages, int *nr) 1944 { 1945 unsigned long fault_pfn; 1946 int nr_start = *nr; 1947 1948 fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 1949 if (!__gup_device_huge(fault_pfn, addr, end, pages, nr)) 1950 return 0; 1951 1952 if (unlikely(pud_val(orig) != pud_val(*pudp))) { 1953 undo_dev_pagemap(nr, nr_start, pages); 1954 return 0; 1955 } 1956 return 1; 1957 } 1958 #else 1959 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, 1960 unsigned long end, struct page 
**pages, int *nr) 1961 { 1962 BUILD_BUG(); 1963 return 0; 1964 } 1965 1966 static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, 1967 unsigned long end, struct page **pages, int *nr) 1968 { 1969 BUILD_BUG(); 1970 return 0; 1971 } 1972 #endif 1973 1974 #ifdef CONFIG_ARCH_HAS_HUGEPD 1975 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, 1976 unsigned long sz) 1977 { 1978 unsigned long __boundary = (addr + sz) & ~(sz-1); 1979 return (__boundary - 1 < end - 1) ? __boundary : end; 1980 } 1981 1982 static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, 1983 unsigned long end, unsigned int flags, struct page **pages, int *nr) 1984 { 1985 unsigned long pte_end; 1986 struct page *head, *page; 1987 pte_t pte; 1988 int refs; 1989 1990 pte_end = (addr + sz) & ~(sz-1); 1991 if (pte_end < end) 1992 end = pte_end; 1993 1994 pte = READ_ONCE(*ptep); 1995 1996 if (!pte_access_permitted(pte, flags & FOLL_WRITE)) 1997 return 0; 1998 1999 /* hugepages are never "special" */ 2000 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 2001 2002 refs = 0; 2003 head = pte_page(pte); 2004 2005 page = head + ((addr & (sz-1)) >> PAGE_SHIFT); 2006 do { 2007 VM_BUG_ON(compound_head(page) != head); 2008 pages[*nr] = page; 2009 (*nr)++; 2010 page++; 2011 refs++; 2012 } while (addr += PAGE_SIZE, addr != end); 2013 2014 head = try_get_compound_head(head, refs); 2015 if (!head) { 2016 *nr -= refs; 2017 return 0; 2018 } 2019 2020 if (unlikely(pte_val(pte) != pte_val(*ptep))) { 2021 /* Could be optimized better */ 2022 *nr -= refs; 2023 while (refs--) 2024 put_page(head); 2025 return 0; 2026 } 2027 2028 SetPageReferenced(head); 2029 return 1; 2030 } 2031 2032 static int gup_huge_pd(hugepd_t hugepd, unsigned long addr, 2033 unsigned int pdshift, unsigned long end, unsigned int flags, 2034 struct page **pages, int *nr) 2035 { 2036 pte_t *ptep; 2037 unsigned long sz = 1UL << hugepd_shift(hugepd); 2038 unsigned long next; 2039 2040 ptep = hugepte_offset(hugepd, addr, pdshift); 2041 do { 2042 next = hugepte_addr_end(addr, end, sz); 2043 if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr)) 2044 return 0; 2045 } while (ptep++, addr = next, addr != end); 2046 2047 return 1; 2048 } 2049 #else 2050 static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, 2051 unsigned pdshift, unsigned long end, unsigned int flags, 2052 struct page **pages, int *nr) 2053 { 2054 return 0; 2055 } 2056 #endif /* CONFIG_ARCH_HAS_HUGEPD */ 2057 2058 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, 2059 unsigned long end, unsigned int flags, struct page **pages, int *nr) 2060 { 2061 struct page *head, *page; 2062 int refs; 2063 2064 if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) 2065 return 0; 2066 2067 if (pmd_devmap(orig)) { 2068 if (unlikely(flags & FOLL_LONGTERM)) 2069 return 0; 2070 return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr); 2071 } 2072 2073 refs = 0; 2074 page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 2075 do { 2076 pages[*nr] = page; 2077 (*nr)++; 2078 page++; 2079 refs++; 2080 } while (addr += PAGE_SIZE, addr != end); 2081 2082 head = try_get_compound_head(pmd_page(orig), refs); 2083 if (!head) { 2084 *nr -= refs; 2085 return 0; 2086 } 2087 2088 if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { 2089 *nr -= refs; 2090 while (refs--) 2091 put_page(head); 2092 return 0; 2093 } 2094 2095 SetPageReferenced(head); 2096 return 1; 2097 } 2098 2099 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, 2100 unsigned long end, unsigned int flags, struct page
**pages, int *nr) 2101 { 2102 struct page *head, *page; 2103 int refs; 2104 2105 if (!pud_access_permitted(orig, flags & FOLL_WRITE)) 2106 return 0; 2107 2108 if (pud_devmap(orig)) { 2109 if (unlikely(flags & FOLL_LONGTERM)) 2110 return 0; 2111 return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr); 2112 } 2113 2114 refs = 0; 2115 page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 2116 do { 2117 pages[*nr] = page; 2118 (*nr)++; 2119 page++; 2120 refs++; 2121 } while (addr += PAGE_SIZE, addr != end); 2122 2123 head = try_get_compound_head(pud_page(orig), refs); 2124 if (!head) { 2125 *nr -= refs; 2126 return 0; 2127 } 2128 2129 if (unlikely(pud_val(orig) != pud_val(*pudp))) { 2130 *nr -= refs; 2131 while (refs--) 2132 put_page(head); 2133 return 0; 2134 } 2135 2136 SetPageReferenced(head); 2137 return 1; 2138 } 2139 2140 static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, 2141 unsigned long end, unsigned int flags, 2142 struct page **pages, int *nr) 2143 { 2144 int refs; 2145 struct page *head, *page; 2146 2147 if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) 2148 return 0; 2149 2150 BUILD_BUG_ON(pgd_devmap(orig)); 2151 refs = 0; 2152 page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); 2153 do { 2154 pages[*nr] = page; 2155 (*nr)++; 2156 page++; 2157 refs++; 2158 } while (addr += PAGE_SIZE, addr != end); 2159 2160 head = try_get_compound_head(pgd_page(orig), refs); 2161 if (!head) { 2162 *nr -= refs; 2163 return 0; 2164 } 2165 2166 if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { 2167 *nr -= refs; 2168 while (refs--) 2169 put_page(head); 2170 return 0; 2171 } 2172 2173 SetPageReferenced(head); 2174 return 1; 2175 } 2176 2177 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, 2178 unsigned int flags, struct page **pages, int *nr) 2179 { 2180 unsigned long next; 2181 pmd_t *pmdp; 2182 2183 pmdp = pmd_offset(&pud, addr); 2184 do { 2185 pmd_t pmd = READ_ONCE(*pmdp); 2186 2187 next = pmd_addr_end(addr, end); 2188 if (!pmd_present(pmd)) 2189 return 0; 2190 2191 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || 2192 pmd_devmap(pmd))) { 2193 /* 2194 * NUMA hinting faults need to be handled in the GUP 2195 * slowpath for accounting purposes and so that they 2196 * can be serialised against THP migration. 
2197 */ 2198 if (pmd_protnone(pmd)) 2199 return 0; 2200 2201 if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, 2202 pages, nr)) 2203 return 0; 2204 2205 } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) { 2206 /* 2207 * architectures can use a different format for hugetlbfs 2208 * pmds than for THP pmds 2209 */ 2210 if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, 2211 PMD_SHIFT, next, flags, pages, nr)) 2212 return 0; 2213 } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr)) 2214 return 0; 2215 } while (pmdp++, addr = next, addr != end); 2216 2217 return 1; 2218 } 2219 2220 static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, 2221 unsigned int flags, struct page **pages, int *nr) 2222 { 2223 unsigned long next; 2224 pud_t *pudp; 2225 2226 pudp = pud_offset(&p4d, addr); 2227 do { 2228 pud_t pud = READ_ONCE(*pudp); 2229 2230 next = pud_addr_end(addr, end); 2231 if (pud_none(pud)) 2232 return 0; 2233 if (unlikely(pud_huge(pud))) { 2234 if (!gup_huge_pud(pud, pudp, addr, next, flags, 2235 pages, nr)) 2236 return 0; 2237 } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { 2238 if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, 2239 PUD_SHIFT, next, flags, pages, nr)) 2240 return 0; 2241 } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr)) 2242 return 0; 2243 } while (pudp++, addr = next, addr != end); 2244 2245 return 1; 2246 } 2247 2248 static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, 2249 unsigned int flags, struct page **pages, int *nr) 2250 { 2251 unsigned long next; 2252 p4d_t *p4dp; 2253 2254 p4dp = p4d_offset(&pgd, addr); 2255 do { 2256 p4d_t p4d = READ_ONCE(*p4dp); 2257 2258 next = p4d_addr_end(addr, end); 2259 if (p4d_none(p4d)) 2260 return 0; 2261 BUILD_BUG_ON(p4d_huge(p4d)); 2262 if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { 2263 if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, 2264 P4D_SHIFT, next, flags, pages, nr)) 2265 return 0; 2266 } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr)) 2267 return 0; 2268 } while (p4dp++, addr = next, addr != end); 2269 2270 return 1; 2271 } 2272 2273 static void gup_pgd_range(unsigned long addr, unsigned long end, 2274 unsigned int flags, struct page **pages, int *nr) 2275 { 2276 unsigned long next; 2277 pgd_t *pgdp; 2278 2279 pgdp = pgd_offset(current->mm, addr); 2280 do { 2281 pgd_t pgd = READ_ONCE(*pgdp); 2282 2283 next = pgd_addr_end(addr, end); 2284 if (pgd_none(pgd)) 2285 return; 2286 if (unlikely(pgd_huge(pgd))) { 2287 if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, 2288 pages, nr)) 2289 return; 2290 } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { 2291 if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, 2292 PGDIR_SHIFT, next, flags, pages, nr)) 2293 return; 2294 } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr)) 2295 return; 2296 } while (pgdp++, addr = next, addr != end); 2297 } 2298 #else 2299 static inline void gup_pgd_range(unsigned long addr, unsigned long end, 2300 unsigned int flags, struct page **pages, int *nr) 2301 { 2302 } 2303 #endif /* CONFIG_HAVE_FAST_GUP */ 2304 2305 #ifndef gup_fast_permitted 2306 /* 2307 * Check if it's allowed to use __get_user_pages_fast() for the range, or 2308 * whether we need to fall back to the slow version: 2309 */ 2310 static bool gup_fast_permitted(unsigned long start, unsigned long end) 2311 { 2312 return true; 2313 } 2314 #endif 2315 2316 /* 2317 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to 2318 * the regular GUP.
2319 * Note a difference with get_user_pages_fast: this always returns the 2320 * number of pages pinned, 0 if no pages were pinned. 2321 * 2322 * If the architecture does not support this function, simply return with no 2323 * pages pinned. 2324 */ 2325 int __get_user_pages_fast(unsigned long start, int nr_pages, int write, 2326 struct page **pages) 2327 { 2328 unsigned long len, end; 2329 unsigned long flags; 2330 int nr = 0; 2331 2332 start = untagged_addr(start) & PAGE_MASK; 2333 len = (unsigned long) nr_pages << PAGE_SHIFT; 2334 end = start + len; 2335 2336 if (end <= start) 2337 return 0; 2338 if (unlikely(!access_ok((void __user *)start, len))) 2339 return 0; 2340 2341 /* 2342 * Disable interrupts. We use the nested form as we can already have 2343 * interrupts disabled by get_futex_key. 2344 * 2345 * With interrupts disabled, we block page table pages from being 2346 * freed from under us. See struct mmu_table_batch comments in 2347 * include/asm-generic/tlb.h for more details. 2348 * 2349 * We do not adopt an rcu_read_lock(.) here as we also want to 2350 * block IPIs that come from THPs splitting. 2351 */ 2352 2353 if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && 2354 gup_fast_permitted(start, end)) { 2355 local_irq_save(flags); 2356 gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr); 2357 local_irq_restore(flags); 2358 } 2359 2360 return nr; 2361 } 2362 EXPORT_SYMBOL_GPL(__get_user_pages_fast); 2363 2364 static int __gup_longterm_unlocked(unsigned long start, int nr_pages, 2365 unsigned int gup_flags, struct page **pages) 2366 { 2367 int ret; 2368 2369 /* 2370 * FIXME: FOLL_LONGTERM does not work with 2371 * get_user_pages_unlocked() (see comments in that function) 2372 */ 2373 if (gup_flags & FOLL_LONGTERM) { 2374 down_read(&current->mm->mmap_sem); 2375 ret = __gup_longterm_locked(current, current->mm, 2376 start, nr_pages, 2377 pages, NULL, gup_flags); 2378 up_read(&current->mm->mmap_sem); 2379 } else { 2380 ret = get_user_pages_unlocked(start, nr_pages, 2381 pages, gup_flags); 2382 } 2383 2384 return ret; 2385 } 2386 2387 /** 2388 * get_user_pages_fast() - pin user pages in memory 2389 * @start: starting user address 2390 * @nr_pages: number of pages from start to pin 2391 * @gup_flags: flags modifying pin behaviour 2392 * @pages: array that receives pointers to the pages pinned. 2393 * Should be at least nr_pages long. 2394 * 2395 * Attempt to pin user pages in memory without taking mm->mmap_sem. 2396 * If not successful, it will fall back to taking the lock and 2397 * calling get_user_pages(). 2398 * 2399 * Returns number of pages pinned. This may be fewer than the number 2400 * requested. If nr_pages is 0 or negative, returns 0. If no pages 2401 * were pinned, returns -errno.
2402 */ 2403 int get_user_pages_fast(unsigned long start, int nr_pages, 2404 unsigned int gup_flags, struct page **pages) 2405 { 2406 unsigned long addr, len, end; 2407 int nr = 0, ret = 0; 2408 2409 if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM))) 2410 return -EINVAL; 2411 2412 start = untagged_addr(start) & PAGE_MASK; 2413 addr = start; 2414 len = (unsigned long) nr_pages << PAGE_SHIFT; 2415 end = start + len; 2416 2417 if (end <= start) 2418 return 0; 2419 if (unlikely(!access_ok((void __user *)start, len))) 2420 return -EFAULT; 2421 2422 if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && 2423 gup_fast_permitted(start, end)) { 2424 local_irq_disable(); 2425 gup_pgd_range(addr, end, gup_flags, pages, &nr); 2426 local_irq_enable(); 2427 ret = nr; 2428 } 2429 2430 if (nr < nr_pages) { 2431 /* Try to get the remaining pages with get_user_pages */ 2432 start += nr << PAGE_SHIFT; 2433 pages += nr; 2434 2435 ret = __gup_longterm_unlocked(start, nr_pages - nr, 2436 gup_flags, pages); 2437 2438 /* Have to be a bit careful with return values */ 2439 if (nr > 0) { 2440 if (ret < 0) 2441 ret = nr; 2442 else 2443 ret += nr; 2444 } 2445 } 2446 2447 return ret; 2448 } 2449 EXPORT_SYMBOL_GPL(get_user_pages_fast); 2450
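/*
 * Editor's note: the sketch below is an illustrative addition, not part of
 * mm/gup.c.  It shows one plausible way a driver might use the
 * get_user_pages_fast() API documented above, releasing the pinned pages
 * with put_user_pages_dirty_lock() from this file once the device has
 * written into them.  The function name example_pin_and_dirty() and its
 * error-handling policy are hypothetical, chosen only for illustration.
 */
static int __maybe_unused example_pin_and_dirty(unsigned long uaddr, size_t len)
{
	int nr_pages = DIV_ROUND_UP(offset_in_page(uaddr) + len, PAGE_SIZE);
	struct page **pages;
	int pinned;

	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/* Pin for writing; may pin fewer pages than requested. */
	pinned = get_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned < 0) {
		kvfree(pages);
		return pinned;
	}

	/*
	 * ... the device would DMA into the pinned pages here ...
	 *
	 * Once the data has landed, mark the pages dirty and drop the
	 * references that get_user_pages_fast() took.
	 */
	put_user_pages_dirty_lock(pages, pinned);
	kvfree(pages);

	return pinned == nr_pages ? 0 : -EFAULT;
}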
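/*
 * Editor's note: a second illustrative sketch, also not part of mm/gup.c.
 * It contrasts the IRQ-safe __get_user_pages_fast() with the regular
 * get_user_pages_fast(): the IRQ-safe variant never falls back to the slow
 * path and returns only the number of pages pinned (never -errno), so a
 * caller that cannot sleep (e.g. the futex code mentioned in the comments
 * above) checks for a short count and arranges its own fallback.
 * example_try_pin_one() is a hypothetical helper name.
 */
static int __maybe_unused example_try_pin_one(unsigned long uaddr,
					      struct page **page)
{
	/* Third argument is "write"; 1 requests a writable mapping. */
	if (__get_user_pages_fast(uaddr, 1, 1, page) != 1)
		return -EFAULT;	/* caller may retry via get_user_pages_fast() */

	return 0;
}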