#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>

#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>

#include "internal.h"

struct follow_page_context {
	struct dev_pagemap *pgmap;
	unsigned int page_mask;
};

typedef int (*set_dirty_func_t)(struct page *page);

static void __put_user_pages_dirty(struct page **pages,
				   unsigned long npages,
				   set_dirty_func_t sdf)
{
	unsigned long index;

	for (index = 0; index < npages; index++) {
		struct page *page = compound_head(pages[index]);

		/*
		 * Checking PageDirty at this point may race with
		 * clear_page_dirty_for_io(), but that's OK. Two key cases:
		 *
		 * 1) This code sees the page as already dirty, so it skips
		 * the call to sdf(). That could happen because
		 * clear_page_dirty_for_io() called page_mkclean(),
		 * followed by set_page_dirty(). However, now the page is
		 * going to get written back, which meets the original
		 * intention of setting it dirty, so all is well:
		 * clear_page_dirty_for_io() goes on to call
		 * TestClearPageDirty(), and write the page back.
		 *
		 * 2) This code sees the page as clean, so it calls sdf().
		 * The page stays dirty, despite being written back, so it
		 * gets written back again in the next writeback cycle.
		 * This is harmless.
		 */
		if (!PageDirty(page))
			sdf(page);

		put_user_page(page);
	}
}

/**
 * put_user_pages_dirty() - release and dirty an array of gup-pinned pages
 * @pages:  array of pages to be marked dirty and released.
 * @npages: number of pages in the @pages array.
 *
 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if it was previously listed as clean. Then, release
 * the page using put_user_page().
 *
 * Please see the put_user_page() documentation for details.
 *
 * set_page_dirty(), which does not lock the page, is used here.
 * Therefore, it is the caller's responsibility to ensure that this is
 * safe. If not, then put_user_pages_dirty_lock() should be called instead.
 *
 */
void put_user_pages_dirty(struct page **pages, unsigned long npages)
{
	__put_user_pages_dirty(pages, npages, set_page_dirty);
}
EXPORT_SYMBOL(put_user_pages_dirty);

/**
 * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages
 * @pages:  array of pages to be marked dirty and released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if it was previously listed as clean. Then, release
 * the page using put_user_page().
 *
 * Please see the put_user_page() documentation for details.
 *
 * This is just like put_user_pages_dirty(), except that it invokes
 * set_page_dirty_lock(), instead of set_page_dirty().
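 *
 * A minimal usage sketch (illustrative only, not a caller in this file),
 * assuming the pages were pinned with get_user_pages() and then written to
 * by a device:
 *
 *	get_user_pages(start, npages, FOLL_WRITE, pages, NULL);
 *	... let the device DMA into the pages ...
 *	put_user_pages_dirty_lock(pages, npages);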
 *
 */
void put_user_pages_dirty_lock(struct page **pages, unsigned long npages)
{
	__put_user_pages_dirty(pages, npages, set_page_dirty_lock);
}
EXPORT_SYMBOL(put_user_pages_dirty_lock);

/**
 * put_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, release the page using put_user_page().
 *
 * Please see the put_user_page() documentation for details.
 */
void put_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long index;

	/*
	 * TODO: this can be optimized for huge pages: if a series of pages is
	 * physically contiguous and part of the same compound page, then a
	 * single operation to the head page should suffice.
	 */
	for (index = 0; index < npages; index++)
		put_user_page(pages[index]);
}
EXPORT_SYMBOL(put_user_pages);

static struct page *no_page_table(struct vm_area_struct *vma,
		unsigned int flags)
{
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate unnecessary pages or
	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return NULL;
}

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	/* No page to get reference */
	if (flags & FOLL_GET)
		return -EFAULT;

	if (flags & FOLL_TOUCH) {
		pte_t entry = *pte;

		if (flags & FOLL_WRITE)
			entry = pte_mkdirty(entry);
		entry = pte_mkyoung(entry);

		if (!pte_same(*pte, entry)) {
			set_pte_at(vma->vm_mm, address, pte, entry);
			update_mmu_cache(vma, address, pte);
		}
	}

	/* Proper page table entry exists, but no corresponding struct page */
	return -EEXIST;
}

/*
 * FOLL_FORCE can write to even unwritable pte's, but only
 * after we've gone through a COW cycle and they are dirty.
 */
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
	return pte_write(pte) ||
		((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
}

static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags,
		struct dev_pagemap **pgmap)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;

retry:
	if (unlikely(pmd_bad(*pmd)))
		return no_page_table(vma, flags);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!pte_present(pte)) {
		swp_entry_t entry;
		/*
		 * KSM's break_ksm() relies upon recognizing a ksm page
		 * even while it is being migrated, so for that case we
		 * need migration_entry_wait().
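		 * The retry path below therefore drops the PTE lock, waits
		 * for the migration entry to be resolved and then retries
		 * the lookup from the top.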
		 */
		if (likely(!(flags & FOLL_MIGRATION)))
			goto no_page;
		if (pte_none(pte))
			goto no_page;
		entry = pte_to_swp_entry(pte);
		if (!is_migration_entry(entry))
			goto no_page;
		pte_unmap_unlock(ptep, ptl);
		migration_entry_wait(mm, pmd, address);
		goto retry;
	}
	if ((flags & FOLL_NUMA) && pte_protnone(pte))
		goto no_page;
	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
		pte_unmap_unlock(ptep, ptl);
		return NULL;
	}

	page = vm_normal_page(vma, address, pte);
	if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
		/*
		 * Only return device mapping pages in the FOLL_GET case since
		 * they are only valid while holding the pgmap reference.
		 */
		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
		if (*pgmap)
			page = pte_page(pte);
		else
			goto no_page;
	} else if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			int ret;

			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}

	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
		int ret;
		get_page(page);
		pte_unmap_unlock(ptep, ptl);
		lock_page(page);
		ret = split_huge_page(page);
		unlock_page(page);
		put_page(page);
		if (ret)
			return ERR_PTR(ret);
		goto retry;
	}

	if (flags & FOLL_GET) {
		if (unlikely(!try_get_page(page))) {
			page = ERR_PTR(-ENOMEM);
			goto out;
		}
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
		/* Do not mlock pte-mapped THP */
		if (PageTransCompound(page))
			goto out;

		/*
		 * The preliminary mapping check is mainly to avoid the
		 * pointless overhead of lock_page on the ZERO_PAGE
		 * which might bounce very badly if there is contention.
		 *
		 * If the page is already locked, we don't need to
		 * handle it now - vmscan will handle it later if and
		 * when it attempts to reclaim the page.
		 */
		if (page->mapping && trylock_page(page)) {
			lru_add_drain();	/* push cached pages to LRU */
			/*
			 * Because we lock page here, and migration is
			 * blocked by the pte's page reference, and we
			 * know the page is still mapped, we don't even
			 * need to check for file-cache page truncation.
			 */
			mlock_vma_page(page);
			unlock_page(page);
		}
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}

static struct page *follow_pmd_mask(struct vm_area_struct *vma,
				    unsigned long address, pud_t *pudp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pmd_t *pmd, pmdval;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pmd = pmd_offset(pudp, address);
	/*
	 * The READ_ONCE() will stabilize the pmdval in a register or
	 * on the stack so that it will stop changing under the code.
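	 * (It can change under us because mmap_sem is only held for read
	 * here; for example, MADV_DONTNEED in another thread may zap it,
	 * as the migration-retry path below notes.)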
	 */
	pmdval = READ_ONCE(*pmd);
	if (pmd_none(pmdval))
		return no_page_table(vma, flags);
	if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
		page = follow_huge_pmd(mm, address, pmd, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pmd_val(pmdval)), flags,
				      PMD_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
retry:
	if (!pmd_present(pmdval)) {
		if (likely(!(flags & FOLL_MIGRATION)))
			return no_page_table(vma, flags);
		VM_BUG_ON(thp_migration_supported() &&
			  !is_pmd_migration_entry(pmdval));
		if (is_pmd_migration_entry(pmdval))
			pmd_migration_entry_wait(mm, pmd);
		pmdval = READ_ONCE(*pmd);
		/*
		 * MADV_DONTNEED may convert the pmd to null because
		 * mmap_sem is held in read mode
		 */
		if (pmd_none(pmdval))
			return no_page_table(vma, flags);
		goto retry;
	}
	if (pmd_devmap(pmdval)) {
		ptl = pmd_lock(mm, pmd);
		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (likely(!pmd_trans_huge(pmdval)))
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

	if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
		return no_page_table(vma, flags);

retry_locked:
	ptl = pmd_lock(mm, pmd);
	if (unlikely(pmd_none(*pmd))) {
		spin_unlock(ptl);
		return no_page_table(vma, flags);
	}
	if (unlikely(!pmd_present(*pmd))) {
		spin_unlock(ptl);
		if (likely(!(flags & FOLL_MIGRATION)))
			return no_page_table(vma, flags);
		pmd_migration_entry_wait(mm, pmd);
		goto retry_locked;
	}
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	if (flags & FOLL_SPLIT) {
		int ret;
		page = pmd_page(*pmd);
		if (is_huge_zero_page(page)) {
			spin_unlock(ptl);
			ret = 0;
			split_huge_pmd(vma, pmd, address);
			if (pmd_trans_unstable(pmd))
				ret = -EBUSY;
		} else {
			if (unlikely(!try_get_page(page))) {
				spin_unlock(ptl);
				return ERR_PTR(-ENOMEM);
			}
			spin_unlock(ptl);
			lock_page(page);
			ret = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (pmd_none(*pmd))
				return no_page_table(vma, flags);
		}

		return ret ? ERR_PTR(ret) :
			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	page = follow_trans_huge_pmd(vma, address, pmd, flags);
	spin_unlock(ptl);
	ctx->page_mask = HPAGE_PMD_NR - 1;
	return page;
}

static struct page *follow_pud_mask(struct vm_area_struct *vma,
				    unsigned long address, p4d_t *p4dp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pud_t *pud;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pud = pud_offset(p4dp, address);
	if (pud_none(*pud))
		return no_page_table(vma, flags);
	if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
		page = follow_huge_pud(mm, address, pud, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pud_val(*pud)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pud_val(*pud)), flags,
				      PUD_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (pud_devmap(*pud)) {
		ptl = pud_lock(mm, pud);
		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (unlikely(pud_bad(*pud)))
		return no_page_table(vma, flags);

	return follow_pmd_mask(vma, address, pud, flags, ctx);
}

static struct page *follow_p4d_mask(struct vm_area_struct *vma,
				    unsigned long address, pgd_t *pgdp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	p4d_t *p4d;
	struct page *page;

	p4d = p4d_offset(pgdp, address);
	if (p4d_none(*p4d))
		return no_page_table(vma, flags);
	BUILD_BUG_ON(p4d_huge(*p4d));
	if (unlikely(p4d_bad(*p4d)))
		return no_page_table(vma, flags);

	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(p4d_val(*p4d)), flags,
				      P4D_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	return follow_pud_mask(vma, address, p4d, flags, ctx);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 *       pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 *
 * On output, the @ctx->page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
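 *
 * Callers therefore typically distinguish the three outcomes with IS_ERR()
 * and NULL checks, roughly (illustrative only):
 *
 *	page = follow_page_mask(vma, address, foll_flags, &ctx);
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	if (!page)
 *		... fault the page in, or report the hole ...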
 */
struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      struct follow_page_context *ctx)
{
	pgd_t *pgd;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	ctx->page_mask = 0;

	/* make this handle hugepd */
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		BUG_ON(flags & FOLL_GET);
		return page;
	}

	pgd = pgd_offset(mm, address);

	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		return no_page_table(vma, flags);

	if (pgd_huge(*pgd)) {
		page = follow_huge_pgd(mm, address, pgd, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pgd_val(*pgd)), flags,
				      PGDIR_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}

	return follow_p4d_mask(vma, address, pgd, flags, ctx);
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			 unsigned int foll_flags)
{
	struct follow_page_context ctx = { NULL };
	struct page *page;

	page = follow_page_mask(vma, address, foll_flags, &ctx);
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return page;
}

static int get_gate_page(struct mm_struct *mm, unsigned long address,
		unsigned int gup_flags, struct vm_area_struct **vma,
		struct page **page)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret = -EFAULT;

	/* user gate pages are read-only */
	if (gup_flags & FOLL_WRITE)
		return -EFAULT;
	if (address > TASK_SIZE)
		pgd = pgd_offset_k(address);
	else
		pgd = pgd_offset_gate(mm, address);
	BUG_ON(pgd_none(*pgd));
	p4d = p4d_offset(pgd, address);
	BUG_ON(p4d_none(*p4d));
	pud = pud_offset(p4d, address);
	BUG_ON(pud_none(*pud));
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return -EFAULT;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	pte = pte_offset_map(pmd, address);
	if (pte_none(*pte))
		goto unmap;
	*vma = get_gate_vma(mm);
	if (!page)
		goto out;
	*page = vm_normal_page(*vma, address, *pte);
	if (!*page) {
		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
			goto unmap;
		*page = pte_page(*pte);

		/*
		 * This should never happen (a device public page in the gate
		 * area).
		 */
		if (is_device_public_page(*page))
			goto unmap;
	}
	if (unlikely(!try_get_page(*page))) {
		ret = -ENOMEM;
		goto unmap;
	}
out:
	ret = 0;
unmap:
	pte_unmap(pte);
	return ret;
}

/*
 * mmap_sem must be held on entry.  If @nonblocking != NULL and
 * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
 * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
 */
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
		unsigned long address, unsigned int *flags, int *nonblocking)
{
	unsigned int fault_flags = 0;
	vm_fault_t ret;

	/* mlock all present pages, but do not fault in new pages */
	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
		return -ENOENT;
	if (*flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (*flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (nonblocking)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
	if (*flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (*flags & FOLL_TRIED) {
		VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
		fault_flags |= FAULT_FLAG_TRIED;
	}

	ret = handle_mm_fault(vma, address, fault_flags);
	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, *flags);

		if (err)
			return err;
		BUG();
	}

	if (tsk) {
		if (ret & VM_FAULT_MAJOR)
			tsk->maj_flt++;
		else
			tsk->min_flt++;
	}

	if (ret & VM_FAULT_RETRY) {
		if (nonblocking && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
			*nonblocking = 0;
		return -EBUSY;
	}

	/*
	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
	 * can thus safely do subsequent page lookups as if they were reads.
	 * But only do so when looping for pte_write is futile: in some cases
	 * userspace may also be wanting to write to the gotten user page,
	 * which a read fault here might prevent (a readonly page might get
	 * reCOWed by userspace write).
	 */
	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
		*flags |= FOLL_COW;
	return 0;
}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
	vm_flags_t vm_flags = vma->vm_flags;
	int write = (gup_flags & FOLL_WRITE);
	int foreign = (gup_flags & FOLL_REMOTE);

	if (vm_flags & (VM_IO | VM_PFNMAP))
		return -EFAULT;

	if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
		return -EFAULT;

	if (write) {
		if (!(vm_flags & VM_WRITE)) {
			if (!(gup_flags & FOLL_FORCE))
				return -EFAULT;
			/*
			 * We used to let the write,force case do COW in a
			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
			 * set a breakpoint in a read-only mapping of an
			 * executable, without corrupting the file (yet only
			 * when that file had been opened for writing!).
			 * Anon pages in shared mappings are surprising: now
			 * just reject it.
			 */
			if (!is_cow_mapping(vm_flags))
				return -EFAULT;
		}
	} else if (!(vm_flags & VM_READ)) {
		if (!(gup_flags & FOLL_FORCE))
			return -EFAULT;
		/*
		 * Is there actually any vma we can reach here which does not
		 * have VM_MAYREAD set?
		 */
		if (!(vm_flags & VM_MAYREAD))
			return -EFAULT;
	}
	/*
	 * gups are always data accesses, not instruction
	 * fetches, so execute=false here
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return -EFAULT;
	return 0;
}

/**
 * __get_user_pages() - pin user pages in memory
 * @tsk:	task_struct of target task
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying pin behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @nonblocking: whether waiting for disk IO or mmap_sem contention
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 * or mmap_sem contention, and if waiting is needed to pin all pages,
 * *@nonblocking will be set to 0.  Further, if @gup_flags does not
 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
 * this case.
 *
 * A caller using such a combination of @nonblocking and @gup_flags
 * must therefore hold the mmap_sem for reading only, and recognize
 * when it's been released.  Otherwise, it must be held for either
 * reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
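 *
 * A minimal calling sketch (illustrative only), taking mmap_sem for read
 * around the call and requesting no retry support:
 *
 *	down_read(&mm->mmap_sem);
 *	ret = __get_user_pages(tsk, mm, start, nr_pages,
 *			       FOLL_GET | FOLL_WRITE, pages, NULL, NULL);
 *	up_read(&mm->mmap_sem);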
 */
static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *nonblocking)
{
	long ret = 0, i = 0;
	struct vm_area_struct *vma = NULL;
	struct follow_page_context ctx = { NULL };

	if (!nr_pages)
		return 0;

	VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

	/*
	 * If FOLL_FORCE is set then do not force a full fault as the hinting
	 * fault information is unrelated to the reference behaviour of a task
	 * using the address space
	 */
	if (!(gup_flags & FOLL_FORCE))
		gup_flags |= FOLL_NUMA;

	do {
		struct page *page;
		unsigned int foll_flags = gup_flags;
		unsigned int page_increm;

		/* first iteration or crossing a vma boundary */
		if (!vma || start >= vma->vm_end) {
			vma = find_extend_vma(mm, start);
			if (!vma && in_gate_area(mm, start)) {
				ret = get_gate_page(mm, start & PAGE_MASK,
						gup_flags, &vma,
						pages ? &pages[i] : NULL);
				if (ret)
					goto out;
				ctx.page_mask = 0;
				goto next_page;
			}

			if (!vma || check_vma_flags(vma, gup_flags)) {
				ret = -EFAULT;
				goto out;
			}
			if (is_vm_hugetlb_page(vma)) {
				i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &nr_pages, i,
						gup_flags, nonblocking);
				continue;
			}
		}
retry:
		/*
		 * If we have a pending SIGKILL, don't keep faulting pages and
		 * potentially allocating memory.
		 */
		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			goto out;
		}
		cond_resched();

		page = follow_page_mask(vma, start, foll_flags, &ctx);
		if (!page) {
			ret = faultin_page(tsk, vma, start, &foll_flags,
					nonblocking);
			switch (ret) {
			case 0:
				goto retry;
			case -EBUSY:
				ret = 0;
				/* FALLTHRU */
			case -EFAULT:
			case -ENOMEM:
			case -EHWPOISON:
				goto out;
			case -ENOENT:
				goto next_page;
			}
			BUG();
		} else if (PTR_ERR(page) == -EEXIST) {
			/*
			 * Proper page table entry exists, but no corresponding
			 * struct page.
			 */
			goto next_page;
		} else if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}
		if (pages) {
			pages[i] = page;
			flush_anon_page(vma, page, start);
			flush_dcache_page(page);
			ctx.page_mask = 0;
		}
next_page:
		if (vmas) {
			vmas[i] = vma;
			ctx.page_mask = 0;
		}
		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
		if (page_increm > nr_pages)
			page_increm = nr_pages;
		i += page_increm;
		start += page_increm * PAGE_SIZE;
		nr_pages -= page_increm;
	} while (nr_pages);
out:
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return i ? i : ret;
}

static bool vma_permits_fault(struct vm_area_struct *vma,
			      unsigned int fault_flags)
{
	bool write = !!(fault_flags & FAULT_FLAG_WRITE);
	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;

	if (!(vm_flags & vma->vm_flags))
		return false;

	/*
	 * The architecture might have a hardware protection
	 * mechanism other than read/write that can deny access.
	 *
	 * gup always represents data access, not instruction
	 * fetches, so execute=false here:
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return false;

	return true;
}

/*
 * fixup_user_fault() - manually resolve a user page fault
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @address:	user address
 * @fault_flags: flags to pass down to handle_mm_fault()
 * @unlocked:	did we unlock the mmap_sem while retrying, maybe NULL if caller
 *		does not allow retry
 *
 * This is meant to be called in the specific scenario where for locking reasons
 * we try to access user memory in atomic context (within a pagefault_disable()
 * section), this returns -EFAULT, and we want to resolve the user fault before
 * trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software.  On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This function will not return with an unlocked mmap_sem. So it does not
 * have the same semantics wrt the @mm->mmap_sem as does filemap_fault().
 */
int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long address, unsigned int fault_flags,
		     bool *unlocked)
{
	struct vm_area_struct *vma;
	vm_fault_t ret, major = 0;

	if (unlocked)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY;

retry:
	vma = find_extend_vma(mm, address);
	if (!vma || address < vma->vm_start)
		return -EFAULT;

	if (!vma_permits_fault(vma, fault_flags))
		return -EFAULT;

	ret = handle_mm_fault(vma, address, fault_flags);
	major |= ret & VM_FAULT_MAJOR;
	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, 0);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		down_read(&mm->mmap_sem);
		if (!(fault_flags & FAULT_FLAG_TRIED)) {
			*unlocked = true;
			fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
			fault_flags |= FAULT_FLAG_TRIED;
			goto retry;
		}
	}

	if (tsk) {
		if (major)
			tsk->maj_flt++;
		else
			tsk->min_flt++;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(fixup_user_fault);

static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
						struct mm_struct *mm,
						unsigned long start,
						unsigned long nr_pages,
						struct page **pages,
						struct vm_area_struct **vmas,
						int *locked,
						unsigned int flags)
{
	long ret, pages_done;
	bool lock_dropped;

	if (locked) {
		/* if VM_FAULT_RETRY can be returned, vmas become invalid */
		BUG_ON(vmas);
		/* check caller initialized locked */
		BUG_ON(*locked != 1);
	}

	if (pages)
		flags |= FOLL_GET;

	pages_done = 0;
	lock_dropped = false;
	for (;;) {
		ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
				       vmas, locked);
		if (!locked)
			/* VM_FAULT_RETRY couldn't trigger, bypass */
			return ret;

		/* VM_FAULT_RETRY cannot return errors */
		if (!*locked) {
			BUG_ON(ret < 0);
			BUG_ON(ret >= nr_pages);
		}

		if (!pages)
			/* If it's a prefault don't insist harder */
			return ret;

		if (ret > 0) {
			nr_pages -= ret;
			pages_done += ret;
			if (!nr_pages)
				break;
		}
		if (*locked) {
			/*
			 * VM_FAULT_RETRY didn't trigger or it was a
			 * FOLL_NOWAIT.
			 */
			if (!pages_done)
				pages_done = ret;
			break;
		}
		/* VM_FAULT_RETRY triggered, so seek to the faulting offset */
		pages += ret;
		start += ret << PAGE_SHIFT;

		/*
		 * Repeat on the address that fired VM_FAULT_RETRY
		 * without FAULT_FLAG_ALLOW_RETRY but with
		 * FAULT_FLAG_TRIED.
		 */
		*locked = 1;
		lock_dropped = true;
		down_read(&mm->mmap_sem);
		ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
				       pages, NULL, NULL);
		if (ret != 1) {
			BUG_ON(ret > 1);
			if (!pages_done)
				pages_done = ret;
			break;
		}
		nr_pages--;
		pages_done++;
		if (!nr_pages)
			break;
		pages++;
		start += PAGE_SIZE;
	}
	if (lock_dropped && *locked) {
		/*
		 * We must let the caller know we temporarily dropped the lock
		 * and so the critical section protected by it was lost.
		 */
		up_read(&mm->mmap_sem);
		*locked = 0;
	}
	return pages_done;
}

/*
 * We can leverage the VM_FAULT_RETRY functionality in the page fault
 * paths better by using either get_user_pages_locked() or
 * get_user_pages_unlocked().
 *
 * get_user_pages_locked() is suitable to replace the form:
 *
 *      down_read(&mm->mmap_sem);
 *      do_something()
 *      get_user_pages(..., pages, NULL);
 *      up_read(&mm->mmap_sem);
 *
 *  to:
 *
 *      int locked = 1;
 *      down_read(&mm->mmap_sem);
 *      do_something()
 *      get_user_pages_locked(..., pages, &locked);
 *      if (locked)
 *          up_read(&mm->mmap_sem);
 */
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   int *locked)
{
	/*
	 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
	 * vmas.  As there are no users of this flag in this call we simply
	 * disallow this option for now.
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
		return -EINVAL;

	return __get_user_pages_locked(current, current->mm, start, nr_pages,
				       pages, NULL, locked,
				       gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_locked);

/*
 * get_user_pages_unlocked() is suitable to replace the form:
 *
 *      down_read(&mm->mmap_sem);
 *      get_user_pages(..., pages, NULL);
 *      up_read(&mm->mmap_sem);
 *
 *  with:
 *
 *      get_user_pages_unlocked(..., pages);
 *
 * It is functionally equivalent to get_user_pages_fast so
 * get_user_pages_fast should be used instead if specific gup_flags
 * (e.g. FOLL_FORCE) are not required.
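 *
 * A concrete call (illustrative only) therefore looks like:
 *
 *	nr = get_user_pages_unlocked(start, nr_pages, pages, FOLL_WRITE);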
 */
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
			     struct page **pages, unsigned int gup_flags)
{
	struct mm_struct *mm = current->mm;
	int locked = 1;
	long ret;

	/*
	 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
	 * vmas.  As there are no users of this flag in this call we simply
	 * disallow this option for now.
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
		return -EINVAL;

	down_read(&mm->mmap_sem);
	ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
				      &locked, gup_flags | FOLL_TOUCH);
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;
}
EXPORT_SYMBOL(get_user_pages_unlocked);

/*
 * get_user_pages_remote() - pin user pages in memory
 * @tsk:	the task_struct to use for page fault accounting, or
 *		NULL if faults are not to be recorded.
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @locked:	pointer to lock flag indicating whether lock is held and
 *		subsequently whether VM_FAULT_RETRY functionality can be
 *		utilised. Lock must initially be held.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held for read or write.
 *
 * get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
 * be called after the page is finished with, and before put_page is called.
 *
 * get_user_pages is typically used for fewer-copy IO operations, to get a
 * handle on the memory by some means other than accesses via the user virtual
 * addresses. The pages may be submitted for DMA to devices or accessed via
 * their kernel linear mapping (via the kmap APIs).
 * Care should be taken to use the correct cache flushing APIs.
 *
 * See also get_user_pages_fast, for performance critical applications.
 *
 * get_user_pages should be phased out in favor of
 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
 * should use get_user_pages because it cannot pass
 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
 */
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *locked)
{
	/*
	 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
	 * vmas.  As there are no users of this flag in this call we simply
	 * disallow this option for now.
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
		return -EINVAL;

	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
				       locked,
				       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);

#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA)
static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
{
	long i;
	struct vm_area_struct *vma_prev = NULL;

	for (i = 0; i < nr_pages; i++) {
		struct vm_area_struct *vma = vmas[i];

		if (vma == vma_prev)
			continue;

		vma_prev = vma;

		if (vma_is_fsdax(vma))
			return true;
	}
	return false;
}

#ifdef CONFIG_CMA
static struct page *new_non_cma_page(struct page *page, unsigned long private)
{
	/*
	 * We want to make sure we allocate the new page from the same node
	 * as the source page.
	 */
	int nid = page_to_nid(page);
	/*
	 * Trying to allocate a page for migration.  Ignore allocation
	 * failure warnings.  We don't force __GFP_THISNODE here because
	 * this node is the node that holds the CMA reservation, and in
	 * some cases such nodes have very little non-movable memory left
	 * for other allocations.
	 */
	gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;

	if (PageHighMem(page))
		gfp_mask |= __GFP_HIGHMEM;

#ifdef CONFIG_HUGETLB_PAGE
	if (PageHuge(page)) {
		struct hstate *h = page_hstate(page);
		/*
		 * We don't want to dequeue from the pool because pool pages will
		 * mostly be from the CMA region.
		 */
		return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
	}
#endif
	if (PageTransHuge(page)) {
		struct page *thp;
		/*
		 * ignore allocation failure warnings
		 */
		gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;

		/*
		 * Remove the movable mask so that we don't allocate from
		 * CMA area again.
		 */
		thp_gfpmask &= ~__GFP_MOVABLE;
		thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
		if (!thp)
			return NULL;
		prep_transhuge_page(thp);
		return thp;
	}

	return __alloc_pages_node(nid, gfp_mask, 0);
}

static long check_and_migrate_cma_pages(struct task_struct *tsk,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long nr_pages,
					struct page **pages,
					struct vm_area_struct **vmas,
					unsigned int gup_flags)
{
	long i;
	bool drain_allow = true;
	bool migrate_allow = true;
	LIST_HEAD(cma_page_list);

check_again:
	for (i = 0; i < nr_pages; i++) {
		/*
		 * If we get a page from the CMA zone, since we are going to
		 * be pinning these entries, we might as well move them out
		 * of the CMA zone if possible.
		 */
		if (is_migrate_cma_page(pages[i])) {

			struct page *head = compound_head(pages[i]);

			if (PageHuge(head)) {
				isolate_huge_page(head, &cma_page_list);
			} else {
				if (!PageLRU(head) && drain_allow) {
					lru_add_drain_all();
					drain_allow = false;
				}

				if (!isolate_lru_page(head)) {
					list_add_tail(&head->lru, &cma_page_list);
					mod_node_page_state(page_pgdat(head),
							    NR_ISOLATED_ANON +
							    page_is_file_cache(head),
							    hpage_nr_pages(head));
				}
			}
		}
	}

	if (!list_empty(&cma_page_list)) {
		/*
		 * drop the above get_user_pages reference.
		 */
		for (i = 0; i < nr_pages; i++)
			put_page(pages[i]);

		if (migrate_pages(&cma_page_list, new_non_cma_page,
				  NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
			/*
			 * some of the pages failed migration. Do get_user_pages
			 * without migration.
			 */
			migrate_allow = false;

			if (!list_empty(&cma_page_list))
				putback_movable_pages(&cma_page_list);
		}
		/*
		 * We did migrate all the pages; try to get the page references
		 * again, migrating any new CMA pages which we failed to isolate
		 * earlier.
		 */
		nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
						   pages, vmas, NULL,
						   gup_flags);

		if ((nr_pages > 0) && migrate_allow) {
			drain_allow = true;
			goto check_again;
		}
	}

	return nr_pages;
}
#else
static long check_and_migrate_cma_pages(struct task_struct *tsk,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long nr_pages,
					struct page **pages,
					struct vm_area_struct **vmas,
					unsigned int gup_flags)
{
	return nr_pages;
}
#endif

/*
 * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
 * allows us to process the FOLL_LONGTERM flag.
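 *
 * "Processing" the flag currently means two extra steps after the pages have
 * been pinned: fail with -EOPNOTSUPP if any page sits in a filesystem-DAX
 * vma (see check_dax_vmas()), and migrate any pages that were allocated from
 * CMA out of that region (see check_and_migrate_cma_pages()).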
 */
static long __gup_longterm_locked(struct task_struct *tsk,
				  struct mm_struct *mm,
				  unsigned long start,
				  unsigned long nr_pages,
				  struct page **pages,
				  struct vm_area_struct **vmas,
				  unsigned int gup_flags)
{
	struct vm_area_struct **vmas_tmp = vmas;
	unsigned long flags = 0;
	long rc, i;

	if (gup_flags & FOLL_LONGTERM) {
		if (!pages)
			return -EINVAL;

		if (!vmas_tmp) {
			vmas_tmp = kcalloc(nr_pages,
					   sizeof(struct vm_area_struct *),
					   GFP_KERNEL);
			if (!vmas_tmp)
				return -ENOMEM;
		}
		flags = memalloc_nocma_save();
	}

	rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
				     vmas_tmp, NULL, gup_flags);

	if (gup_flags & FOLL_LONGTERM) {
		memalloc_nocma_restore(flags);
		if (rc < 0)
			goto out;

		if (check_dax_vmas(vmas_tmp, rc)) {
			for (i = 0; i < rc; i++)
				put_page(pages[i]);
			rc = -EOPNOTSUPP;
			goto out;
		}

		rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
						 vmas_tmp, gup_flags);
	}

out:
	if (vmas_tmp != vmas)
		kfree(vmas_tmp);
	return rc;
}
#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long nr_pages,
						  struct page **pages,
						  struct vm_area_struct **vmas,
						  unsigned int flags)
{
	return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
				       NULL, flags);
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */

/*
 * This is the same as get_user_pages_remote(), just with a
 * less-flexible calling convention where we assume that the task
 * and mm being operated on are the current task's and don't allow
 * passing of a locked parameter.  We also obviously don't pass
 * FOLL_REMOTE in here.
 */
long get_user_pages(unsigned long start, unsigned long nr_pages,
		    unsigned int gup_flags, struct page **pages,
		    struct vm_area_struct **vmas)
{
	return __gup_longterm_locked(current, current->mm, start, nr_pages,
				     pages, vmas, gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);

/**
 * populate_vma_page_range() -  populate a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @nonblocking:
 *
 * This takes care of mlocking the pages too if VM_LOCKED is set.
 *
 * return 0 on success, negative error code on error.
 *
 * vma->vm_mm->mmap_sem must be held.
 *
 * If @nonblocking is NULL, it may be held for read or write and will
 * be unperturbed.
 *
 * If @nonblocking is non-NULL, it must be held for read only and may be
 * released.  If it's released, *@nonblocking will be set to 0.
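 *
 * __mm_populate() below is the typical caller: it passes its &locked flag as
 * @nonblocking so that it can notice when the lock has been dropped.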
 */
long populate_vma_page_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end, int *nonblocking)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long nr_pages = (end - start) / PAGE_SIZE;
	int gup_flags;

	VM_BUG_ON(start & ~PAGE_MASK);
	VM_BUG_ON(end & ~PAGE_MASK);
	VM_BUG_ON_VMA(start < vma->vm_start, vma);
	VM_BUG_ON_VMA(end > vma->vm_end, vma);
	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);

	gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
	if (vma->vm_flags & VM_LOCKONFAULT)
		gup_flags &= ~FOLL_POPULATE;
	/*
	 * We want to touch writable mappings with a write fault in order
	 * to break COW, except for shared mappings because these don't COW
	 * and we would not want to dirty them for nothing.
	 */
	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
		gup_flags |= FOLL_WRITE;

	/*
	 * We want mlock to succeed for regions that have any permissions
	 * other than PROT_NONE.
	 */
	if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
		gup_flags |= FOLL_FORCE;

	/*
	 * We made sure addr is within a VMA, so the following will
	 * not result in a stack expansion that recurses back here.
	 */
	return __get_user_pages(current, mm, start, nr_pages, gup_flags,
				NULL, NULL, nonblocking);
}

/*
 * __mm_populate - populate and/or mlock pages within a range of address space.
 *
 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
 * flags. VMAs must be already marked with the desired vm_flags, and
 * mmap_sem must not be held.
 */
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
	struct mm_struct *mm = current->mm;
	unsigned long end, nstart, nend;
	struct vm_area_struct *vma = NULL;
	int locked = 0;
	long ret = 0;

	end = start + len;

	for (nstart = start; nstart < end; nstart = nend) {
		/*
		 * We want to fault in pages for [nstart; end) address range.
		 * Find first corresponding VMA.
		 */
		if (!locked) {
			locked = 1;
			down_read(&mm->mmap_sem);
			vma = find_vma(mm, nstart);
		} else if (nstart >= vma->vm_end)
			vma = vma->vm_next;
		if (!vma || vma->vm_start >= end)
			break;
		/*
		 * Set [nstart; nend) to intersection of desired address
		 * range with the first VMA. Also, skip undesirable VMA types.
		 */
		nend = min(end, vma->vm_end);
		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
			continue;
		if (nstart < vma->vm_start)
			nstart = vma->vm_start;
		/*
		 * Now fault in a range of pages. populate_vma_page_range()
		 * double checks the vma flags, so that it won't mlock pages
		 * if the vma was already munlocked.
		 */
		ret = populate_vma_page_range(vma, nstart, nend, &locked);
		if (ret < 0) {
			if (ignore_errors) {
				ret = 0;
				continue;	/* continue at next VMA */
			}
			break;
		}
		nend = nstart + ret * PAGE_SIZE;
		ret = 0;
	}
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;	/* 0 or negative error code */
}

/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted into
 * the corefile, to preserve alignment with its headers; and also returns
 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 * allowing a hole to be left in the corefile to save diskspace.
 *
 * Called without mmap_sem, but after all other threads have been killed.
 */
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page;

	if (__get_user_pages(current, current->mm, addr, 1,
			     FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
			     NULL) < 1)
		return NULL;
	flush_cache_page(vma, addr, page_to_pfn(page));
	return page;
}
#endif /* CONFIG_ELF_CORE */

/*
 * Generic Fast GUP
 *
 * get_user_pages_fast attempts to pin user pages by walking the page
 * tables directly and avoids taking locks. Thus the walker needs to be
 * protected from page table pages being freed from under it, and should
 * block any THP splits.
 *
 * One way to achieve this is to have the walker disable interrupts, and
 * rely on IPIs from the TLB flushing code blocking before the page table
 * pages are freed. This is unsuitable for architectures that do not need
 * to broadcast an IPI when invalidating TLBs.
 *
 * Another way to achieve this is to batch up the pages containing page
 * tables belonging to more than one mm_user, then rcu_sched a callback to
 * free those pages. Disabling interrupts will allow the fast_gup walker to
 * both block the rcu_sched callback, and an IPI that we broadcast for
 * splitting THPs (which is a relatively rare event). The code below adopts
 * this strategy.
 *
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
 *     free pages containing page tables or TLB flushing requires IPI broadcast.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) access_ok is sufficient to validate userspace address ranges.
 *
 * The last two assumptions can be relaxed by the addition of helper functions.
 *
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 */
#ifdef CONFIG_HAVE_GENERIC_GUP

#ifndef gup_get_pte
/*
 * We assume that the PTE can be read atomically. If this is not the case for
 * your architecture, please provide the helper.
 */
static inline pte_t gup_get_pte(pte_t *ptep)
{
	return READ_ONCE(*ptep);
}
#endif

static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
{
	while ((*nr) - nr_start) {
		struct page *page = pages[--(*nr)];

		ClearPageReferenced(page);
		put_page(page);
	}
}

/*
 * Return the compound head page with ref appropriately incremented,
 * or NULL if that failed.
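 * It can fail either because the head page's refcount has already gone
 * negative (treated as a saturated/overflowed count) or because the
 * speculative reference via page_cache_add_speculative() was not obtained.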
 */
static inline struct page *try_get_compound_head(struct page *page, int refs)
{
	struct page *head = compound_head(page);
	if (WARN_ON_ONCE(page_ref_count(head) < 0))
		return NULL;
	if (unlikely(!page_cache_add_speculative(head, refs)))
		return NULL;
	return head;
}

#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
			 unsigned int flags, struct page **pages, int *nr)
{
	struct dev_pagemap *pgmap = NULL;
	int nr_start = *nr, ret = 0;
	pte_t *ptep, *ptem;

	ptem = ptep = pte_offset_map(&pmd, addr);
	do {
		pte_t pte = gup_get_pte(ptep);
		struct page *head, *page;

		/*
		 * Similar to the PMD case below, NUMA hinting must take slow
		 * path using the pte_protnone check.
		 */
		if (pte_protnone(pte))
			goto pte_unmap;

		if (!pte_access_permitted(pte, flags & FOLL_WRITE))
			goto pte_unmap;

		if (pte_devmap(pte)) {
			if (unlikely(flags & FOLL_LONGTERM))
				goto pte_unmap;

			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
			if (unlikely(!pgmap)) {
				undo_dev_pagemap(nr, nr_start, pages);
				goto pte_unmap;
			}
		} else if (pte_special(pte))
			goto pte_unmap;

		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
		page = pte_page(pte);

		head = try_get_compound_head(page, 1);
		if (!head)
			goto pte_unmap;

		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
			put_page(head);
			goto pte_unmap;
		}

		VM_BUG_ON_PAGE(compound_head(page) != head, page);

		SetPageReferenced(page);
		pages[*nr] = page;
		(*nr)++;

	} while (ptep++, addr += PAGE_SIZE, addr != end);

	ret = 1;

pte_unmap:
	if (pgmap)
		put_dev_pagemap(pgmap);
	pte_unmap(ptem);
	return ret;
}
#else

/*
 * If we can't determine whether or not a pte is special, then fail immediately
 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
 * to be special.
 *
 * For a futex to be placed on a THP tail page, get_futex_key requires a
 * __get_user_pages_fast implementation that can pin pages. Thus it's still
 * useful to have gup_huge_pmd even if we can't operate on ptes.
 */
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
			 unsigned int flags, struct page **pages, int *nr)
{
	return 0;
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */

#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	int nr_start = *nr;
	struct dev_pagemap *pgmap = NULL;

	do {
		struct page *page = pfn_to_page(pfn);

		pgmap = get_dev_pagemap(pfn, pgmap);
		if (unlikely(!pgmap)) {
			undo_dev_pagemap(nr, nr_start, pages);
			return 0;
		}
		SetPageReferenced(page);
		pages[*nr] = page;
		get_page(page);
		(*nr)++;
		pfn++;
	} while (addr += PAGE_SIZE, addr != end);

	if (pgmap)
		put_dev_pagemap(pgmap);
	return 1;
}

static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	unsigned long fault_pfn;
	int nr_start = *nr;

	fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
		return 0;

	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
		undo_dev_pagemap(nr, nr_start, pages);
		return 0;
	}
	return 1;
}

static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	unsigned long fault_pfn;
	int nr_start = *nr;

	fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
		return 0;

	if (unlikely(pud_val(orig) != pud_val(*pudp))) {
		undo_dev_pagemap(nr, nr_start, pages);
		return 0;
	}
	return 1;
}
#else
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	BUILD_BUG();
	return 0;
}

static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	BUILD_BUG();
	return 0;
}
#endif

static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
	struct page *head, *page;
	int refs;

	if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
		return 0;

	if (pmd_devmap(orig)) {
		if (unlikely(flags & FOLL_LONGTERM))
			return 0;
		return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
	}

	refs = 0;
	page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	do {
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	head = try_get_compound_head(pmd_page(orig), refs);
	if (!head) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	SetPageReferenced(head);
	return 1;
}

static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
		unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
	struct page *head, *page;
	int refs;

FOLL_WRITE)) 1926 return 0; 1927 1928 if (pud_devmap(orig)) { 1929 if (unlikely(flags & FOLL_LONGTERM)) 1930 return 0; 1931 return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr); 1932 } 1933 1934 refs = 0; 1935 page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 1936 do { 1937 pages[*nr] = page; 1938 (*nr)++; 1939 page++; 1940 refs++; 1941 } while (addr += PAGE_SIZE, addr != end); 1942 1943 head = try_get_compound_head(pud_page(orig), refs); 1944 if (!head) { 1945 *nr -= refs; 1946 return 0; 1947 } 1948 1949 if (unlikely(pud_val(orig) != pud_val(*pudp))) { 1950 *nr -= refs; 1951 while (refs--) 1952 put_page(head); 1953 return 0; 1954 } 1955 1956 SetPageReferenced(head); 1957 return 1; 1958 } 1959 1960 static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, 1961 unsigned long end, unsigned int flags, 1962 struct page **pages, int *nr) 1963 { 1964 int refs; 1965 struct page *head, *page; 1966 1967 if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) 1968 return 0; 1969 1970 BUILD_BUG_ON(pgd_devmap(orig)); 1971 refs = 0; 1972 page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); 1973 do { 1974 pages[*nr] = page; 1975 (*nr)++; 1976 page++; 1977 refs++; 1978 } while (addr += PAGE_SIZE, addr != end); 1979 1980 head = try_get_compound_head(pgd_page(orig), refs); 1981 if (!head) { 1982 *nr -= refs; 1983 return 0; 1984 } 1985 1986 if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) { 1987 *nr -= refs; 1988 while (refs--) 1989 put_page(head); 1990 return 0; 1991 } 1992 1993 SetPageReferenced(head); 1994 return 1; 1995 } 1996 1997 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, 1998 unsigned int flags, struct page **pages, int *nr) 1999 { 2000 unsigned long next; 2001 pmd_t *pmdp; 2002 2003 pmdp = pmd_offset(&pud, addr); 2004 do { 2005 pmd_t pmd = READ_ONCE(*pmdp); 2006 2007 next = pmd_addr_end(addr, end); 2008 if (!pmd_present(pmd)) 2009 return 0; 2010 2011 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || 2012 pmd_devmap(pmd))) { 2013 /* 2014 * NUMA hinting faults need to be handled in the GUP 2015 * slowpath for accounting purposes and so that they 2016 * can be serialised against THP migration. 
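			 *
			 * pmd_protnone() entries are therefore rejected just
			 * below (return 0), which pushes the caller onto that
			 * slow path.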
			 */
			if (pmd_protnone(pmd))
				return 0;

			if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
					  pages, nr))
				return 0;

		} else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
			/*
			 * Architectures may use a different layout for
			 * hugetlbfs PMDs than for THP PMDs.
			 */
			if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
					 PMD_SHIFT, next, flags, pages, nr))
				return 0;
		} else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
			return 0;
	} while (pmdp++, addr = next, addr != end);

	return 1;
}

static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	pud_t *pudp;

	pudp = pud_offset(&p4d, addr);
	do {
		pud_t pud = READ_ONCE(*pudp);

		next = pud_addr_end(addr, end);
		if (pud_none(pud))
			return 0;
		if (unlikely(pud_huge(pud))) {
			if (!gup_huge_pud(pud, pudp, addr, next, flags,
					  pages, nr))
				return 0;
		} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
			if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
					 PUD_SHIFT, next, flags, pages, nr))
				return 0;
		} else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
			return 0;
	} while (pudp++, addr = next, addr != end);

	return 1;
}

static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	p4d_t *p4dp;

	p4dp = p4d_offset(&pgd, addr);
	do {
		p4d_t p4d = READ_ONCE(*p4dp);

		next = p4d_addr_end(addr, end);
		if (p4d_none(p4d))
			return 0;
		BUILD_BUG_ON(p4d_huge(p4d));
		if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
			if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
					 P4D_SHIFT, next, flags, pages, nr))
				return 0;
		} else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
			return 0;
	} while (p4dp++, addr = next, addr != end);

	return 1;
}

static void gup_pgd_range(unsigned long addr, unsigned long end,
		unsigned int flags, struct page **pages, int *nr)
{
	unsigned long next;
	pgd_t *pgdp;

	pgdp = pgd_offset(current->mm, addr);
	do {
		pgd_t pgd = READ_ONCE(*pgdp);

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			return;
		if (unlikely(pgd_huge(pgd))) {
			if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
					  pages, nr))
				return;
		} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
			if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
					 PGDIR_SHIFT, next, flags, pages, nr))
				return;
		} else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
			return;
	} while (pgdp++, addr = next, addr != end);
}

#ifndef gup_fast_permitted
/*
 * Check whether it's allowed to use __get_user_pages_fast() for the range, or
 * whether we need to fall back to the slow version:
 */
bool gup_fast_permitted(unsigned long start, int nr_pages)
{
	unsigned long len, end;

	len = (unsigned long) nr_pages << PAGE_SHIFT;
	end = start + len;
	return end >= start;
}
#endif

/*
 * Like get_user_pages_fast(), except that it is IRQ-safe: it never falls
 * back to the regular, sleeping GUP path.
 * Note one difference from get_user_pages_fast(): this always returns the
 * number of pages pinned, and 0 if no pages were pinned.
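 *
 * An illustrative caller sketch (hypothetical, not taken from an in-tree
 * user): try this IRQ-safe variant first and only fall back to a sleeping
 * GUP call when nothing could be pinned:
 *
 *	nr = __get_user_pages_fast(addr, 1, 1, &page);
 *	if (nr != 1)
 *		nr = get_user_pages_fast(addr, 1, FOLL_WRITE, &page);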
 */
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
			  struct page **pages)
{
	unsigned long len, end;
	unsigned long flags;
	int nr = 0;

	start &= PAGE_MASK;
	len = (unsigned long) nr_pages << PAGE_SHIFT;
	end = start + len;

	if (unlikely(!access_ok((void __user *)start, len)))
		return 0;

	/*
	 * Disable interrupts. We use the nested form as we can already have
	 * interrupts disabled by get_futex_key.
	 *
	 * With interrupts disabled, we block page table pages from being
	 * freed from under us. See struct mmu_table_batch comments in
	 * include/asm-generic/tlb.h for more details.
	 *
	 * We do not adopt an rcu_read_lock() here as we also want to
	 * block IPIs that come from THPs splitting.
	 */

	if (gup_fast_permitted(start, nr_pages)) {
		local_irq_save(flags);
		gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
		local_irq_restore(flags);
	}

	return nr;
}

static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
				   unsigned int gup_flags, struct page **pages)
{
	int ret;

	/*
	 * FIXME: FOLL_LONGTERM does not work with
	 * get_user_pages_unlocked() (see comments in that function)
	 */
	if (gup_flags & FOLL_LONGTERM) {
		down_read(&current->mm->mmap_sem);
		ret = __gup_longterm_locked(current, current->mm,
					    start, nr_pages,
					    pages, NULL, gup_flags);
		up_read(&current->mm->mmap_sem);
	} else {
		ret = get_user_pages_unlocked(start, nr_pages,
					      pages, gup_flags);
	}

	return ret;
}

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start: starting user address
 * @nr_pages: number of pages from start to pin
 * @gup_flags: flags modifying pin behaviour
 * @pages: array that receives pointers to the pages pinned.
 *         Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_sem.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages,
			unsigned int gup_flags, struct page **pages)
{
	unsigned long addr, len, end;
	int nr = 0, ret = 0;

	start &= PAGE_MASK;
	addr = start;
	len = (unsigned long) nr_pages << PAGE_SHIFT;
	end = start + len;

	if (nr_pages <= 0)
		return 0;

	if (unlikely(!access_ok((void __user *)start, len)))
		return -EFAULT;

	if (gup_fast_permitted(start, nr_pages)) {
		local_irq_disable();
		gup_pgd_range(addr, end, gup_flags, pages, &nr);
		local_irq_enable();
		ret = nr;
	}

	if (nr < nr_pages) {
		/* Try to get the remaining pages with get_user_pages */
		start += nr << PAGE_SHIFT;
		pages += nr;

		ret = __gup_longterm_unlocked(start, nr_pages - nr,
					      gup_flags, pages);

		/* Have to be a bit careful with return values */
		if (nr > 0) {
			if (ret < 0)
				ret = nr;
			else
				ret += nr;
		}
	}

	return ret;
}

#endif /* CONFIG_HAVE_GENERIC_GUP */
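
/*
 * Usage sketch (illustrative only; my_pin_user_buf(), buf and the DMA step
 * are hypothetical and not part of this file): a typical driver pins user
 * pages with get_user_pages_fast(), performs its I/O, and then releases the
 * pages with the put_user_page*() helpers defined earlier in this file.
 * Error handling for a partial pin is omitted here.
 *
 *	static int my_pin_user_buf(unsigned long buf, int nr_pages,
 *				   struct page **pages)
 *	{
 *		int nr;
 *
 *		nr = get_user_pages_fast(buf, nr_pages, FOLL_WRITE, pages);
 *		if (nr <= 0)
 *			return nr;
 *
 *		... DMA into the pinned pages ...
 *
 *		put_user_pages_dirty_lock(pages, nr);
 *		return nr;
 *	}
 */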