#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <asm/pgtable.h>

#include "internal.h"

static struct page *no_page_table(struct vm_area_struct *vma,
                unsigned int flags)
{
        /*
         * When core dumping an enormous anonymous area that nobody
         * has touched so far, we don't want to allocate unnecessary pages or
         * page tables.  Return error instead of NULL to skip handle_mm_fault,
         * then get_dump_page() will return NULL to leave a hole in the dump.
         * But we can only make this optimization where a hole would surely
         * be zero-filled if handle_mm_fault() actually did handle it.
         */
        if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault))
                return ERR_PTR(-EFAULT);
        return NULL;
}

static struct page *follow_page_pte(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd, unsigned int flags)
{
        struct mm_struct *mm = vma->vm_mm;
        struct page *page;
        spinlock_t *ptl;
        pte_t *ptep, pte;

retry:
        if (unlikely(pmd_bad(*pmd)))
                return no_page_table(vma, flags);

        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
        pte = *ptep;
        if (!pte_present(pte)) {
                swp_entry_t entry;
                /*
                 * KSM's break_ksm() relies upon recognizing a ksm page
                 * even while it is being migrated, so for that case we
                 * need migration_entry_wait().
                 */
                if (likely(!(flags & FOLL_MIGRATION)))
                        goto no_page;
                if (pte_none(pte) || pte_file(pte))
                        goto no_page;
                entry = pte_to_swp_entry(pte);
                if (!is_migration_entry(entry))
                        goto no_page;
                pte_unmap_unlock(ptep, ptl);
                migration_entry_wait(mm, pmd, address);
                goto retry;
        }
        if ((flags & FOLL_NUMA) && pte_numa(pte))
                goto no_page;
        if ((flags & FOLL_WRITE) && !pte_write(pte)) {
                pte_unmap_unlock(ptep, ptl);
                return NULL;
        }

        page = vm_normal_page(vma, address, pte);
        if (unlikely(!page)) {
                if ((flags & FOLL_DUMP) ||
                    !is_zero_pfn(pte_pfn(pte)))
                        goto bad_page;
                page = pte_page(pte);
        }

        if (flags & FOLL_GET)
                get_page_foll(page);
        if (flags & FOLL_TOUCH) {
                if ((flags & FOLL_WRITE) &&
                    !pte_dirty(pte) && !PageDirty(page))
                        set_page_dirty(page);
                /*
                 * pte_mkyoung() would be more correct here, but atomic care
                 * is needed to avoid losing the dirty bit: it is easier to use
                 * mark_page_accessed().
                 */
                mark_page_accessed(page);
        }
        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
                /*
                 * The preliminary mapping check is mainly to avoid the
                 * pointless overhead of lock_page on the ZERO_PAGE
                 * which might bounce very badly if there is contention.
                 *
                 * If the page is already locked, we don't need to
                 * handle it now - vmscan will handle it later if and
                 * when it attempts to reclaim the page.
                 */
                if (page->mapping && trylock_page(page)) {
                        lru_add_drain();        /* push cached pages to LRU */
                        /*
                         * Because we lock page here, and migration is
                         * blocked by the pte's page reference, and we
                         * know the page is still mapped, we don't even
                         * need to check for file-cache page truncation.
                         */
                        mlock_vma_page(page);
                        unlock_page(page);
                }
        }
        pte_unmap_unlock(ptep, ptl);
        return page;
bad_page:
        pte_unmap_unlock(ptep, ptl);
        return ERR_PTR(-EFAULT);

no_page:
        pte_unmap_unlock(ptep, ptl);
        if (!pte_none(pte))
                return NULL;
        return no_page_table(vma, flags);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @page_mask: on output, *page_mask is set according to the size of the page
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * Returns the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
struct page *follow_page_mask(struct vm_area_struct *vma,
                              unsigned long address, unsigned int flags,
                              unsigned int *page_mask)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        spinlock_t *ptl;
        struct page *page;
        struct mm_struct *mm = vma->vm_mm;

        *page_mask = 0;

        page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
        if (!IS_ERR(page)) {
                BUG_ON(flags & FOLL_GET);
                return page;
        }

        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
                return no_page_table(vma, flags);

        pud = pud_offset(pgd, address);
        if (pud_none(*pud))
                return no_page_table(vma, flags);
        if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
                if (flags & FOLL_GET)
                        return NULL;
                page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
                return page;
        }
        if (unlikely(pud_bad(*pud)))
                return no_page_table(vma, flags);

        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                return no_page_table(vma, flags);
        if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
                page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
                if (flags & FOLL_GET) {
                        /*
                         * Refcounts on tail pages are not well-defined and
                         * shouldn't be taken. The caller should handle a NULL
                         * return when trying to follow tail pages.
                         */
                        if (PageHead(page))
                                get_page(page);
                        else
                                page = NULL;
                }
                return page;
        }
        if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
                return no_page_table(vma, flags);
        if (pmd_trans_huge(*pmd)) {
                if (flags & FOLL_SPLIT) {
                        split_huge_page_pmd(vma, address, pmd);
                        return follow_page_pte(vma, address, pmd, flags);
                }
                ptl = pmd_lock(mm, pmd);
                if (likely(pmd_trans_huge(*pmd))) {
                        if (unlikely(pmd_trans_splitting(*pmd))) {
                                spin_unlock(ptl);
                                wait_split_huge_page(vma->anon_vma, pmd);
                        } else {
                                page = follow_trans_huge_pmd(vma, address,
                                                             pmd, flags);
                                spin_unlock(ptl);
                                *page_mask = HPAGE_PMD_NR - 1;
                                return page;
                        }
                } else
                        spin_unlock(ptl);
        }
        return follow_page_pte(vma, address, pmd, flags);
}

static int get_gate_page(struct mm_struct *mm, unsigned long address,
                unsigned int gup_flags, struct vm_area_struct **vma,
                struct page **page)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        int ret = -EFAULT;

        /* user gate pages are read-only */
        if (gup_flags & FOLL_WRITE)
                return -EFAULT;
        if (address > TASK_SIZE)
                pgd = pgd_offset_k(address);
        else
                pgd = pgd_offset_gate(mm, address);
        BUG_ON(pgd_none(*pgd));
        pud = pud_offset(pgd, address);
        BUG_ON(pud_none(*pud));
        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                return -EFAULT;
        VM_BUG_ON(pmd_trans_huge(*pmd));
        pte = pte_offset_map(pmd, address);
        if (pte_none(*pte))
                goto unmap;
        *vma = get_gate_vma(mm);
        if (!page)
                goto out;
        *page = vm_normal_page(*vma, address, *pte);
        if (!*page) {
                if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
                        goto unmap;
                *page = pte_page(*pte);
        }
        get_page(*page);
out:
        ret = 0;
unmap:
        pte_unmap(pte);
        return ret;
}

/*
 * mmap_sem must be held on entry.  If @nonblocking != NULL and
 * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released.
 * If it is, *@nonblocking will be set to 0 and -EBUSY returned.
 */
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
                unsigned long address, unsigned int *flags, int *nonblocking)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned int fault_flags = 0;
        int ret;

        /* For mlock, just skip the stack guard page. */
        if ((*flags & FOLL_MLOCK) &&
            (stack_guard_page_start(vma, address) ||
             stack_guard_page_end(vma, address + PAGE_SIZE)))
                return -ENOENT;
        if (*flags & FOLL_WRITE)
                fault_flags |= FAULT_FLAG_WRITE;
        if (nonblocking)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY;
        if (*flags & FOLL_NOWAIT)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
        if (*flags & FOLL_TRIED) {
                VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
                fault_flags |= FAULT_FLAG_TRIED;
        }

        ret = handle_mm_fault(mm, vma, address, fault_flags);
        if (ret & VM_FAULT_ERROR) {
                if (ret & VM_FAULT_OOM)
                        return -ENOMEM;
                if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
                        return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
                if (ret & VM_FAULT_SIGBUS)
                        return -EFAULT;
                BUG();
        }

        if (tsk) {
                if (ret & VM_FAULT_MAJOR)
                        tsk->maj_flt++;
                else
                        tsk->min_flt++;
        }

        if (ret & VM_FAULT_RETRY) {
                if (nonblocking)
                        *nonblocking = 0;
                return -EBUSY;
        }

        /*
         * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
         * necessary, even if maybe_mkwrite decided not to set pte_write. We
         * can thus safely do subsequent page lookups as if they were reads.
         * But only do so when looping for pte_write is futile: in some cases
         * userspace may also be wanting to write to the gotten user page,
         * which a read fault here might prevent (a readonly page might get
         * reCOWed by userspace write).
         */
        if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
                *flags &= ~FOLL_WRITE;
        return 0;
}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
        vm_flags_t vm_flags = vma->vm_flags;

        if (vm_flags & (VM_IO | VM_PFNMAP))
                return -EFAULT;

        if (gup_flags & FOLL_WRITE) {
                if (!(vm_flags & VM_WRITE)) {
                        if (!(gup_flags & FOLL_FORCE))
                                return -EFAULT;
                        /*
                         * We used to let the write,force case do COW in a
                         * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
                         * set a breakpoint in a read-only mapping of an
                         * executable, without corrupting the file (yet only
                         * when that file had been opened for writing!).
                         * Anon pages in shared mappings are surprising: now
                         * just reject it.
                         */
                        if (!is_cow_mapping(vm_flags)) {
                                WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
                                return -EFAULT;
                        }
                }
        } else if (!(vm_flags & VM_READ)) {
                if (!(gup_flags & FOLL_FORCE))
                        return -EFAULT;
                /*
                 * Is there actually any vma we can reach here which does not
                 * have VM_MAYREAD set?
                 */
                if (!(vm_flags & VM_MAYREAD))
                        return -EFAULT;
        }
        return 0;
}

/**
 * __get_user_pages() - pin user pages in memory
 * @tsk:        task_struct of target task
 * @mm:         mm_struct of target mm
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long. Or NULL, if caller
 *              only intends to ensure the pages are faulted in.
 * @vmas:       array of pointers to vmas corresponding to each page.
 *              Or NULL if the caller does not require them.
 * @nonblocking: whether waiting for disk IO or mmap_sem contention
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
 * or mmap_sem contention, and if waiting is needed to pin all pages,
 * *@nonblocking will be set to 0.  Further, if @gup_flags does not
 * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
 * this case.
 *
 * A caller using such a combination of @nonblocking and @gup_flags
 * must therefore hold the mmap_sem for reading only, and recognize
 * when it's been released.  Otherwise, it must be held for either
 * reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long start, unsigned long nr_pages,
                unsigned int gup_flags, struct page **pages,
                struct vm_area_struct **vmas, int *nonblocking)
{
        long i = 0;
        unsigned int page_mask;
        struct vm_area_struct *vma = NULL;

        if (!nr_pages)
                return 0;

        VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

        /*
         * If FOLL_FORCE is set then do not force a full fault as the hinting
         * fault information is unrelated to the reference behaviour of a task
         * using the address space
         */
        if (!(gup_flags & FOLL_FORCE))
                gup_flags |= FOLL_NUMA;

        do {
                struct page *page;
                unsigned int foll_flags = gup_flags;
                unsigned int page_increm;

                /* first iteration or cross vma bound */
                if (!vma || start >= vma->vm_end) {
                        vma = find_extend_vma(mm, start);
                        if (!vma && in_gate_area(mm, start)) {
                                int ret;
                                ret = get_gate_page(mm, start & PAGE_MASK,
                                                gup_flags, &vma,
                                                pages ? &pages[i] : NULL);
                                if (ret)
                                        return i ? : ret;
                                page_mask = 0;
                                goto next_page;
                        }

                        if (!vma || check_vma_flags(vma, gup_flags))
                                return i ? : -EFAULT;
                        if (is_vm_hugetlb_page(vma)) {
                                i = follow_hugetlb_page(mm, vma, pages, vmas,
                                                &start, &nr_pages, i,
                                                gup_flags);
                                continue;
                        }
                }
retry:
                /*
                 * If we have a pending SIGKILL, don't keep faulting pages and
                 * potentially allocating memory.
                 */
                if (unlikely(fatal_signal_pending(current)))
                        return i ? i : -ERESTARTSYS;
                cond_resched();
                page = follow_page_mask(vma, start, foll_flags, &page_mask);
                if (!page) {
                        int ret;
                        ret = faultin_page(tsk, vma, start, &foll_flags,
                                        nonblocking);
                        switch (ret) {
                        case 0:
                                goto retry;
                        case -EFAULT:
                        case -ENOMEM:
                        case -EHWPOISON:
                                return i ? i : ret;
                        case -EBUSY:
                                return i;
                        case -ENOENT:
                                goto next_page;
                        }
                        BUG();
                }
                if (IS_ERR(page))
                        return i ? i : PTR_ERR(page);
                if (pages) {
                        pages[i] = page;
                        flush_anon_page(vma, page, start);
                        flush_dcache_page(page);
                        page_mask = 0;
                }
next_page:
                if (vmas) {
                        vmas[i] = vma;
                        page_mask = 0;
                }
                page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
                if (page_increm > nr_pages)
                        page_increm = nr_pages;
                i += page_increm;
                start += page_increm * PAGE_SIZE;
                nr_pages -= page_increm;
        } while (nr_pages);
        return i;
}
EXPORT_SYMBOL(__get_user_pages);

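/*
 * Illustrative sketch (editorial addition, not part of the kernel API
 * documentation): one way a caller might use the @nonblocking protocol
 * described above.  The names "addr", "pages" and "locked" are
 * hypothetical.
 *
 *      int locked = 1;
 *      long ret;
 *
 *      down_read(&mm->mmap_sem);
 *      ret = __get_user_pages(tsk, mm, addr, 1, FOLL_TOUCH | FOLL_GET,
 *                             pages, NULL, &locked);
 *      if (locked)
 *              up_read(&mm->mmap_sem);
 *      (if !locked, __get_user_pages has already released the mmap_sem)
 */
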
/*
 * fixup_user_fault() - manually resolve a user page fault
 * @tsk:        the task_struct to use for page fault accounting, or
 *              NULL if faults are not to be recorded.
 * @mm:         mm_struct of target mm
 * @address:    user address
 * @fault_flags: flags to pass down to handle_mm_fault()
 *
 * This is meant to be called in the specific scenario where for locking
 * reasons we try to access user memory in atomic context (within a
 * pagefault_disable() section), the access returns -EFAULT, and we want to
 * resolve the user fault before trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software.  On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
 */
int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                     unsigned long address, unsigned int fault_flags)
{
        struct vm_area_struct *vma;
        vm_flags_t vm_flags;
        int ret;

        vma = find_extend_vma(mm, address);
        if (!vma || address < vma->vm_start)
                return -EFAULT;

        vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
        if (!(vm_flags & vma->vm_flags))
                return -EFAULT;

        ret = handle_mm_fault(mm, vma, address, fault_flags);
        if (ret & VM_FAULT_ERROR) {
                if (ret & VM_FAULT_OOM)
                        return -ENOMEM;
                if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
                        return -EHWPOISON;
                if (ret & VM_FAULT_SIGBUS)
                        return -EFAULT;
                BUG();
        }
        if (tsk) {
                if (ret & VM_FAULT_MAJOR)
                        tsk->maj_flt++;
                else
                        tsk->min_flt++;
        }
        return 0;
}

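/*
 * Illustrative sketch (editorial addition): the pattern used by callers
 * such as the futex code, which take the fault outside of a
 * pagefault_disable() section and then retry the atomic access.  The
 * name "uaddr" is hypothetical.
 *
 *      down_read(&mm->mmap_sem);
 *      ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
 *                             FAULT_FLAG_WRITE);
 *      up_read(&mm->mmap_sem);
 *      if (ret)
 *              return ret;
 *      (then repeat the pagefault_disable()d access that failed)
 */
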
/*
 * get_user_pages() - pin user pages in memory
 * @tsk:        the task_struct to use for page fault accounting, or
 *              NULL if faults are not to be recorded.
 * @mm:         mm_struct of target mm
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @write:      whether pages will be written to by the caller
 * @force:      whether to force access even when user mapping is currently
 *              protected (but never forces write access to shared mapping).
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long. Or NULL, if caller
 *              only intends to ensure the pages are faulted in.
 * @vmas:       array of pointers to vmas corresponding to each page.
 *              Or NULL if the caller does not require them.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held for read or write.
 *
 * get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If write=0, the page must not be written to. If the page is written to,
 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
 * after the page is finished with, and before put_page is called.
 *
 * get_user_pages is typically used for fewer-copy IO operations, to get a
 * handle on the memory by some means other than accesses via the user virtual
 * addresses. The pages may be submitted for DMA to devices or accessed via
 * their kernel linear mapping (via the kmap APIs). Care should be taken to
 * use the correct cache flushing APIs.
 *
 * See also get_user_pages_fast, for performance critical applications.
 */
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                unsigned long start, unsigned long nr_pages, int write,
                int force, struct page **pages, struct vm_area_struct **vmas)
{
        int flags = FOLL_TOUCH;

        if (pages)
                flags |= FOLL_GET;
        if (write)
                flags |= FOLL_WRITE;
        if (force)
                flags |= FOLL_FORCE;

        return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
                                NULL);
}
EXPORT_SYMBOL(get_user_pages);

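/*
 * Illustrative sketch (editorial addition): the typical pin/use/release
 * cycle described above.  "user_addr", "npages" and "pages" are
 * hypothetical; error handling is abbreviated.
 *
 *      down_read(&current->mm->mmap_sem);
 *      got = get_user_pages(current, current->mm, user_addr, npages,
 *                           1, 0, pages, NULL);
 *      up_read(&current->mm->mmap_sem);
 *      if (got < 0)
 *              return got;
 *
 *      ... access the pages, e.g. via kmap() or by mapping them for DMA ...
 *
 *      for (i = 0; i < got; i++) {
 *              set_page_dirty_lock(pages[i]);  (only if the page was written)
 *              put_page(pages[i]);
 *      }
 */
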
/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by page_cache_release() or put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted into
 * the corefile, to preserve alignment with its headers; and also returns
 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 * allowing a hole to be left in the corefile to save diskspace.
 *
 * Called without mmap_sem, but after all other threads have been killed.
 */
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
{
        struct vm_area_struct *vma;
        struct page *page;

        if (__get_user_pages(current, current->mm, addr, 1,
                             FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
                             NULL) < 1)
                return NULL;
        flush_cache_page(vma, addr, page_to_pfn(page));
        return page;
}
#endif /* CONFIG_ELF_CORE */

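/*
 * Illustrative sketch (editorial addition): how a core dumper is expected
 * to consume get_dump_page() - a NULL return becomes a hole in the core
 * file rather than an error.  "dump_emit_page" stands in for the dumper's
 * own emit helper; dump_skip() is the usual way to leave a hole.
 *
 *      for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
 *              struct page *page = get_dump_page(addr);
 *
 *              if (page) {
 *                      dump_emit_page(cprm, page);
 *                      put_page(page);
 *              } else {
 *                      dump_skip(cprm, PAGE_SIZE);
 *              }
 *      }
 */
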
/*
 * Generic RCU Fast GUP
 *
 * get_user_pages_fast attempts to pin user pages by walking the page
 * tables directly and avoids taking locks. Thus the walker needs to be
 * protected from page table pages being freed from under it, and should
 * block any THP splits.
 *
 * One way to achieve this is to have the walker disable interrupts, and
 * rely on IPIs from the TLB flushing code blocking before the page table
 * pages are freed. This is unsuitable for architectures that do not need
 * to broadcast an IPI when invalidating TLBs.
 *
 * Another way to achieve this is to batch up page table containing pages
 * belonging to more than one mm_user, then rcu_sched a callback to free those
 * pages. Disabling interrupts will allow the fast_gup walker to both block
 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
 * (which is a relatively rare event). The code below adopts this strategy.
 *
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
 *     pages containing page tables.
 *
 *  *) THP splits will broadcast an IPI, this can be achieved by overriding
 *     pmdp_splitting_flush.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) access_ok is sufficient to validate userspace address ranges.
 *
 * The last two assumptions can be relaxed by the addition of helper
 * functions.
 *
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 */
#ifdef CONFIG_HAVE_GENERIC_RCU_GUP

#ifdef __HAVE_ARCH_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
                         int write, struct page **pages, int *nr)
{
        pte_t *ptep, *ptem;
        int ret = 0;

        ptem = ptep = pte_offset_map(&pmd, addr);
        do {
                /*
                 * In the line below we are assuming that the pte can be read
                 * atomically. If this is not the case for your architecture,
                 * please wrap this in a helper function!
                 *
                 * for an example see gup_get_pte in arch/x86/mm/gup.c
                 */
                pte_t pte = ACCESS_ONCE(*ptep);
                struct page *page;

                /*
                 * Similar to the PMD case below, NUMA hinting must take slow
                 * path
                 */
                if (!pte_present(pte) || pte_special(pte) ||
                    pte_numa(pte) || (write && !pte_write(pte)))
                        goto pte_unmap;

                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);

                if (!page_cache_get_speculative(page))
                        goto pte_unmap;

                if (unlikely(pte_val(pte) != pte_val(*ptep))) {
                        put_page(page);
                        goto pte_unmap;
                }

                pages[*nr] = page;
                (*nr)++;

        } while (ptep++, addr += PAGE_SIZE, addr != end);

        ret = 1;

pte_unmap:
        pte_unmap(ptem);
        return ret;
}
#else

/*
 * If we can't determine whether or not a pte is special, then fail
 * immediately for ptes. Note, we can still pin HugeTLB and THP as these
 * are guaranteed not to be special.
 *
 * For a futex to be placed on a THP tail page, get_futex_key requires a
 * __get_user_pages_fast implementation that can pin pages. Thus it's still
 * useful to have gup_huge_pmd even if we can't operate on ptes.
 */
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
                         int write, struct page **pages, int *nr)
{
        return 0;
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */

static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        struct page *head, *page, *tail;
        int refs;

        if (write && !pmd_write(orig))
                return 0;

        refs = 0;
        head = pmd_page(orig);
        page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);

        if (!page_cache_add_speculative(head, refs)) {
                *nr -= refs;
                return 0;
        }

        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
                *nr -= refs;
                while (refs--)
                        put_page(head);
                return 0;
        }

        /*
         * Any tail pages need their mapcount reference taken before we
         * return. (This allows the THP code to bump their ref count when
         * they are split into base pages).
         */
        while (refs--) {
                if (PageTail(tail))
                        get_huge_page_tail(tail);
                tail++;
        }

        return 1;
}

static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
                unsigned long end, int write, struct page **pages, int *nr)
{
        struct page *head, *page, *tail;
        int refs;

        if (write && !pud_write(orig))
                return 0;

        refs = 0;
        head = pud_page(orig);
        page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);

        if (!page_cache_add_speculative(head, refs)) {
                *nr -= refs;
                return 0;
        }

        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
                *nr -= refs;
                while (refs--)
                        put_page(head);
                return 0;
        }

        while (refs--) {
                if (PageTail(tail))
                        get_huge_page_tail(tail);
                tail++;
        }

        return 1;
}

static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
                unsigned long end, int write,
                struct page **pages, int *nr)
{
        int refs;
        struct page *head, *page, *tail;

        if (write && !pgd_write(orig))
                return 0;

        refs = 0;
        head = pgd_page(orig);
        page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
        tail = page;
        do {
                VM_BUG_ON_PAGE(compound_head(page) != head, page);
                pages[*nr] = page;
                (*nr)++;
                page++;
                refs++;
        } while (addr += PAGE_SIZE, addr != end);

        if (!page_cache_add_speculative(head, refs)) {
                *nr -= refs;
                return 0;
        }

        if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
                *nr -= refs;
                while (refs--)
                        put_page(head);
                return 0;
        }

        while (refs--) {
                if (PageTail(tail))
                        get_huge_page_tail(tail);
                tail++;
        }

        return 1;
}

static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pmd_t *pmdp;

        pmdp = pmd_offset(&pud, addr);
        do {
                pmd_t pmd = ACCESS_ONCE(*pmdp);

                next = pmd_addr_end(addr, end);
                if (pmd_none(pmd) || pmd_trans_splitting(pmd))
                        return 0;

                if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
                        /*
                         * NUMA hinting faults need to be handled in the GUP
                         * slowpath for accounting purposes and so that they
                         * can be serialised against THP migration.
                         */
                        if (pmd_numa(pmd))
                                return 0;

                        if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
                                        pages, nr))
                                return 0;

                } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
                        /*
                         * Architectures have different formats for the
                         * hugetlbfs pmd and the THP pmd.
                         */
                        if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
                                        PMD_SHIFT, next, write, pages, nr))
                                return 0;
                } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
                        return 0;
        } while (pmdp++, addr = next, addr != end);

        return 1;
}

static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
{
        unsigned long next;
        pud_t *pudp;

        pudp = pud_offset(&pgd, addr);
        do {
                pud_t pud = READ_ONCE(*pudp);

                next = pud_addr_end(addr, end);
                if (pud_none(pud))
                        return 0;
                if (unlikely(pud_huge(pud))) {
                        if (!gup_huge_pud(pud, pudp, addr, next, write,
                                        pages, nr))
                                return 0;
                } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
                        if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
                                        PUD_SHIFT, next, write, pages, nr))
                                return 0;
                } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
                        return 0;
        } while (pudp++, addr = next, addr != end);

        return 1;
}

/*
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
 * back to the regular GUP.  It will only return non-negative values.
 */
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
                          struct page **pages)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr, len, end;
        unsigned long next, flags;
        pgd_t *pgdp;
        int nr = 0;

        start &= PAGE_MASK;
        addr = start;
        len = (unsigned long) nr_pages << PAGE_SHIFT;
        end = start + len;

        if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
                                start, len)))
                return 0;

        /*
         * Disable interrupts.  We use the nested form as we can already have
         * interrupts disabled by get_futex_key.
         *
         * With interrupts disabled, we block page table pages from being
         * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
         * for more details.
         *
         * We do not adopt an rcu_read_lock(.) here as we also want to
         * block IPIs that come from THPs splitting.
         */

        local_irq_save(flags);
        pgdp = pgd_offset(mm, addr);
        do {
                pgd_t pgd = ACCESS_ONCE(*pgdp);

                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        break;
                if (unlikely(pgd_huge(pgd))) {
                        if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
                                          pages, &nr))
                                break;
                } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
                        if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
                                        PGDIR_SHIFT, next, write, pages, &nr))
                                break;
                } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
                        break;
        } while (pgdp++, addr = next, addr != end);
        local_irq_restore(flags);

        return nr;
}

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @write:      whether pages will be written to
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_sem.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages)
{
        struct mm_struct *mm = current->mm;
        int nr, ret;

        start &= PAGE_MASK;
        nr = __get_user_pages_fast(start, nr_pages, write, pages);
        ret = nr;

        if (nr < nr_pages) {
                /* Try to get the remaining pages with get_user_pages */
                start += nr << PAGE_SHIFT;
                pages += nr;

                down_read(&mm->mmap_sem);
                ret = get_user_pages(current, mm, start,
                                     nr_pages - nr, write, 0, pages, NULL);
                up_read(&mm->mmap_sem);

                /* Have to be a bit careful with return values */
                if (nr > 0) {
                        if (ret < 0)
                                ret = nr;
                        else
                                ret += nr;
                }
        }

        return ret;
}

#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
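
/*
 * Illustrative sketch (editorial addition): typical direct-IO style use of
 * get_user_pages_fast(), which needs no mmap_sem from the caller.
 * "user_addr", "npages" and "pages" are hypothetical; error handling is
 * abbreviated.
 *
 *      got = get_user_pages_fast(user_addr, npages, 1, pages);
 *      if (got < 0)
 *              return got;
 *
 *      ... hand the pages to the device or copy through kmap() ...
 *
 *      for (i = 0; i < got; i++) {
 *              set_page_dirty_lock(pages[i]);
 *              put_page(pages[i]);
 *      }
 */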