/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *		Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>

#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#ifndef CONFIG_NEED_MULTIPLE_NODES
/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
struct page *mem_map;

EXPORT_SYMBOL(max_mapnr);
EXPORT_SYMBOL(mem_map);
#endif

unsigned long num_physpages;
/*
 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, the end
 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
 * and ZONE_HIGHMEM.
 */
void * high_memory;

EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
					1;
#else
					2;
#endif

static int __init disable_randmaps(char *s)
{
	randomize_va_space = 0;
	return 1;
}
__setup("norandmaps", disable_randmaps);

unsigned long zero_pfn __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;

/*
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 */
static int __init init_zero_pfn(void)
{
	zero_pfn = page_to_pfn(ZERO_PAGE(0));
	return 0;
}
core_initcall(init_zero_pfn);

/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

void pgd_clear_bad(pgd_t *pgd)
{
	pgd_ERROR(*pgd);
	pgd_clear(pgd);
}

void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}

void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}

/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
			   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	tlb->mm->nr_ptes--;
}

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
}

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void free_pgd_range(struct mmu_gather *tlb,
			unsigned long addr, unsigned long end,
			unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long start;

	/*
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing PMD* at this top level?  Because often
	 * there will be no work to do at all, and we'd prefer not to
	 * go all the way down to the bottom just to discover that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top.
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we must
	 * be careful to reject "the opposite 0" before it confuses the
	 * subsequent tests.  But what about where end is brought down
	 * by PMD_SIZE below? no, end can't go down to 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	start = addr;
	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
		unsigned long floor, unsigned long ceiling)
{
	while (vma) {
		struct vm_area_struct *next = vma->vm_next;
		unsigned long addr = vma->vm_start;

		/*
		 * Hide vma from rmap and truncate_pagecache before freeing
		 * pgtables
		 */
		anon_vma_unlink(vma);
		unlink_file_vma(vma);

		if (is_vm_hugetlb_page(vma)) {
			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		} else {
			/*
			 * Optimization: gather nearby vmas into one call down
			 */
			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
			       && !is_vm_hugetlb_page(next)) {
				vma = next;
				next = vma->vm_next;
				anon_vma_unlink(vma);
				unlink_file_vma(vma);
			}
			free_pgd_range(tlb, addr, vma->vm_end,
				floor, next? next->vm_start: ceiling);
		}
		vma = next;
	}
}

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
	pgtable_t new = pte_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
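	 *
	 * A hedged illustration (a sketch, not code from this file): the
	 * lockless read side that this write barrier pairs with is roughly
	 *
	 *	pmd_t pmd_entry = *pmd;			(snapshot the pmd)
	 *	if (pmd_none(pmd_entry))
	 *		return 0;
	 *	smp_read_barrier_depends();		(no-op except on alpha)
	 *	pte = pte_offset_map(&pmd_entry, addr);	(data-dependent load)
	 *
	 * so ordering on the read side comes for free on everything but
	 * alpha, provided the writer issues the smp_wmb() below.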
345 */ 346 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 347 348 spin_lock(&mm->page_table_lock); 349 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 350 mm->nr_ptes++; 351 pmd_populate(mm, pmd, new); 352 new = NULL; 353 } 354 spin_unlock(&mm->page_table_lock); 355 if (new) 356 pte_free(mm, new); 357 return 0; 358 } 359 360 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) 361 { 362 pte_t *new = pte_alloc_one_kernel(&init_mm, address); 363 if (!new) 364 return -ENOMEM; 365 366 smp_wmb(); /* See comment in __pte_alloc */ 367 368 spin_lock(&init_mm.page_table_lock); 369 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 370 pmd_populate_kernel(&init_mm, pmd, new); 371 new = NULL; 372 } 373 spin_unlock(&init_mm.page_table_lock); 374 if (new) 375 pte_free_kernel(&init_mm, new); 376 return 0; 377 } 378 379 static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) 380 { 381 if (file_rss) 382 add_mm_counter(mm, file_rss, file_rss); 383 if (anon_rss) 384 add_mm_counter(mm, anon_rss, anon_rss); 385 } 386 387 /* 388 * This function is called to print an error when a bad pte 389 * is found. For example, we might have a PFN-mapped pte in 390 * a region that doesn't allow it. 391 * 392 * The calling function must still handle the error. 393 */ 394 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, 395 pte_t pte, struct page *page) 396 { 397 pgd_t *pgd = pgd_offset(vma->vm_mm, addr); 398 pud_t *pud = pud_offset(pgd, addr); 399 pmd_t *pmd = pmd_offset(pud, addr); 400 struct address_space *mapping; 401 pgoff_t index; 402 static unsigned long resume; 403 static unsigned long nr_shown; 404 static unsigned long nr_unshown; 405 406 /* 407 * Allow a burst of 60 reports, then keep quiet for that minute; 408 * or allow a steady drip of one report per second. 409 */ 410 if (nr_shown == 60) { 411 if (time_before(jiffies, resume)) { 412 nr_unshown++; 413 return; 414 } 415 if (nr_unshown) { 416 printk(KERN_ALERT 417 "BUG: Bad page map: %lu messages suppressed\n", 418 nr_unshown); 419 nr_unshown = 0; 420 } 421 nr_shown = 0; 422 } 423 if (nr_shown++ == 0) 424 resume = jiffies + 60 * HZ; 425 426 mapping = vma->vm_file ? 
vma->vm_file->f_mapping : NULL; 427 index = linear_page_index(vma, addr); 428 429 printk(KERN_ALERT 430 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", 431 current->comm, 432 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 433 if (page) { 434 printk(KERN_ALERT 435 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", 436 page, (void *)page->flags, page_count(page), 437 page_mapcount(page), page->mapping, page->index); 438 } 439 printk(KERN_ALERT 440 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 441 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 442 /* 443 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y 444 */ 445 if (vma->vm_ops) 446 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", 447 (unsigned long)vma->vm_ops->fault); 448 if (vma->vm_file && vma->vm_file->f_op) 449 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", 450 (unsigned long)vma->vm_file->f_op->mmap); 451 dump_stack(); 452 add_taint(TAINT_BAD_PAGE); 453 } 454 455 static inline int is_cow_mapping(unsigned int flags) 456 { 457 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 458 } 459 460 #ifndef is_zero_pfn 461 static inline int is_zero_pfn(unsigned long pfn) 462 { 463 return pfn == zero_pfn; 464 } 465 #endif 466 467 #ifndef my_zero_pfn 468 static inline unsigned long my_zero_pfn(unsigned long addr) 469 { 470 return zero_pfn; 471 } 472 #endif 473 474 /* 475 * vm_normal_page -- This function gets the "struct page" associated with a pte. 476 * 477 * "Special" mappings do not wish to be associated with a "struct page" (either 478 * it doesn't exist, or it exists but they don't want to touch it). In this 479 * case, NULL is returned here. "Normal" mappings do have a struct page. 480 * 481 * There are 2 broad cases. Firstly, an architecture may define a pte_special() 482 * pte bit, in which case this function is trivial. Secondly, an architecture 483 * may not have a spare pte bit, which requires a more complicated scheme, 484 * described below. 485 * 486 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a 487 * special mapping (even if there are underlying and valid "struct pages"). 488 * COWed pages of a VM_PFNMAP are always normal. 489 * 490 * The way we recognize COWed pages within VM_PFNMAP mappings is through the 491 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit 492 * set, and the vm_pgoff will point to the first PFN mapped: thus every special 493 * mapping will always honor the rule 494 * 495 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) 496 * 497 * And for normal mappings this is false. 498 * 499 * This restricts such mappings to be a linear translation from virtual address 500 * to pfn. To get around this restriction, we allow arbitrary mappings so long 501 * as the vma is not a COW mapping; in that case, we know that all ptes are 502 * special (because none can have been COWed). 503 * 504 * 505 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. 506 * 507 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct 508 * page" backing, however the difference is that _all_ pages with a struct 509 * page (that is, those where pfn_valid is true) are refcounted and considered 510 * normal pages by the VM. The disadvantage is that pages are refcounted 511 * (which can be slower and simply not an option for some PFNMAP users). 
The 512 * advantage is that we don't have to follow the strict linearity rule of 513 * PFNMAP mappings in order to support COWable mappings. 514 * 515 */ 516 #ifdef __HAVE_ARCH_PTE_SPECIAL 517 # define HAVE_PTE_SPECIAL 1 518 #else 519 # define HAVE_PTE_SPECIAL 0 520 #endif 521 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 522 pte_t pte) 523 { 524 unsigned long pfn = pte_pfn(pte); 525 526 if (HAVE_PTE_SPECIAL) { 527 if (likely(!pte_special(pte))) 528 goto check_pfn; 529 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) 530 return NULL; 531 if (!is_zero_pfn(pfn)) 532 print_bad_pte(vma, addr, pte, NULL); 533 return NULL; 534 } 535 536 /* !HAVE_PTE_SPECIAL case follows: */ 537 538 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { 539 if (vma->vm_flags & VM_MIXEDMAP) { 540 if (!pfn_valid(pfn)) 541 return NULL; 542 goto out; 543 } else { 544 unsigned long off; 545 off = (addr - vma->vm_start) >> PAGE_SHIFT; 546 if (pfn == vma->vm_pgoff + off) 547 return NULL; 548 if (!is_cow_mapping(vma->vm_flags)) 549 return NULL; 550 } 551 } 552 553 if (is_zero_pfn(pfn)) 554 return NULL; 555 check_pfn: 556 if (unlikely(pfn > highest_memmap_pfn)) { 557 print_bad_pte(vma, addr, pte, NULL); 558 return NULL; 559 } 560 561 /* 562 * NOTE! We still have PageReserved() pages in the page tables. 563 * eg. VDSO mappings can cause them to exist. 564 */ 565 out: 566 return pfn_to_page(pfn); 567 } 568 569 /* 570 * copy one vm_area from one task to the other. Assumes the page tables 571 * already present in the new task to be cleared in the whole range 572 * covered by this vma. 573 */ 574 575 static inline void 576 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 577 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, 578 unsigned long addr, int *rss) 579 { 580 unsigned long vm_flags = vma->vm_flags; 581 pte_t pte = *src_pte; 582 struct page *page; 583 584 /* pte contains position in swap or file, so copy. */ 585 if (unlikely(!pte_present(pte))) { 586 if (!pte_file(pte)) { 587 swp_entry_t entry = pte_to_swp_entry(pte); 588 589 swap_duplicate(entry); 590 /* make sure dst_mm is on swapoff's mmlist. */ 591 if (unlikely(list_empty(&dst_mm->mmlist))) { 592 spin_lock(&mmlist_lock); 593 if (list_empty(&dst_mm->mmlist)) 594 list_add(&dst_mm->mmlist, 595 &src_mm->mmlist); 596 spin_unlock(&mmlist_lock); 597 } 598 if (is_write_migration_entry(entry) && 599 is_cow_mapping(vm_flags)) { 600 /* 601 * COW mappings require pages in both parent 602 * and child to be set to read. 
603 */ 604 make_migration_entry_read(&entry); 605 pte = swp_entry_to_pte(entry); 606 set_pte_at(src_mm, addr, src_pte, pte); 607 } 608 } 609 goto out_set_pte; 610 } 611 612 /* 613 * If it's a COW mapping, write protect it both 614 * in the parent and the child 615 */ 616 if (is_cow_mapping(vm_flags)) { 617 ptep_set_wrprotect(src_mm, addr, src_pte); 618 pte = pte_wrprotect(pte); 619 } 620 621 /* 622 * If it's a shared mapping, mark it clean in 623 * the child 624 */ 625 if (vm_flags & VM_SHARED) 626 pte = pte_mkclean(pte); 627 pte = pte_mkold(pte); 628 629 page = vm_normal_page(vma, addr, pte); 630 if (page) { 631 get_page(page); 632 page_dup_rmap(page); 633 rss[PageAnon(page)]++; 634 } 635 636 out_set_pte: 637 set_pte_at(dst_mm, addr, dst_pte, pte); 638 } 639 640 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 641 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 642 unsigned long addr, unsigned long end) 643 { 644 pte_t *orig_src_pte, *orig_dst_pte; 645 pte_t *src_pte, *dst_pte; 646 spinlock_t *src_ptl, *dst_ptl; 647 int progress = 0; 648 int rss[2]; 649 650 again: 651 rss[1] = rss[0] = 0; 652 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 653 if (!dst_pte) 654 return -ENOMEM; 655 src_pte = pte_offset_map_nested(src_pmd, addr); 656 src_ptl = pte_lockptr(src_mm, src_pmd); 657 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 658 orig_src_pte = src_pte; 659 orig_dst_pte = dst_pte; 660 arch_enter_lazy_mmu_mode(); 661 662 do { 663 /* 664 * We are holding two locks at this point - either of them 665 * could generate latencies in another task on another CPU. 666 */ 667 if (progress >= 32) { 668 progress = 0; 669 if (need_resched() || 670 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) 671 break; 672 } 673 if (pte_none(*src_pte)) { 674 progress++; 675 continue; 676 } 677 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); 678 progress += 8; 679 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 680 681 arch_leave_lazy_mmu_mode(); 682 spin_unlock(src_ptl); 683 pte_unmap_nested(orig_src_pte); 684 add_mm_rss(dst_mm, rss[0], rss[1]); 685 pte_unmap_unlock(orig_dst_pte, dst_ptl); 686 cond_resched(); 687 if (addr != end) 688 goto again; 689 return 0; 690 } 691 692 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 693 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, 694 unsigned long addr, unsigned long end) 695 { 696 pmd_t *src_pmd, *dst_pmd; 697 unsigned long next; 698 699 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); 700 if (!dst_pmd) 701 return -ENOMEM; 702 src_pmd = pmd_offset(src_pud, addr); 703 do { 704 next = pmd_addr_end(addr, end); 705 if (pmd_none_or_clear_bad(src_pmd)) 706 continue; 707 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 708 vma, addr, next)) 709 return -ENOMEM; 710 } while (dst_pmd++, src_pmd++, addr = next, addr != end); 711 return 0; 712 } 713 714 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 715 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, 716 unsigned long addr, unsigned long end) 717 { 718 pud_t *src_pud, *dst_pud; 719 unsigned long next; 720 721 dst_pud = pud_alloc(dst_mm, dst_pgd, addr); 722 if (!dst_pud) 723 return -ENOMEM; 724 src_pud = pud_offset(src_pgd, addr); 725 do { 726 next = pud_addr_end(addr, end); 727 if (pud_none_or_clear_bad(src_pud)) 728 continue; 729 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, 730 vma, addr, next)) 731 return -ENOMEM; 732 } while (dst_pud++, 
src_pud++, addr = next, addr != end); 733 return 0; 734 } 735 736 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 737 struct vm_area_struct *vma) 738 { 739 pgd_t *src_pgd, *dst_pgd; 740 unsigned long next; 741 unsigned long addr = vma->vm_start; 742 unsigned long end = vma->vm_end; 743 int ret; 744 745 /* 746 * Don't copy ptes where a page fault will fill them correctly. 747 * Fork becomes much lighter when there are big shared or private 748 * readonly mappings. The tradeoff is that copy_page_range is more 749 * efficient than faulting. 750 */ 751 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { 752 if (!vma->anon_vma) 753 return 0; 754 } 755 756 if (is_vm_hugetlb_page(vma)) 757 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 758 759 if (unlikely(is_pfn_mapping(vma))) { 760 /* 761 * We do not free on error cases below as remove_vma 762 * gets called on error from higher level routine 763 */ 764 ret = track_pfn_vma_copy(vma); 765 if (ret) 766 return ret; 767 } 768 769 /* 770 * We need to invalidate the secondary MMU mappings only when 771 * there could be a permission downgrade on the ptes of the 772 * parent mm. And a permission downgrade will only happen if 773 * is_cow_mapping() returns true. 774 */ 775 if (is_cow_mapping(vma->vm_flags)) 776 mmu_notifier_invalidate_range_start(src_mm, addr, end); 777 778 ret = 0; 779 dst_pgd = pgd_offset(dst_mm, addr); 780 src_pgd = pgd_offset(src_mm, addr); 781 do { 782 next = pgd_addr_end(addr, end); 783 if (pgd_none_or_clear_bad(src_pgd)) 784 continue; 785 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, 786 vma, addr, next))) { 787 ret = -ENOMEM; 788 break; 789 } 790 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 791 792 if (is_cow_mapping(vma->vm_flags)) 793 mmu_notifier_invalidate_range_end(src_mm, 794 vma->vm_start, end); 795 return ret; 796 } 797 798 static unsigned long zap_pte_range(struct mmu_gather *tlb, 799 struct vm_area_struct *vma, pmd_t *pmd, 800 unsigned long addr, unsigned long end, 801 long *zap_work, struct zap_details *details) 802 { 803 struct mm_struct *mm = tlb->mm; 804 pte_t *pte; 805 spinlock_t *ptl; 806 int file_rss = 0; 807 int anon_rss = 0; 808 809 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 810 arch_enter_lazy_mmu_mode(); 811 do { 812 pte_t ptent = *pte; 813 if (pte_none(ptent)) { 814 (*zap_work)--; 815 continue; 816 } 817 818 (*zap_work) -= PAGE_SIZE; 819 820 if (pte_present(ptent)) { 821 struct page *page; 822 823 page = vm_normal_page(vma, addr, ptent); 824 if (unlikely(details) && page) { 825 /* 826 * unmap_shared_mapping_pages() wants to 827 * invalidate cache without truncating: 828 * unmap shared but keep private pages. 829 */ 830 if (details->check_mapping && 831 details->check_mapping != page->mapping) 832 continue; 833 /* 834 * Each page->index must be checked when 835 * invalidating or truncating nonlinear. 
836 */ 837 if (details->nonlinear_vma && 838 (page->index < details->first_index || 839 page->index > details->last_index)) 840 continue; 841 } 842 ptent = ptep_get_and_clear_full(mm, addr, pte, 843 tlb->fullmm); 844 tlb_remove_tlb_entry(tlb, pte, addr); 845 if (unlikely(!page)) 846 continue; 847 if (unlikely(details) && details->nonlinear_vma 848 && linear_page_index(details->nonlinear_vma, 849 addr) != page->index) 850 set_pte_at(mm, addr, pte, 851 pgoff_to_pte(page->index)); 852 if (PageAnon(page)) 853 anon_rss--; 854 else { 855 if (pte_dirty(ptent)) 856 set_page_dirty(page); 857 if (pte_young(ptent) && 858 likely(!VM_SequentialReadHint(vma))) 859 mark_page_accessed(page); 860 file_rss--; 861 } 862 page_remove_rmap(page); 863 if (unlikely(page_mapcount(page) < 0)) 864 print_bad_pte(vma, addr, ptent, page); 865 tlb_remove_page(tlb, page); 866 continue; 867 } 868 /* 869 * If details->check_mapping, we leave swap entries; 870 * if details->nonlinear_vma, we leave file entries. 871 */ 872 if (unlikely(details)) 873 continue; 874 if (pte_file(ptent)) { 875 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) 876 print_bad_pte(vma, addr, ptent, NULL); 877 } else if 878 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) 879 print_bad_pte(vma, addr, ptent, NULL); 880 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 881 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 882 883 add_mm_rss(mm, file_rss, anon_rss); 884 arch_leave_lazy_mmu_mode(); 885 pte_unmap_unlock(pte - 1, ptl); 886 887 return addr; 888 } 889 890 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, 891 struct vm_area_struct *vma, pud_t *pud, 892 unsigned long addr, unsigned long end, 893 long *zap_work, struct zap_details *details) 894 { 895 pmd_t *pmd; 896 unsigned long next; 897 898 pmd = pmd_offset(pud, addr); 899 do { 900 next = pmd_addr_end(addr, end); 901 if (pmd_none_or_clear_bad(pmd)) { 902 (*zap_work)--; 903 continue; 904 } 905 next = zap_pte_range(tlb, vma, pmd, addr, next, 906 zap_work, details); 907 } while (pmd++, addr = next, (addr != end && *zap_work > 0)); 908 909 return addr; 910 } 911 912 static inline unsigned long zap_pud_range(struct mmu_gather *tlb, 913 struct vm_area_struct *vma, pgd_t *pgd, 914 unsigned long addr, unsigned long end, 915 long *zap_work, struct zap_details *details) 916 { 917 pud_t *pud; 918 unsigned long next; 919 920 pud = pud_offset(pgd, addr); 921 do { 922 next = pud_addr_end(addr, end); 923 if (pud_none_or_clear_bad(pud)) { 924 (*zap_work)--; 925 continue; 926 } 927 next = zap_pmd_range(tlb, vma, pud, addr, next, 928 zap_work, details); 929 } while (pud++, addr = next, (addr != end && *zap_work > 0)); 930 931 return addr; 932 } 933 934 static unsigned long unmap_page_range(struct mmu_gather *tlb, 935 struct vm_area_struct *vma, 936 unsigned long addr, unsigned long end, 937 long *zap_work, struct zap_details *details) 938 { 939 pgd_t *pgd; 940 unsigned long next; 941 942 if (details && !details->check_mapping && !details->nonlinear_vma) 943 details = NULL; 944 945 BUG_ON(addr >= end); 946 tlb_start_vma(tlb, vma); 947 pgd = pgd_offset(vma->vm_mm, addr); 948 do { 949 next = pgd_addr_end(addr, end); 950 if (pgd_none_or_clear_bad(pgd)) { 951 (*zap_work)--; 952 continue; 953 } 954 next = zap_pud_range(tlb, vma, pgd, addr, next, 955 zap_work, details); 956 } while (pgd++, addr = next, (addr != end && *zap_work > 0)); 957 tlb_end_vma(tlb, vma); 958 959 return addr; 960 } 961 962 #ifdef CONFIG_PREEMPT 963 # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) 964 #else 
965 /* No preempt: go for improved straight-line efficiency */ 966 # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) 967 #endif 968 969 /** 970 * unmap_vmas - unmap a range of memory covered by a list of vma's 971 * @tlbp: address of the caller's struct mmu_gather 972 * @vma: the starting vma 973 * @start_addr: virtual address at which to start unmapping 974 * @end_addr: virtual address at which to end unmapping 975 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here 976 * @details: details of nonlinear truncation or shared cache invalidation 977 * 978 * Returns the end address of the unmapping (restart addr if interrupted). 979 * 980 * Unmap all pages in the vma list. 981 * 982 * We aim to not hold locks for too long (for scheduling latency reasons). 983 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to 984 * return the ending mmu_gather to the caller. 985 * 986 * Only addresses between `start' and `end' will be unmapped. 987 * 988 * The VMA list must be sorted in ascending virtual address order. 989 * 990 * unmap_vmas() assumes that the caller will flush the whole unmapped address 991 * range after unmap_vmas() returns. So the only responsibility here is to 992 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 993 * drops the lock and schedules. 994 */ 995 unsigned long unmap_vmas(struct mmu_gather **tlbp, 996 struct vm_area_struct *vma, unsigned long start_addr, 997 unsigned long end_addr, unsigned long *nr_accounted, 998 struct zap_details *details) 999 { 1000 long zap_work = ZAP_BLOCK_SIZE; 1001 unsigned long tlb_start = 0; /* For tlb_finish_mmu */ 1002 int tlb_start_valid = 0; 1003 unsigned long start = start_addr; 1004 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; 1005 int fullmm = (*tlbp)->fullmm; 1006 struct mm_struct *mm = vma->vm_mm; 1007 1008 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1009 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 1010 unsigned long end; 1011 1012 start = max(vma->vm_start, start_addr); 1013 if (start >= vma->vm_end) 1014 continue; 1015 end = min(vma->vm_end, end_addr); 1016 if (end <= vma->vm_start) 1017 continue; 1018 1019 if (vma->vm_flags & VM_ACCOUNT) 1020 *nr_accounted += (end - start) >> PAGE_SHIFT; 1021 1022 if (unlikely(is_pfn_mapping(vma))) 1023 untrack_pfn_vma(vma, 0, 0); 1024 1025 while (start != end) { 1026 if (!tlb_start_valid) { 1027 tlb_start = start; 1028 tlb_start_valid = 1; 1029 } 1030 1031 if (unlikely(is_vm_hugetlb_page(vma))) { 1032 /* 1033 * It is undesirable to test vma->vm_file as it 1034 * should be non-null for valid hugetlb area. 1035 * However, vm_file will be NULL in the error 1036 * cleanup path of do_mmap_pgoff. When 1037 * hugetlbfs ->mmap method fails, 1038 * do_mmap_pgoff() nullifies vma->vm_file 1039 * before calling this function to clean up. 1040 * Since no pte has actually been setup, it is 1041 * safe to do nothing in this case. 
				 */
				if (vma->vm_file) {
					unmap_hugepage_range(vma, start, end, NULL);
					zap_work -= (end - start) /
					pages_per_huge_page(hstate_vma(vma));
				}

				start = end;
			} else
				start = unmap_page_range(*tlbp, vma,
						start, end, &zap_work, details);

			if (zap_work > 0) {
				BUG_ON(start != end);
				break;
			}

			tlb_finish_mmu(*tlbp, tlb_start, start);

			if (need_resched() ||
				(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
				if (i_mmap_lock) {
					*tlbp = NULL;
					goto out;
				}
				cond_resched();
			}

			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
			tlb_start_valid = 0;
			zap_work = ZAP_BLOCK_SIZE;
		}
	}
out:
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
	return start;	/* which is now the end (or restart) address */
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 */
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather *tlb;
	unsigned long end = address + size;
	unsigned long nr_accounted = 0;

	lru_add_drain();
	tlb = tlb_gather_mmu(mm, 0);
	update_hiwater_rss(mm);
	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
	if (tlb)
		tlb_finish_mmu(tlb, address, end);
	return end;
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful.
 */
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	if (address < vma->vm_start || address + size > vma->vm_end ||
			!(vma->vm_flags & VM_PFNMAP))
		return -1;
	zap_page_range(vma, address, size, NULL);
	return 0;
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

/*
 * Do a quick page-table lookup for a single page.
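 *
 * The flags are FOLL_* bits. As a hedged sketch of typical use (the
 * real caller in this file is __get_user_pages() below), a lookup that
 * wants its own reference on the page, with mmap_sem held, looks like
 *
 *	page = follow_page(vma, address, FOLL_GET | FOLL_TOUCH);
 *	if (page && !IS_ERR(page)) {
 *		... use the page ...
 *		put_page(page);
 *	}
 *
 * where a NULL or ERR_PTR() return means no page could be returned.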
1129 */ 1130 struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1131 unsigned int flags) 1132 { 1133 pgd_t *pgd; 1134 pud_t *pud; 1135 pmd_t *pmd; 1136 pte_t *ptep, pte; 1137 spinlock_t *ptl; 1138 struct page *page; 1139 struct mm_struct *mm = vma->vm_mm; 1140 1141 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 1142 if (!IS_ERR(page)) { 1143 BUG_ON(flags & FOLL_GET); 1144 goto out; 1145 } 1146 1147 page = NULL; 1148 pgd = pgd_offset(mm, address); 1149 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 1150 goto no_page_table; 1151 1152 pud = pud_offset(pgd, address); 1153 if (pud_none(*pud)) 1154 goto no_page_table; 1155 if (pud_huge(*pud)) { 1156 BUG_ON(flags & FOLL_GET); 1157 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1158 goto out; 1159 } 1160 if (unlikely(pud_bad(*pud))) 1161 goto no_page_table; 1162 1163 pmd = pmd_offset(pud, address); 1164 if (pmd_none(*pmd)) 1165 goto no_page_table; 1166 if (pmd_huge(*pmd)) { 1167 BUG_ON(flags & FOLL_GET); 1168 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1169 goto out; 1170 } 1171 if (unlikely(pmd_bad(*pmd))) 1172 goto no_page_table; 1173 1174 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 1175 1176 pte = *ptep; 1177 if (!pte_present(pte)) 1178 goto no_page; 1179 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1180 goto unlock; 1181 1182 page = vm_normal_page(vma, address, pte); 1183 if (unlikely(!page)) { 1184 if ((flags & FOLL_DUMP) || 1185 !is_zero_pfn(pte_pfn(pte))) 1186 goto bad_page; 1187 page = pte_page(pte); 1188 } 1189 1190 if (flags & FOLL_GET) 1191 get_page(page); 1192 if (flags & FOLL_TOUCH) { 1193 if ((flags & FOLL_WRITE) && 1194 !pte_dirty(pte) && !PageDirty(page)) 1195 set_page_dirty(page); 1196 /* 1197 * pte_mkyoung() would be more correct here, but atomic care 1198 * is needed to avoid losing the dirty bit: it is easier to use 1199 * mark_page_accessed(). 1200 */ 1201 mark_page_accessed(page); 1202 } 1203 unlock: 1204 pte_unmap_unlock(ptep, ptl); 1205 out: 1206 return page; 1207 1208 bad_page: 1209 pte_unmap_unlock(ptep, ptl); 1210 return ERR_PTR(-EFAULT); 1211 1212 no_page: 1213 pte_unmap_unlock(ptep, ptl); 1214 if (!pte_none(pte)) 1215 return page; 1216 1217 no_page_table: 1218 /* 1219 * When core dumping an enormous anonymous area that nobody 1220 * has touched so far, we don't want to allocate unnecessary pages or 1221 * page tables. Return error instead of NULL to skip handle_mm_fault, 1222 * then get_dump_page() will return NULL to leave a hole in the dump. 1223 * But we can only make this optimization where a hole would surely 1224 * be zero-filled if handle_mm_fault() actually did handle it. 1225 */ 1226 if ((flags & FOLL_DUMP) && 1227 (!vma->vm_ops || !vma->vm_ops->fault)) 1228 return ERR_PTR(-EFAULT); 1229 return page; 1230 } 1231 1232 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1233 unsigned long start, int nr_pages, unsigned int gup_flags, 1234 struct page **pages, struct vm_area_struct **vmas) 1235 { 1236 int i; 1237 unsigned long vm_flags; 1238 1239 if (nr_pages <= 0) 1240 return 0; 1241 1242 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); 1243 1244 /* 1245 * Require read or write permissions. 1246 * If FOLL_FORCE is set, we only require the "MAY" flags. 1247 */ 1248 vm_flags = (gup_flags & FOLL_WRITE) ? 1249 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1250 vm_flags &= (gup_flags & FOLL_FORCE) ? 
1251 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1252 i = 0; 1253 1254 do { 1255 struct vm_area_struct *vma; 1256 1257 vma = find_extend_vma(mm, start); 1258 if (!vma && in_gate_area(tsk, start)) { 1259 unsigned long pg = start & PAGE_MASK; 1260 struct vm_area_struct *gate_vma = get_gate_vma(tsk); 1261 pgd_t *pgd; 1262 pud_t *pud; 1263 pmd_t *pmd; 1264 pte_t *pte; 1265 1266 /* user gate pages are read-only */ 1267 if (gup_flags & FOLL_WRITE) 1268 return i ? : -EFAULT; 1269 if (pg > TASK_SIZE) 1270 pgd = pgd_offset_k(pg); 1271 else 1272 pgd = pgd_offset_gate(mm, pg); 1273 BUG_ON(pgd_none(*pgd)); 1274 pud = pud_offset(pgd, pg); 1275 BUG_ON(pud_none(*pud)); 1276 pmd = pmd_offset(pud, pg); 1277 if (pmd_none(*pmd)) 1278 return i ? : -EFAULT; 1279 pte = pte_offset_map(pmd, pg); 1280 if (pte_none(*pte)) { 1281 pte_unmap(pte); 1282 return i ? : -EFAULT; 1283 } 1284 if (pages) { 1285 struct page *page = vm_normal_page(gate_vma, start, *pte); 1286 pages[i] = page; 1287 if (page) 1288 get_page(page); 1289 } 1290 pte_unmap(pte); 1291 if (vmas) 1292 vmas[i] = gate_vma; 1293 i++; 1294 start += PAGE_SIZE; 1295 nr_pages--; 1296 continue; 1297 } 1298 1299 if (!vma || 1300 (vma->vm_flags & (VM_IO | VM_PFNMAP)) || 1301 !(vm_flags & vma->vm_flags)) 1302 return i ? : -EFAULT; 1303 1304 if (is_vm_hugetlb_page(vma)) { 1305 i = follow_hugetlb_page(mm, vma, pages, vmas, 1306 &start, &nr_pages, i, gup_flags); 1307 continue; 1308 } 1309 1310 do { 1311 struct page *page; 1312 unsigned int foll_flags = gup_flags; 1313 1314 /* 1315 * If we have a pending SIGKILL, don't keep faulting 1316 * pages and potentially allocating memory. 1317 */ 1318 if (unlikely(fatal_signal_pending(current))) 1319 return i ? i : -ERESTARTSYS; 1320 1321 cond_resched(); 1322 while (!(page = follow_page(vma, start, foll_flags))) { 1323 int ret; 1324 1325 ret = handle_mm_fault(mm, vma, start, 1326 (foll_flags & FOLL_WRITE) ? 1327 FAULT_FLAG_WRITE : 0); 1328 1329 if (ret & VM_FAULT_ERROR) { 1330 if (ret & VM_FAULT_OOM) 1331 return i ? i : -ENOMEM; 1332 if (ret & 1333 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) 1334 return i ? i : -EFAULT; 1335 BUG(); 1336 } 1337 if (ret & VM_FAULT_MAJOR) 1338 tsk->maj_flt++; 1339 else 1340 tsk->min_flt++; 1341 1342 /* 1343 * The VM_FAULT_WRITE bit tells us that 1344 * do_wp_page has broken COW when necessary, 1345 * even if maybe_mkwrite decided not to set 1346 * pte_write. We can thus safely do subsequent 1347 * page lookups as if they were reads. But only 1348 * do so when looping for pte_write is futile: 1349 * in some cases userspace may also be wanting 1350 * to write to the gotten user page, which a 1351 * read fault here might prevent (a readonly 1352 * page might get reCOWed by userspace write). 1353 */ 1354 if ((ret & VM_FAULT_WRITE) && 1355 !(vma->vm_flags & VM_WRITE)) 1356 foll_flags &= ~FOLL_WRITE; 1357 1358 cond_resched(); 1359 } 1360 if (IS_ERR(page)) 1361 return i ? 
i : PTR_ERR(page);

			if (pages) {
				pages[i] = page;

				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
			}
			if (vmas)
				vmas[i] = vma;
			i++;
			start += PAGE_SIZE;
			nr_pages--;
		} while (nr_pages && start < vma->vm_end);
	} while (nr_pages);
	return i;
}

/**
 * get_user_pages() - pin user pages in memory
 * @tsk:	task_struct of target task
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @write:	whether pages will be written to by the caller
 * @force:	whether to force write access even if user mapping is
 *		readonly. This will result in the page being COWed even
 *		in MAP_SHARED mappings. You do not want this.
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long. Or NULL, if caller
 *		only intends to ensure the pages are faulted in.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno. Each page returned must be released
 * with a put_page() call when it is finished with. vmas will only
 * remain valid while mmap_sem is held.
 *
 * Must be called with mmap_sem held for read or write.
 *
 * get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If write=0, the page must not be written to. If the page is written to,
 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
 * after the page is finished with, and before put_page is called.
 *
 * get_user_pages is typically used for fewer-copy IO operations, to get a
 * handle on the memory by some means other than accesses via the user virtual
 * addresses. The pages may be submitted for DMA to devices or accessed via
 * their kernel linear mapping (via the kmap APIs). Care should be taken to
 * use the correct cache flushing APIs.
 *
 * See also get_user_pages_fast, for performance critical applications.
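 *
 * As a hedged sketch of the usual calling pattern (the local names here
 * are illustrative, not taken from this file): a driver pinning a user
 * buffer for DMA, then dirtying and releasing it, does roughly
 *
 *	down_read(&current->mm->mmap_sem);
 *	got = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
 *			     nr_pages, 1, 0, pages, NULL);
 *	up_read(&current->mm->mmap_sem);
 *	if (got < 0)
 *		return got;
 *	... DMA into the pinned pages; got may be fewer than nr_pages ...
 *	for (i = 0; i < got; i++) {
 *		set_page_dirty_lock(pages[i]);
 *		put_page(pages[i]);
 *	}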
1427 */ 1428 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1429 unsigned long start, int nr_pages, int write, int force, 1430 struct page **pages, struct vm_area_struct **vmas) 1431 { 1432 int flags = FOLL_TOUCH; 1433 1434 if (pages) 1435 flags |= FOLL_GET; 1436 if (write) 1437 flags |= FOLL_WRITE; 1438 if (force) 1439 flags |= FOLL_FORCE; 1440 1441 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 1442 } 1443 EXPORT_SYMBOL(get_user_pages); 1444 1445 /** 1446 * get_dump_page() - pin user page in memory while writing it to core dump 1447 * @addr: user address 1448 * 1449 * Returns struct page pointer of user page pinned for dump, 1450 * to be freed afterwards by page_cache_release() or put_page(). 1451 * 1452 * Returns NULL on any kind of failure - a hole must then be inserted into 1453 * the corefile, to preserve alignment with its headers; and also returns 1454 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - 1455 * allowing a hole to be left in the corefile to save diskspace. 1456 * 1457 * Called without mmap_sem, but after all other threads have been killed. 1458 */ 1459 #ifdef CONFIG_ELF_CORE 1460 struct page *get_dump_page(unsigned long addr) 1461 { 1462 struct vm_area_struct *vma; 1463 struct page *page; 1464 1465 if (__get_user_pages(current, current->mm, addr, 1, 1466 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) 1467 return NULL; 1468 flush_cache_page(vma, addr, page_to_pfn(page)); 1469 return page; 1470 } 1471 #endif /* CONFIG_ELF_CORE */ 1472 1473 pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1474 spinlock_t **ptl) 1475 { 1476 pgd_t * pgd = pgd_offset(mm, addr); 1477 pud_t * pud = pud_alloc(mm, pgd, addr); 1478 if (pud) { 1479 pmd_t * pmd = pmd_alloc(mm, pud, addr); 1480 if (pmd) 1481 return pte_alloc_map_lock(mm, pmd, addr, ptl); 1482 } 1483 return NULL; 1484 } 1485 1486 /* 1487 * This is the old fallback for page remapping. 1488 * 1489 * For historical reasons, it only allows reserved pages. Only 1490 * old drivers should use this, and they needed to mark their 1491 * pages reserved for the old functions anyway. 1492 */ 1493 static int insert_page(struct vm_area_struct *vma, unsigned long addr, 1494 struct page *page, pgprot_t prot) 1495 { 1496 struct mm_struct *mm = vma->vm_mm; 1497 int retval; 1498 pte_t *pte; 1499 spinlock_t *ptl; 1500 1501 retval = -EINVAL; 1502 if (PageAnon(page)) 1503 goto out; 1504 retval = -ENOMEM; 1505 flush_dcache_page(page); 1506 pte = get_locked_pte(mm, addr, &ptl); 1507 if (!pte) 1508 goto out; 1509 retval = -EBUSY; 1510 if (!pte_none(*pte)) 1511 goto out_unlock; 1512 1513 /* Ok, finally just insert the thing.. */ 1514 get_page(page); 1515 inc_mm_counter(mm, file_rss); 1516 page_add_file_rmap(page); 1517 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1518 1519 retval = 0; 1520 pte_unmap_unlock(pte, ptl); 1521 return retval; 1522 out_unlock: 1523 pte_unmap_unlock(pte, ptl); 1524 out: 1525 return retval; 1526 } 1527 1528 /** 1529 * vm_insert_page - insert single page into user vma 1530 * @vma: user vma to map to 1531 * @addr: target user address of this page 1532 * @page: source kernel page 1533 * 1534 * This allows drivers to insert individual pages they've allocated 1535 * into a user vma. 1536 * 1537 * The page has to be a nice clean _individual_ kernel allocation. 1538 * If you allocate a compound page, you need to have marked it as 1539 * such (__GFP_COMP), or manually just split the page up yourself 1540 * (see split_page()). 1541 * 1542 * NOTE! 
Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page)
{
	if (addr < vma->vm_start || addr >= vma->vm_end)
		return -EFAULT;
	if (!page_count(page))
		return -EINVAL;
	vma->vm_flags |= VM_INSERTPAGE;
	return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);

static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn, pgprot_t prot)
{
	struct mm_struct *mm = vma->vm_mm;
	int retval;
	pte_t *pte, entry;
	spinlock_t *ptl;

	retval = -ENOMEM;
	pte = get_locked_pte(mm, addr, &ptl);
	if (!pte)
		goto out;
	retval = -EBUSY;
	if (!pte_none(*pte))
		goto out_unlock;

	/* Ok, finally just insert the thing.. */
	entry = pte_mkspecial(pfn_pte(pfn, prot));
	set_pte_at(mm, addr, pte, entry);
	update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */

	retval = 0;
out_unlock:
	pte_unmap_unlock(pte, ptl);
out:
	return retval;
}

/**
 * vm_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return NULL.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 */
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
			unsigned long pfn)
{
	int ret;
	pgprot_t pgprot = vma->vm_page_prot;
	/*
	 * Technically, architectures with pte_special can avoid all these
	 * restrictions (same for remap_pfn_range).  However we would like
	 * consistency in testing and feature parity among all, so we should
	 * try to keep these invariants in place for everybody.
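	 *
	 * As a hedged sketch of how a driver satisfies these invariants
	 * (illustrative only, not code from this file): the vma is tagged
	 * once in the driver's ->mmap, and pfns are only inserted after
	 * that, typically from its ->fault handler:
	 *
	 *	vma->vm_flags |= VM_PFNMAP;	(or VM_MIXEDMAP, never both)
	 *	...
	 *	err = vm_insert_pfn(vma, address, pfn);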
1617 */ 1618 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); 1619 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1620 (VM_PFNMAP|VM_MIXEDMAP)); 1621 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1622 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); 1623 1624 if (addr < vma->vm_start || addr >= vma->vm_end) 1625 return -EFAULT; 1626 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) 1627 return -EINVAL; 1628 1629 ret = insert_pfn(vma, addr, pfn, pgprot); 1630 1631 if (ret) 1632 untrack_pfn_vma(vma, pfn, PAGE_SIZE); 1633 1634 return ret; 1635 } 1636 EXPORT_SYMBOL(vm_insert_pfn); 1637 1638 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1639 unsigned long pfn) 1640 { 1641 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); 1642 1643 if (addr < vma->vm_start || addr >= vma->vm_end) 1644 return -EFAULT; 1645 1646 /* 1647 * If we don't have pte special, then we have to use the pfn_valid() 1648 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* 1649 * refcount the page if pfn_valid is true (hence insert_page rather 1650 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP 1651 * without pte special, it would there be refcounted as a normal page. 1652 */ 1653 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { 1654 struct page *page; 1655 1656 page = pfn_to_page(pfn); 1657 return insert_page(vma, addr, page, vma->vm_page_prot); 1658 } 1659 return insert_pfn(vma, addr, pfn, vma->vm_page_prot); 1660 } 1661 EXPORT_SYMBOL(vm_insert_mixed); 1662 1663 /* 1664 * maps a range of physical memory into the requested pages. the old 1665 * mappings are removed. any references to nonexistent pages results 1666 * in null mappings (currently treated as "copy-on-access") 1667 */ 1668 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, 1669 unsigned long addr, unsigned long end, 1670 unsigned long pfn, pgprot_t prot) 1671 { 1672 pte_t *pte; 1673 spinlock_t *ptl; 1674 1675 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); 1676 if (!pte) 1677 return -ENOMEM; 1678 arch_enter_lazy_mmu_mode(); 1679 do { 1680 BUG_ON(!pte_none(*pte)); 1681 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); 1682 pfn++; 1683 } while (pte++, addr += PAGE_SIZE, addr != end); 1684 arch_leave_lazy_mmu_mode(); 1685 pte_unmap_unlock(pte - 1, ptl); 1686 return 0; 1687 } 1688 1689 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, 1690 unsigned long addr, unsigned long end, 1691 unsigned long pfn, pgprot_t prot) 1692 { 1693 pmd_t *pmd; 1694 unsigned long next; 1695 1696 pfn -= addr >> PAGE_SHIFT; 1697 pmd = pmd_alloc(mm, pud, addr); 1698 if (!pmd) 1699 return -ENOMEM; 1700 do { 1701 next = pmd_addr_end(addr, end); 1702 if (remap_pte_range(mm, pmd, addr, next, 1703 pfn + (addr >> PAGE_SHIFT), prot)) 1704 return -ENOMEM; 1705 } while (pmd++, addr = next, addr != end); 1706 return 0; 1707 } 1708 1709 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, 1710 unsigned long addr, unsigned long end, 1711 unsigned long pfn, pgprot_t prot) 1712 { 1713 pud_t *pud; 1714 unsigned long next; 1715 1716 pfn -= addr >> PAGE_SHIFT; 1717 pud = pud_alloc(mm, pgd, addr); 1718 if (!pud) 1719 return -ENOMEM; 1720 do { 1721 next = pud_addr_end(addr, end); 1722 if (remap_pmd_range(mm, pud, addr, next, 1723 pfn + (addr >> PAGE_SHIFT), prot)) 1724 return -ENOMEM; 1725 } while (pud++, addr = next, addr != end); 1726 return 0; 1727 } 1728 1729 /** 1730 * remap_pfn_range - remap kernel memory to userspace 1731 * @vma: user vma to map to 1732 * @addr: target 
user address to start at 1733 * @pfn: physical address of kernel memory 1734 * @size: size of map area 1735 * @prot: page protection flags for this mapping 1736 * 1737 * Note: this is only safe if the mm semaphore is held when called. 1738 */ 1739 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 1740 unsigned long pfn, unsigned long size, pgprot_t prot) 1741 { 1742 pgd_t *pgd; 1743 unsigned long next; 1744 unsigned long end = addr + PAGE_ALIGN(size); 1745 struct mm_struct *mm = vma->vm_mm; 1746 int err; 1747 1748 /* 1749 * Physically remapped pages are special. Tell the 1750 * rest of the world about it: 1751 * VM_IO tells people not to look at these pages 1752 * (accesses can have side effects). 1753 * VM_RESERVED is specified all over the place, because 1754 * in 2.4 it kept swapout's vma scan off this vma; but 1755 * in 2.6 the LRU scan won't even find its pages, so this 1756 * flag means no more than count its pages in reserved_vm, 1757 * and omit it from core dump, even when VM_IO turned off. 1758 * VM_PFNMAP tells the core MM that the base pages are just 1759 * raw PFN mappings, and do not have a "struct page" associated 1760 * with them. 1761 * 1762 * There's a horrible special case to handle copy-on-write 1763 * behaviour that some programs depend on. We mark the "original" 1764 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 1765 */ 1766 if (addr == vma->vm_start && end == vma->vm_end) { 1767 vma->vm_pgoff = pfn; 1768 vma->vm_flags |= VM_PFN_AT_MMAP; 1769 } else if (is_cow_mapping(vma->vm_flags)) 1770 return -EINVAL; 1771 1772 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 1773 1774 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); 1775 if (err) { 1776 /* 1777 * To indicate that track_pfn related cleanup is not 1778 * needed from higher level routine calling unmap_vmas 1779 */ 1780 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); 1781 vma->vm_flags &= ~VM_PFN_AT_MMAP; 1782 return -EINVAL; 1783 } 1784 1785 BUG_ON(addr >= end); 1786 pfn -= addr >> PAGE_SHIFT; 1787 pgd = pgd_offset(mm, addr); 1788 flush_cache_range(vma, addr, end); 1789 do { 1790 next = pgd_addr_end(addr, end); 1791 err = remap_pud_range(mm, pgd, addr, next, 1792 pfn + (addr >> PAGE_SHIFT), prot); 1793 if (err) 1794 break; 1795 } while (pgd++, addr = next, addr != end); 1796 1797 if (err) 1798 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); 1799 1800 return err; 1801 } 1802 EXPORT_SYMBOL(remap_pfn_range); 1803 1804 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, 1805 unsigned long addr, unsigned long end, 1806 pte_fn_t fn, void *data) 1807 { 1808 pte_t *pte; 1809 int err; 1810 pgtable_t token; 1811 spinlock_t *uninitialized_var(ptl); 1812 1813 pte = (mm == &init_mm) ? 
1814 pte_alloc_kernel(pmd, addr) : 1815 pte_alloc_map_lock(mm, pmd, addr, &ptl); 1816 if (!pte) 1817 return -ENOMEM; 1818 1819 BUG_ON(pmd_huge(*pmd)); 1820 1821 arch_enter_lazy_mmu_mode(); 1822 1823 token = pmd_pgtable(*pmd); 1824 1825 do { 1826 err = fn(pte++, token, addr, data); 1827 if (err) 1828 break; 1829 } while (addr += PAGE_SIZE, addr != end); 1830 1831 arch_leave_lazy_mmu_mode(); 1832 1833 if (mm != &init_mm) 1834 pte_unmap_unlock(pte-1, ptl); 1835 return err; 1836 } 1837 1838 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, 1839 unsigned long addr, unsigned long end, 1840 pte_fn_t fn, void *data) 1841 { 1842 pmd_t *pmd; 1843 unsigned long next; 1844 int err; 1845 1846 BUG_ON(pud_huge(*pud)); 1847 1848 pmd = pmd_alloc(mm, pud, addr); 1849 if (!pmd) 1850 return -ENOMEM; 1851 do { 1852 next = pmd_addr_end(addr, end); 1853 err = apply_to_pte_range(mm, pmd, addr, next, fn, data); 1854 if (err) 1855 break; 1856 } while (pmd++, addr = next, addr != end); 1857 return err; 1858 } 1859 1860 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, 1861 unsigned long addr, unsigned long end, 1862 pte_fn_t fn, void *data) 1863 { 1864 pud_t *pud; 1865 unsigned long next; 1866 int err; 1867 1868 pud = pud_alloc(mm, pgd, addr); 1869 if (!pud) 1870 return -ENOMEM; 1871 do { 1872 next = pud_addr_end(addr, end); 1873 err = apply_to_pmd_range(mm, pud, addr, next, fn, data); 1874 if (err) 1875 break; 1876 } while (pud++, addr = next, addr != end); 1877 return err; 1878 } 1879 1880 /* 1881 * Scan a region of virtual memory, filling in page tables as necessary 1882 * and calling a provided function on each leaf page table. 1883 */ 1884 int apply_to_page_range(struct mm_struct *mm, unsigned long addr, 1885 unsigned long size, pte_fn_t fn, void *data) 1886 { 1887 pgd_t *pgd; 1888 unsigned long next; 1889 unsigned long start = addr, end = addr + size; 1890 int err; 1891 1892 BUG_ON(addr >= end); 1893 mmu_notifier_invalidate_range_start(mm, start, end); 1894 pgd = pgd_offset(mm, addr); 1895 do { 1896 next = pgd_addr_end(addr, end); 1897 err = apply_to_pud_range(mm, pgd, addr, next, fn, data); 1898 if (err) 1899 break; 1900 } while (pgd++, addr = next, addr != end); 1901 mmu_notifier_invalidate_range_end(mm, start, end); 1902 return err; 1903 } 1904 EXPORT_SYMBOL_GPL(apply_to_page_range); 1905 1906 /* 1907 * handle_pte_fault chooses page fault handler according to an entry 1908 * which was read non-atomically. Before making any commitment, on 1909 * those architectures or configurations (e.g. i386 with PAE) which 1910 * might give a mix of unmatched parts, do_swap_page and do_file_page 1911 * must check under lock before unmapping the pte and proceeding 1912 * (but do_wp_page is only called after already making such a check; 1913 * and do_anonymous_page and do_no_page can safely check later on). 1914 */ 1915 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, 1916 pte_t *page_table, pte_t orig_pte) 1917 { 1918 int same = 1; 1919 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) 1920 if (sizeof(pte_t) > sizeof(unsigned long)) { 1921 spinlock_t *ptl = pte_lockptr(mm, pmd); 1922 spin_lock(ptl); 1923 same = pte_same(*page_table, orig_pte); 1924 spin_unlock(ptl); 1925 } 1926 #endif 1927 pte_unmap(page_table); 1928 return same; 1929 } 1930 1931 /* 1932 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when 1933 * servicing faults for write access. In the normal case, do always want 1934 * pte_mkwrite. 
But get_user_pages can cause write faults for mappings 1935 * that do not have writing enabled, when used by access_process_vm. 1936 */ 1937 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) 1938 { 1939 if (likely(vma->vm_flags & VM_WRITE)) 1940 pte = pte_mkwrite(pte); 1941 return pte; 1942 } 1943 1944 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 1945 { 1946 /* 1947 * If the source page was a PFN mapping, we don't have 1948 * a "struct page" for it. We do a best-effort copy by 1949 * just copying from the original user address. If that 1950 * fails, we just zero-fill it. Live with it. 1951 */ 1952 if (unlikely(!src)) { 1953 void *kaddr = kmap_atomic(dst, KM_USER0); 1954 void __user *uaddr = (void __user *)(va & PAGE_MASK); 1955 1956 /* 1957 * This really shouldn't fail, because the page is there 1958 * in the page tables. But it might just be unreadable, 1959 * in which case we just give up and fill the result with 1960 * zeroes. 1961 */ 1962 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 1963 memset(kaddr, 0, PAGE_SIZE); 1964 kunmap_atomic(kaddr, KM_USER0); 1965 flush_dcache_page(dst); 1966 } else 1967 copy_user_highpage(dst, src, va, vma); 1968 } 1969 1970 /* 1971 * This routine handles present pages, when users try to write 1972 * to a shared page. It is done by copying the page to a new address 1973 * and decrementing the shared-page counter for the old page. 1974 * 1975 * Note that this routine assumes that the protection checks have been 1976 * done by the caller (the low-level page fault routine in most cases). 1977 * Thus we can safely just mark it writable once we've done any necessary 1978 * COW. 1979 * 1980 * We also mark the page dirty at this point even though the page will 1981 * change only once the write actually happens. This avoids a few races, 1982 * and potentially makes it more efficient. 1983 * 1984 * We enter with non-exclusive mmap_sem (to exclude vma changes, 1985 * but allow concurrent faults), with pte both mapped and locked. 1986 * We return with mmap_sem still held, but pte unmapped and unlocked. 1987 */ 1988 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 1989 unsigned long address, pte_t *page_table, pmd_t *pmd, 1990 spinlock_t *ptl, pte_t orig_pte) 1991 { 1992 struct page *old_page, *new_page; 1993 pte_t entry; 1994 int reuse = 0, ret = 0; 1995 int page_mkwrite = 0; 1996 struct page *dirty_page = NULL; 1997 1998 old_page = vm_normal_page(vma, address, orig_pte); 1999 if (!old_page) { 2000 /* 2001 * VM_MIXEDMAP !pfn_valid() case 2002 * 2003 * We should not cow pages in a shared writeable mapping. 2004 * Just mark the pages writable as we can't do any dirty 2005 * accounting on raw pfn maps. 2006 */ 2007 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2008 (VM_WRITE|VM_SHARED)) 2009 goto reuse; 2010 goto gotten; 2011 } 2012 2013 /* 2014 * Take out anonymous pages first, anonymous shared vmas are 2015 * not dirty accountable. 
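 * If we turn out to be the only user of an anonymous page, there is no
 * need to copy at all: reuse_swap_page() checks this under the page
 * lock, and the "reuse" path below then simply makes the existing pte
 * writable.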
2016 */ 2017 if (PageAnon(old_page) && !PageKsm(old_page)) { 2018 if (!trylock_page(old_page)) { 2019 page_cache_get(old_page); 2020 pte_unmap_unlock(page_table, ptl); 2021 lock_page(old_page); 2022 page_table = pte_offset_map_lock(mm, pmd, address, 2023 &ptl); 2024 if (!pte_same(*page_table, orig_pte)) { 2025 unlock_page(old_page); 2026 page_cache_release(old_page); 2027 goto unlock; 2028 } 2029 page_cache_release(old_page); 2030 } 2031 reuse = reuse_swap_page(old_page); 2032 unlock_page(old_page); 2033 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2034 (VM_WRITE|VM_SHARED))) { 2035 /* 2036 * Only catch write-faults on shared writable pages, 2037 * read-only shared pages can get COWed by 2038 * get_user_pages(.write=1, .force=1). 2039 */ 2040 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 2041 struct vm_fault vmf; 2042 int tmp; 2043 2044 vmf.virtual_address = (void __user *)(address & 2045 PAGE_MASK); 2046 vmf.pgoff = old_page->index; 2047 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; 2048 vmf.page = old_page; 2049 2050 /* 2051 * Notify the address space that the page is about to 2052 * become writable so that it can prohibit this or wait 2053 * for the page to get into an appropriate state. 2054 * 2055 * We do this without the lock held, so that it can 2056 * sleep if it needs to. 2057 */ 2058 page_cache_get(old_page); 2059 pte_unmap_unlock(page_table, ptl); 2060 2061 tmp = vma->vm_ops->page_mkwrite(vma, &vmf); 2062 if (unlikely(tmp & 2063 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { 2064 ret = tmp; 2065 goto unwritable_page; 2066 } 2067 if (unlikely(!(tmp & VM_FAULT_LOCKED))) { 2068 lock_page(old_page); 2069 if (!old_page->mapping) { 2070 ret = 0; /* retry the fault */ 2071 unlock_page(old_page); 2072 goto unwritable_page; 2073 } 2074 } else 2075 VM_BUG_ON(!PageLocked(old_page)); 2076 2077 /* 2078 * Since we dropped the lock we need to revalidate 2079 * the PTE as someone else may have changed it. If 2080 * they did, we just return, as we can count on the 2081 * MMU to tell us if they didn't also make it writable. 2082 */ 2083 page_table = pte_offset_map_lock(mm, pmd, address, 2084 &ptl); 2085 if (!pte_same(*page_table, orig_pte)) { 2086 unlock_page(old_page); 2087 page_cache_release(old_page); 2088 goto unlock; 2089 } 2090 2091 page_mkwrite = 1; 2092 } 2093 dirty_page = old_page; 2094 get_page(dirty_page); 2095 reuse = 1; 2096 } 2097 2098 if (reuse) { 2099 reuse: 2100 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2101 entry = pte_mkyoung(orig_pte); 2102 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2103 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2104 update_mmu_cache(vma, address, entry); 2105 ret |= VM_FAULT_WRITE; 2106 goto unlock; 2107 } 2108 2109 /* 2110 * Ok, we need to copy. Oh, well.. 2111 */ 2112 page_cache_get(old_page); 2113 gotten: 2114 pte_unmap_unlock(page_table, ptl); 2115 2116 if (unlikely(anon_vma_prepare(vma))) 2117 goto oom; 2118 2119 if (is_zero_pfn(pte_pfn(orig_pte))) { 2120 new_page = alloc_zeroed_user_highpage_movable(vma, address); 2121 if (!new_page) 2122 goto oom; 2123 } else { 2124 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2125 if (!new_page) 2126 goto oom; 2127 cow_user_page(new_page, old_page, address, vma); 2128 } 2129 __SetPageUptodate(new_page); 2130 2131 /* 2132 * Don't let another task, with possibly unlocked vma, 2133 * keep the mlocked page. 
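 * clear_page_mlock() clears the old page's mlock state so it can return
 * to a normal LRU list and be reclaimed once this vma has switched over
 * to the new copy.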
2134 */ 2135 if ((vma->vm_flags & VM_LOCKED) && old_page) { 2136 lock_page(old_page); /* for LRU manipulation */ 2137 clear_page_mlock(old_page); 2138 unlock_page(old_page); 2139 } 2140 2141 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2142 goto oom_free_new; 2143 2144 /* 2145 * Re-check the pte - we dropped the lock 2146 */ 2147 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2148 if (likely(pte_same(*page_table, orig_pte))) { 2149 if (old_page) { 2150 if (!PageAnon(old_page)) { 2151 dec_mm_counter(mm, file_rss); 2152 inc_mm_counter(mm, anon_rss); 2153 } 2154 } else 2155 inc_mm_counter(mm, anon_rss); 2156 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2157 entry = mk_pte(new_page, vma->vm_page_prot); 2158 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2159 /* 2160 * Clear the pte entry and flush it first, before updating the 2161 * pte with the new entry. This will avoid a race condition 2162 * seen in the presence of one thread doing SMC and another 2163 * thread doing COW. 2164 */ 2165 ptep_clear_flush(vma, address, page_table); 2166 page_add_new_anon_rmap(new_page, vma, address); 2167 /* 2168 * We call the notify macro here because, when using secondary 2169 * mmu page tables (such as kvm shadow page tables), we want the 2170 * new page to be mapped directly into the secondary page table. 2171 */ 2172 set_pte_at_notify(mm, address, page_table, entry); 2173 update_mmu_cache(vma, address, entry); 2174 if (old_page) { 2175 /* 2176 * Only after switching the pte to the new page may 2177 * we remove the mapcount here. Otherwise another 2178 * process may come and find the rmap count decremented 2179 * before the pte is switched to the new page, and 2180 * "reuse" the old page writing into it while our pte 2181 * here still points into it and can be read by other 2182 * threads. 2183 * 2184 * The critical issue is to order this 2185 * page_remove_rmap with the ptp_clear_flush above. 2186 * Those stores are ordered by (if nothing else,) 2187 * the barrier present in the atomic_add_negative 2188 * in page_remove_rmap. 2189 * 2190 * Then the TLB flush in ptep_clear_flush ensures that 2191 * no process can access the old page before the 2192 * decremented mapcount is visible. And the old page 2193 * cannot be reused until after the decremented 2194 * mapcount is visible. So transitively, TLBs to 2195 * old page will be flushed before it can be reused. 2196 */ 2197 page_remove_rmap(old_page); 2198 } 2199 2200 /* Free the old page.. */ 2201 new_page = old_page; 2202 ret |= VM_FAULT_WRITE; 2203 } else 2204 mem_cgroup_uncharge_page(new_page); 2205 2206 if (new_page) 2207 page_cache_release(new_page); 2208 if (old_page) 2209 page_cache_release(old_page); 2210 unlock: 2211 pte_unmap_unlock(page_table, ptl); 2212 if (dirty_page) { 2213 /* 2214 * Yes, Virginia, this is actually required to prevent a race 2215 * with clear_page_dirty_for_io() from clearing the page dirty 2216 * bit after it clear all dirty ptes, but before a racing 2217 * do_wp_page installs a dirty pte. 2218 * 2219 * do_no_page is protected similarly. 
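 * Note that the wait_on_page_locked()/set_page_dirty_balance() below is
 * only used when no ->page_mkwrite was involved; page_mkwrite pages are
 * dirtied and write-balanced explicitly further down.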
2220 */ 2221 if (!page_mkwrite) { 2222 wait_on_page_locked(dirty_page); 2223 set_page_dirty_balance(dirty_page, page_mkwrite); 2224 } 2225 put_page(dirty_page); 2226 if (page_mkwrite) { 2227 struct address_space *mapping = dirty_page->mapping; 2228 2229 set_page_dirty(dirty_page); 2230 unlock_page(dirty_page); 2231 page_cache_release(dirty_page); 2232 if (mapping) { 2233 /* 2234 * Some device drivers do not set page.mapping 2235 * but still dirty their pages 2236 */ 2237 balance_dirty_pages_ratelimited(mapping); 2238 } 2239 } 2240 2241 /* file_update_time outside page_lock */ 2242 if (vma->vm_file) 2243 file_update_time(vma->vm_file); 2244 } 2245 return ret; 2246 oom_free_new: 2247 page_cache_release(new_page); 2248 oom: 2249 if (old_page) { 2250 if (page_mkwrite) { 2251 unlock_page(old_page); 2252 page_cache_release(old_page); 2253 } 2254 page_cache_release(old_page); 2255 } 2256 return VM_FAULT_OOM; 2257 2258 unwritable_page: 2259 page_cache_release(old_page); 2260 return ret; 2261 } 2262 2263 /* 2264 * Helper functions for unmap_mapping_range(). 2265 * 2266 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ 2267 * 2268 * We have to restart searching the prio_tree whenever we drop the lock, 2269 * since the iterator is only valid while the lock is held, and anyway 2270 * a later vma might be split and reinserted earlier while lock dropped. 2271 * 2272 * The list of nonlinear vmas could be handled more efficiently, using 2273 * a placeholder, but handle it in the same way until a need is shown. 2274 * It is important to search the prio_tree before nonlinear list: a vma 2275 * may become nonlinear and be shifted from prio_tree to nonlinear list 2276 * while the lock is dropped; but never shifted from list to prio_tree. 2277 * 2278 * In order to make forward progress despite restarting the search, 2279 * vm_truncate_count is used to mark a vma as now dealt with, so we can 2280 * quickly skip it next time around. Since the prio_tree search only 2281 * shows us those vmas affected by unmapping the range in question, we 2282 * can't efficiently keep all vmas in step with mapping->truncate_count: 2283 * so instead reset them all whenever it wraps back to 0 (then go to 1). 2284 * mapping->truncate_count and vma->vm_truncate_count are protected by 2285 * i_mmap_lock. 2286 * 2287 * In order to make forward progress despite repeatedly restarting some 2288 * large vma, note the restart_addr from unmap_vmas when it breaks out: 2289 * and restart from that address when we reach that vma again. It might 2290 * have been split or merged, shrunk or extended, but never shifted: so 2291 * restart_addr remains valid so long as it remains in the vma's range. 2292 * unmap_mapping_range forces truncate_count to leap over page-aligned 2293 * values so we can save vma's restart_addr in its truncate_count field. 
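 * The upshot: a page-aligned value found in vm_truncate_count is a
 * saved restart_addr, anything else is a truncate_count generation;
 * is_restart_addr() below is what tells the two apart.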
2294 */ 2295 #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) 2296 2297 static void reset_vma_truncate_counts(struct address_space *mapping) 2298 { 2299 struct vm_area_struct *vma; 2300 struct prio_tree_iter iter; 2301 2302 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) 2303 vma->vm_truncate_count = 0; 2304 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 2305 vma->vm_truncate_count = 0; 2306 } 2307 2308 static int unmap_mapping_range_vma(struct vm_area_struct *vma, 2309 unsigned long start_addr, unsigned long end_addr, 2310 struct zap_details *details) 2311 { 2312 unsigned long restart_addr; 2313 int need_break; 2314 2315 /* 2316 * files that support invalidating or truncating portions of the 2317 * file from under mmaped areas must have their ->fault function 2318 * return a locked page (and set VM_FAULT_LOCKED in the return). 2319 * This provides synchronisation against concurrent unmapping here. 2320 */ 2321 2322 again: 2323 restart_addr = vma->vm_truncate_count; 2324 if (is_restart_addr(restart_addr) && start_addr < restart_addr) { 2325 start_addr = restart_addr; 2326 if (start_addr >= end_addr) { 2327 /* Top of vma has been split off since last time */ 2328 vma->vm_truncate_count = details->truncate_count; 2329 return 0; 2330 } 2331 } 2332 2333 restart_addr = zap_page_range(vma, start_addr, 2334 end_addr - start_addr, details); 2335 need_break = need_resched() || spin_needbreak(details->i_mmap_lock); 2336 2337 if (restart_addr >= end_addr) { 2338 /* We have now completed this vma: mark it so */ 2339 vma->vm_truncate_count = details->truncate_count; 2340 if (!need_break) 2341 return 0; 2342 } else { 2343 /* Note restart_addr in vma's truncate_count field */ 2344 vma->vm_truncate_count = restart_addr; 2345 if (!need_break) 2346 goto again; 2347 } 2348 2349 spin_unlock(details->i_mmap_lock); 2350 cond_resched(); 2351 spin_lock(details->i_mmap_lock); 2352 return -EINTR; 2353 } 2354 2355 static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2356 struct zap_details *details) 2357 { 2358 struct vm_area_struct *vma; 2359 struct prio_tree_iter iter; 2360 pgoff_t vba, vea, zba, zea; 2361 2362 restart: 2363 vma_prio_tree_foreach(vma, &iter, root, 2364 details->first_index, details->last_index) { 2365 /* Skip quickly over those we have already dealt with */ 2366 if (vma->vm_truncate_count == details->truncate_count) 2367 continue; 2368 2369 vba = vma->vm_pgoff; 2370 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; 2371 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ 2372 zba = details->first_index; 2373 if (zba < vba) 2374 zba = vba; 2375 zea = details->last_index; 2376 if (zea > vea) 2377 zea = vea; 2378 2379 if (unmap_mapping_range_vma(vma, 2380 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, 2381 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, 2382 details) < 0) 2383 goto restart; 2384 } 2385 } 2386 2387 static inline void unmap_mapping_range_list(struct list_head *head, 2388 struct zap_details *details) 2389 { 2390 struct vm_area_struct *vma; 2391 2392 /* 2393 * In nonlinear VMAs there is no correspondence between virtual address 2394 * offset and file offset. So we must perform an exhaustive search 2395 * across *all* the pages in each nonlinear VMA, not just the pages 2396 * whose virtual address lies outside the file truncation point. 
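 * Hence unmap_mapping_range_vma() is applied to each nonlinear vma over
 * its full [vm_start, vm_end) range below.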
2397 */ 2398 restart: 2399 list_for_each_entry(vma, head, shared.vm_set.list) { 2400 /* Skip quickly over those we have already dealt with */ 2401 if (vma->vm_truncate_count == details->truncate_count) 2402 continue; 2403 details->nonlinear_vma = vma; 2404 if (unmap_mapping_range_vma(vma, vma->vm_start, 2405 vma->vm_end, details) < 0) 2406 goto restart; 2407 } 2408 } 2409 2410 /** 2411 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. 2412 * @mapping: the address space containing mmaps to be unmapped. 2413 * @holebegin: byte in first page to unmap, relative to the start of 2414 * the underlying file. This will be rounded down to a PAGE_SIZE 2415 * boundary. Note that this is different from truncate_pagecache(), which 2416 * must keep the partial page. In contrast, we must get rid of 2417 * partial pages. 2418 * @holelen: size of prospective hole in bytes. This will be rounded 2419 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the 2420 * end of the file. 2421 * @even_cows: 1 when truncating a file, unmap even private COWed pages; 2422 * but 0 when invalidating pagecache, don't throw away private data. 2423 */ 2424 void unmap_mapping_range(struct address_space *mapping, 2425 loff_t const holebegin, loff_t const holelen, int even_cows) 2426 { 2427 struct zap_details details; 2428 pgoff_t hba = holebegin >> PAGE_SHIFT; 2429 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2430 2431 /* Check for overflow. */ 2432 if (sizeof(holelen) > sizeof(hlen)) { 2433 long long holeend = 2434 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2435 if (holeend & ~(long long)ULONG_MAX) 2436 hlen = ULONG_MAX - hba + 1; 2437 } 2438 2439 details.check_mapping = even_cows? NULL: mapping; 2440 details.nonlinear_vma = NULL; 2441 details.first_index = hba; 2442 details.last_index = hba + hlen - 1; 2443 if (details.last_index < details.first_index) 2444 details.last_index = ULONG_MAX; 2445 details.i_mmap_lock = &mapping->i_mmap_lock; 2446 2447 spin_lock(&mapping->i_mmap_lock); 2448 2449 /* Protect against endless unmapping loops */ 2450 mapping->truncate_count++; 2451 if (unlikely(is_restart_addr(mapping->truncate_count))) { 2452 if (mapping->truncate_count == 0) 2453 reset_vma_truncate_counts(mapping); 2454 mapping->truncate_count++; 2455 } 2456 details.truncate_count = mapping->truncate_count; 2457 2458 if (unlikely(!prio_tree_empty(&mapping->i_mmap))) 2459 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2460 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2461 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2462 spin_unlock(&mapping->i_mmap_lock); 2463 } 2464 EXPORT_SYMBOL(unmap_mapping_range); 2465 2466 int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) 2467 { 2468 struct address_space *mapping = inode->i_mapping; 2469 2470 /* 2471 * If the underlying filesystem is not going to provide 2472 * a way to truncate a range of blocks (punch a hole) - 2473 * we should return failure right now. 
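 * Otherwise: unmap any mmaps of the range, drop the cached pages, unmap
 * once more to catch pages faulted back in meanwhile, and finally ask
 * the filesystem to punch the hole in the blocks on disk.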
2474 */ 2475 if (!inode->i_op->truncate_range) 2476 return -ENOSYS; 2477 2478 mutex_lock(&inode->i_mutex); 2479 down_write(&inode->i_alloc_sem); 2480 unmap_mapping_range(mapping, offset, (end - offset), 1); 2481 truncate_inode_pages_range(mapping, offset, end); 2482 unmap_mapping_range(mapping, offset, (end - offset), 1); 2483 inode->i_op->truncate_range(inode, offset, end); 2484 up_write(&inode->i_alloc_sem); 2485 mutex_unlock(&inode->i_mutex); 2486 2487 return 0; 2488 } 2489 2490 /* 2491 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2492 * but allow concurrent faults), and pte mapped but not yet locked. 2493 * We return with mmap_sem still held, but pte unmapped and unlocked. 2494 */ 2495 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 2496 unsigned long address, pte_t *page_table, pmd_t *pmd, 2497 unsigned int flags, pte_t orig_pte) 2498 { 2499 spinlock_t *ptl; 2500 struct page *page; 2501 swp_entry_t entry; 2502 pte_t pte; 2503 struct mem_cgroup *ptr = NULL; 2504 int ret = 0; 2505 2506 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2507 goto out; 2508 2509 entry = pte_to_swp_entry(orig_pte); 2510 if (unlikely(non_swap_entry(entry))) { 2511 if (is_migration_entry(entry)) { 2512 migration_entry_wait(mm, pmd, address); 2513 } else if (is_hwpoison_entry(entry)) { 2514 ret = VM_FAULT_HWPOISON; 2515 } else { 2516 print_bad_pte(vma, address, orig_pte, NULL); 2517 ret = VM_FAULT_OOM; 2518 } 2519 goto out; 2520 } 2521 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2522 page = lookup_swap_cache(entry); 2523 if (!page) { 2524 grab_swap_token(mm); /* Contend for token _before_ read-in */ 2525 page = swapin_readahead(entry, 2526 GFP_HIGHUSER_MOVABLE, vma, address); 2527 if (!page) { 2528 /* 2529 * Back out if somebody else faulted in this pte 2530 * while we released the pte lock. 2531 */ 2532 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2533 if (likely(pte_same(*page_table, orig_pte))) 2534 ret = VM_FAULT_OOM; 2535 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2536 goto unlock; 2537 } 2538 2539 /* Had to read the page from swap area: Major fault */ 2540 ret = VM_FAULT_MAJOR; 2541 count_vm_event(PGMAJFAULT); 2542 } else if (PageHWPoison(page)) { 2543 ret = VM_FAULT_HWPOISON; 2544 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2545 goto out_release; 2546 } 2547 2548 lock_page(page); 2549 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2550 2551 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 2552 ret = VM_FAULT_OOM; 2553 goto out_page; 2554 } 2555 2556 /* 2557 * Back out if somebody else already faulted in this pte. 2558 */ 2559 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2560 if (unlikely(!pte_same(*page_table, orig_pte))) 2561 goto out_nomap; 2562 2563 if (unlikely(!PageUptodate(page))) { 2564 ret = VM_FAULT_SIGBUS; 2565 goto out_nomap; 2566 } 2567 2568 /* 2569 * The page isn't present yet, go ahead with the fault. 2570 * 2571 * Be careful about the sequence of operations here. 2572 * To get its accounting right, reuse_swap_page() must be called 2573 * while the page is counted on swap but not yet in mapcount i.e. 2574 * before page_add_anon_rmap() and swap_free(); try_to_free_swap() 2575 * must be called after the swap_free(), or it will never succeed. 2576 * Because delete_from_swap_page() may be called by reuse_swap_page(), 2577 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry 2578 * in page->private. In this case, a record in swap_cgroup is silently 2579 * discarded at swap_free(). 
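 * The code below therefore orders things as: reuse_swap_page() (on
 * write faults) first, then set_pte_at() and page_add_anon_rmap(), then
 * mem_cgroup_commit_charge_swapin(), then swap_free(), and only then
 * try_to_free_swap().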
2580 */ 2581 2582 inc_mm_counter(mm, anon_rss); 2583 pte = mk_pte(page, vma->vm_page_prot); 2584 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 2585 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2586 flags &= ~FAULT_FLAG_WRITE; 2587 } 2588 flush_icache_page(vma, page); 2589 set_pte_at(mm, address, page_table, pte); 2590 page_add_anon_rmap(page, vma, address); 2591 /* It's better to call commit-charge after rmap is established */ 2592 mem_cgroup_commit_charge_swapin(page, ptr); 2593 2594 swap_free(entry); 2595 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 2596 try_to_free_swap(page); 2597 unlock_page(page); 2598 2599 if (flags & FAULT_FLAG_WRITE) { 2600 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); 2601 if (ret & VM_FAULT_ERROR) 2602 ret &= VM_FAULT_ERROR; 2603 goto out; 2604 } 2605 2606 /* No need to invalidate - it was non-present before */ 2607 update_mmu_cache(vma, address, pte); 2608 unlock: 2609 pte_unmap_unlock(page_table, ptl); 2610 out: 2611 return ret; 2612 out_nomap: 2613 mem_cgroup_cancel_charge_swapin(ptr); 2614 pte_unmap_unlock(page_table, ptl); 2615 out_page: 2616 unlock_page(page); 2617 out_release: 2618 page_cache_release(page); 2619 return ret; 2620 } 2621 2622 /* 2623 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2624 * but allow concurrent faults), and pte mapped but not yet locked. 2625 * We return with mmap_sem still held, but pte unmapped and unlocked. 2626 */ 2627 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 2628 unsigned long address, pte_t *page_table, pmd_t *pmd, 2629 unsigned int flags) 2630 { 2631 struct page *page; 2632 spinlock_t *ptl; 2633 pte_t entry; 2634 2635 if (!(flags & FAULT_FLAG_WRITE)) { 2636 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), 2637 vma->vm_page_prot)); 2638 ptl = pte_lockptr(mm, pmd); 2639 spin_lock(ptl); 2640 if (!pte_none(*page_table)) 2641 goto unlock; 2642 goto setpte; 2643 } 2644 2645 /* Allocate our own private page. */ 2646 pte_unmap(page_table); 2647 2648 if (unlikely(anon_vma_prepare(vma))) 2649 goto oom; 2650 page = alloc_zeroed_user_highpage_movable(vma, address); 2651 if (!page) 2652 goto oom; 2653 __SetPageUptodate(page); 2654 2655 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) 2656 goto oom_free_page; 2657 2658 entry = mk_pte(page, vma->vm_page_prot); 2659 if (vma->vm_flags & VM_WRITE) 2660 entry = pte_mkwrite(pte_mkdirty(entry)); 2661 2662 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2663 if (!pte_none(*page_table)) 2664 goto release; 2665 2666 inc_mm_counter(mm, anon_rss); 2667 page_add_new_anon_rmap(page, vma, address); 2668 setpte: 2669 set_pte_at(mm, address, page_table, entry); 2670 2671 /* No need to invalidate - it was non-present before */ 2672 update_mmu_cache(vma, address, entry); 2673 unlock: 2674 pte_unmap_unlock(page_table, ptl); 2675 return 0; 2676 release: 2677 mem_cgroup_uncharge_page(page); 2678 page_cache_release(page); 2679 goto unlock; 2680 oom_free_page: 2681 page_cache_release(page); 2682 oom: 2683 return VM_FAULT_OOM; 2684 } 2685 2686 /* 2687 * __do_fault() tries to create a new page mapping. It aggressively 2688 * tries to share with existing pages, but makes a separate copy if 2689 * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid 2690 * the next page fault. 2691 * 2692 * As this is called only for pages that do not currently exist, we 2693 * do not need to flush old virtual caches or the TLB. 
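 *
 * The new page itself comes from the vma's ->fault handler.  As a rough
 * sketch (illustrative only; mydrv_lookup_page() is a made-up helper),
 * a minimal handler looks like:
 *
 *	static int mydrv_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 *	{
 *		struct page *page;
 *
 *		page = mydrv_lookup_page(vma->vm_private_data, vmf->pgoff);
 *		if (!page)
 *			return VM_FAULT_SIGBUS;
 *		get_page(page);		/* __do_fault expects a referenced page */
 *		vmf->page = page;
 *		return 0;
 *	}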
2694 * 2695 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2696 * but allow concurrent faults), and pte neither mapped nor locked. 2697 * We return with mmap_sem still held, but pte unmapped and unlocked. 2698 */ 2699 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2700 unsigned long address, pmd_t *pmd, 2701 pgoff_t pgoff, unsigned int flags, pte_t orig_pte) 2702 { 2703 pte_t *page_table; 2704 spinlock_t *ptl; 2705 struct page *page; 2706 pte_t entry; 2707 int anon = 0; 2708 int charged = 0; 2709 struct page *dirty_page = NULL; 2710 struct vm_fault vmf; 2711 int ret; 2712 int page_mkwrite = 0; 2713 2714 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 2715 vmf.pgoff = pgoff; 2716 vmf.flags = flags; 2717 vmf.page = NULL; 2718 2719 ret = vma->vm_ops->fault(vma, &vmf); 2720 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2721 return ret; 2722 2723 if (unlikely(PageHWPoison(vmf.page))) { 2724 if (ret & VM_FAULT_LOCKED) 2725 unlock_page(vmf.page); 2726 return VM_FAULT_HWPOISON; 2727 } 2728 2729 /* 2730 * For consistency in subsequent calls, make the faulted page always 2731 * locked. 2732 */ 2733 if (unlikely(!(ret & VM_FAULT_LOCKED))) 2734 lock_page(vmf.page); 2735 else 2736 VM_BUG_ON(!PageLocked(vmf.page)); 2737 2738 /* 2739 * Should we do an early C-O-W break? 2740 */ 2741 page = vmf.page; 2742 if (flags & FAULT_FLAG_WRITE) { 2743 if (!(vma->vm_flags & VM_SHARED)) { 2744 anon = 1; 2745 if (unlikely(anon_vma_prepare(vma))) { 2746 ret = VM_FAULT_OOM; 2747 goto out; 2748 } 2749 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, 2750 vma, address); 2751 if (!page) { 2752 ret = VM_FAULT_OOM; 2753 goto out; 2754 } 2755 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { 2756 ret = VM_FAULT_OOM; 2757 page_cache_release(page); 2758 goto out; 2759 } 2760 charged = 1; 2761 /* 2762 * Don't let another task, with possibly unlocked vma, 2763 * keep the mlocked page. 2764 */ 2765 if (vma->vm_flags & VM_LOCKED) 2766 clear_page_mlock(vmf.page); 2767 copy_user_highpage(page, vmf.page, address, vma); 2768 __SetPageUptodate(page); 2769 } else { 2770 /* 2771 * If the page will be shareable, see if the backing 2772 * address space wants to know that the page is about 2773 * to become writable 2774 */ 2775 if (vma->vm_ops->page_mkwrite) { 2776 int tmp; 2777 2778 unlock_page(page); 2779 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; 2780 tmp = vma->vm_ops->page_mkwrite(vma, &vmf); 2781 if (unlikely(tmp & 2782 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { 2783 ret = tmp; 2784 goto unwritable_page; 2785 } 2786 if (unlikely(!(tmp & VM_FAULT_LOCKED))) { 2787 lock_page(page); 2788 if (!page->mapping) { 2789 ret = 0; /* retry the fault */ 2790 unlock_page(page); 2791 goto unwritable_page; 2792 } 2793 } else 2794 VM_BUG_ON(!PageLocked(page)); 2795 page_mkwrite = 1; 2796 } 2797 } 2798 2799 } 2800 2801 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2802 2803 /* 2804 * This silly early PAGE_DIRTY setting removes a race 2805 * due to the bad i386 page protection. But it's valid 2806 * for other architectures too. 2807 * 2808 * Note that if FAULT_FLAG_WRITE is set, we either now have 2809 * an exclusive copy of the page, or this is a shared mapping, 2810 * so we can make it writable and dirty to avoid having to 2811 * handle that later. 2812 */ 2813 /* Only go through if we didn't race with anybody else... 
*/ 2814 if (likely(pte_same(*page_table, orig_pte))) { 2815 flush_icache_page(vma, page); 2816 entry = mk_pte(page, vma->vm_page_prot); 2817 if (flags & FAULT_FLAG_WRITE) 2818 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2819 if (anon) { 2820 inc_mm_counter(mm, anon_rss); 2821 page_add_new_anon_rmap(page, vma, address); 2822 } else { 2823 inc_mm_counter(mm, file_rss); 2824 page_add_file_rmap(page); 2825 if (flags & FAULT_FLAG_WRITE) { 2826 dirty_page = page; 2827 get_page(dirty_page); 2828 } 2829 } 2830 set_pte_at(mm, address, page_table, entry); 2831 2832 /* no need to invalidate: a not-present page won't be cached */ 2833 update_mmu_cache(vma, address, entry); 2834 } else { 2835 if (charged) 2836 mem_cgroup_uncharge_page(page); 2837 if (anon) 2838 page_cache_release(page); 2839 else 2840 anon = 1; /* no anon but release faulted_page */ 2841 } 2842 2843 pte_unmap_unlock(page_table, ptl); 2844 2845 out: 2846 if (dirty_page) { 2847 struct address_space *mapping = page->mapping; 2848 2849 if (set_page_dirty(dirty_page)) 2850 page_mkwrite = 1; 2851 unlock_page(dirty_page); 2852 put_page(dirty_page); 2853 if (page_mkwrite && mapping) { 2854 /* 2855 * Some device drivers do not set page.mapping but still 2856 * dirty their pages 2857 */ 2858 balance_dirty_pages_ratelimited(mapping); 2859 } 2860 2861 /* file_update_time outside page_lock */ 2862 if (vma->vm_file) 2863 file_update_time(vma->vm_file); 2864 } else { 2865 unlock_page(vmf.page); 2866 if (anon) 2867 page_cache_release(vmf.page); 2868 } 2869 2870 return ret; 2871 2872 unwritable_page: 2873 page_cache_release(page); 2874 return ret; 2875 } 2876 2877 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2878 unsigned long address, pte_t *page_table, pmd_t *pmd, 2879 unsigned int flags, pte_t orig_pte) 2880 { 2881 pgoff_t pgoff = (((address & PAGE_MASK) 2882 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 2883 2884 pte_unmap(page_table); 2885 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2886 } 2887 2888 /* 2889 * Fault of a previously existing named mapping. Repopulate the pte 2890 * from the encoded file_pte if possible. This enables swappable 2891 * nonlinear vmas. 2892 * 2893 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2894 * but allow concurrent faults), and pte mapped but not yet locked. 2895 * We return with mmap_sem still held, but pte unmapped and unlocked. 2896 */ 2897 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2898 unsigned long address, pte_t *page_table, pmd_t *pmd, 2899 unsigned int flags, pte_t orig_pte) 2900 { 2901 pgoff_t pgoff; 2902 2903 flags |= FAULT_FLAG_NONLINEAR; 2904 2905 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2906 return 0; 2907 2908 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { 2909 /* 2910 * Page table corrupted: show pte and kill process. 2911 */ 2912 print_bad_pte(vma, address, orig_pte, NULL); 2913 return VM_FAULT_OOM; 2914 } 2915 2916 pgoff = pte_to_pgoff(orig_pte); 2917 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2918 } 2919 2920 /* 2921 * These routines also need to handle stuff like marking pages dirty 2922 * and/or accessed for architectures that don't do it in hardware (most 2923 * RISC architectures). The early dirtying is also good on the i386. 2924 * 2925 * There is also a hook called "update_mmu_cache()" that architectures 2926 * with external mmu caches can use to update those (ie the Sparc or 2927 * PowerPC hashed page tables that act as extended TLBs). 
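 *
 * handle_pte_fault() below dispatches on the pte it saw: a none pte
 * goes to do_linear_fault() (if the vma has a ->fault method) or
 * do_anonymous_page(); a file pte goes to do_nonlinear_fault(); any
 * other non-present pte is a swap entry, handled by do_swap_page();
 * and a write fault on a present but read-only pte ends up in
 * do_wp_page().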
2928 * 2929 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2930 * but allow concurrent faults), and pte mapped but not yet locked. 2931 * We return with mmap_sem still held, but pte unmapped and unlocked. 2932 */ 2933 static inline int handle_pte_fault(struct mm_struct *mm, 2934 struct vm_area_struct *vma, unsigned long address, 2935 pte_t *pte, pmd_t *pmd, unsigned int flags) 2936 { 2937 pte_t entry; 2938 spinlock_t *ptl; 2939 2940 entry = *pte; 2941 if (!pte_present(entry)) { 2942 if (pte_none(entry)) { 2943 if (vma->vm_ops) { 2944 if (likely(vma->vm_ops->fault)) 2945 return do_linear_fault(mm, vma, address, 2946 pte, pmd, flags, entry); 2947 } 2948 return do_anonymous_page(mm, vma, address, 2949 pte, pmd, flags); 2950 } 2951 if (pte_file(entry)) 2952 return do_nonlinear_fault(mm, vma, address, 2953 pte, pmd, flags, entry); 2954 return do_swap_page(mm, vma, address, 2955 pte, pmd, flags, entry); 2956 } 2957 2958 ptl = pte_lockptr(mm, pmd); 2959 spin_lock(ptl); 2960 if (unlikely(!pte_same(*pte, entry))) 2961 goto unlock; 2962 if (flags & FAULT_FLAG_WRITE) { 2963 if (!pte_write(entry)) 2964 return do_wp_page(mm, vma, address, 2965 pte, pmd, ptl, entry); 2966 entry = pte_mkdirty(entry); 2967 } 2968 entry = pte_mkyoung(entry); 2969 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { 2970 update_mmu_cache(vma, address, entry); 2971 } else { 2972 /* 2973 * This is needed only for protection faults but the arch code 2974 * is not yet telling us if this is a protection fault or not. 2975 * This still avoids useless tlb flushes for .text page faults 2976 * with threads. 2977 */ 2978 if (flags & FAULT_FLAG_WRITE) 2979 flush_tlb_page(vma, address); 2980 } 2981 unlock: 2982 pte_unmap_unlock(pte, ptl); 2983 return 0; 2984 } 2985 2986 /* 2987 * By the time we get here, we already hold the mm semaphore 2988 */ 2989 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2990 unsigned long address, unsigned int flags) 2991 { 2992 pgd_t *pgd; 2993 pud_t *pud; 2994 pmd_t *pmd; 2995 pte_t *pte; 2996 2997 __set_current_state(TASK_RUNNING); 2998 2999 count_vm_event(PGFAULT); 3000 3001 if (unlikely(is_vm_hugetlb_page(vma))) 3002 return hugetlb_fault(mm, vma, address, flags); 3003 3004 pgd = pgd_offset(mm, address); 3005 pud = pud_alloc(mm, pgd, address); 3006 if (!pud) 3007 return VM_FAULT_OOM; 3008 pmd = pmd_alloc(mm, pud, address); 3009 if (!pmd) 3010 return VM_FAULT_OOM; 3011 pte = pte_alloc_map(mm, pmd, address); 3012 if (!pte) 3013 return VM_FAULT_OOM; 3014 3015 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3016 } 3017 3018 #ifndef __PAGETABLE_PUD_FOLDED 3019 /* 3020 * Allocate page upper directory. 3021 * We've already handled the fast-path in-line. 3022 */ 3023 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 3024 { 3025 pud_t *new = pud_alloc_one(mm, address); 3026 if (!new) 3027 return -ENOMEM; 3028 3029 smp_wmb(); /* See comment in __pte_alloc */ 3030 3031 spin_lock(&mm->page_table_lock); 3032 if (pgd_present(*pgd)) /* Another has populated it */ 3033 pud_free(mm, new); 3034 else 3035 pgd_populate(mm, pgd, new); 3036 spin_unlock(&mm->page_table_lock); 3037 return 0; 3038 } 3039 #endif /* __PAGETABLE_PUD_FOLDED */ 3040 3041 #ifndef __PAGETABLE_PMD_FOLDED 3042 /* 3043 * Allocate page middle directory. 3044 * We've already handled the fast-path in-line. 
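 * mm->page_table_lock serialises the slow path here: if another thread
 * populated the pud while we were allocating, we free our new pmd and
 * use theirs instead.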
3045 */ 3046 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 3047 { 3048 pmd_t *new = pmd_alloc_one(mm, address); 3049 if (!new) 3050 return -ENOMEM; 3051 3052 smp_wmb(); /* See comment in __pte_alloc */ 3053 3054 spin_lock(&mm->page_table_lock); 3055 #ifndef __ARCH_HAS_4LEVEL_HACK 3056 if (pud_present(*pud)) /* Another has populated it */ 3057 pmd_free(mm, new); 3058 else 3059 pud_populate(mm, pud, new); 3060 #else 3061 if (pgd_present(*pud)) /* Another has populated it */ 3062 pmd_free(mm, new); 3063 else 3064 pgd_populate(mm, pud, new); 3065 #endif /* __ARCH_HAS_4LEVEL_HACK */ 3066 spin_unlock(&mm->page_table_lock); 3067 return 0; 3068 } 3069 #endif /* __PAGETABLE_PMD_FOLDED */ 3070 3071 int make_pages_present(unsigned long addr, unsigned long end) 3072 { 3073 int ret, len, write; 3074 struct vm_area_struct * vma; 3075 3076 vma = find_vma(current->mm, addr); 3077 if (!vma) 3078 return -ENOMEM; 3079 write = (vma->vm_flags & VM_WRITE) != 0; 3080 BUG_ON(addr >= end); 3081 BUG_ON(end > vma->vm_end); 3082 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 3083 ret = get_user_pages(current, current->mm, addr, 3084 len, write, 0, NULL, NULL); 3085 if (ret < 0) 3086 return ret; 3087 return ret == len ? 0 : -EFAULT; 3088 } 3089 3090 #if !defined(__HAVE_ARCH_GATE_AREA) 3091 3092 #if defined(AT_SYSINFO_EHDR) 3093 static struct vm_area_struct gate_vma; 3094 3095 static int __init gate_vma_init(void) 3096 { 3097 gate_vma.vm_mm = NULL; 3098 gate_vma.vm_start = FIXADDR_USER_START; 3099 gate_vma.vm_end = FIXADDR_USER_END; 3100 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; 3101 gate_vma.vm_page_prot = __P101; 3102 /* 3103 * Make sure the vDSO gets into every core dump. 3104 * Dumping its contents makes post-mortem fully interpretable later 3105 * without matching up the same kernel and hardware config to see 3106 * what PC values meant. 3107 */ 3108 gate_vma.vm_flags |= VM_ALWAYSDUMP; 3109 return 0; 3110 } 3111 __initcall(gate_vma_init); 3112 #endif 3113 3114 struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 3115 { 3116 #ifdef AT_SYSINFO_EHDR 3117 return &gate_vma; 3118 #else 3119 return NULL; 3120 #endif 3121 } 3122 3123 int in_gate_area_no_task(unsigned long addr) 3124 { 3125 #ifdef AT_SYSINFO_EHDR 3126 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) 3127 return 1; 3128 #endif 3129 return 0; 3130 } 3131 3132 #endif /* __HAVE_ARCH_GATE_AREA */ 3133 3134 static int follow_pte(struct mm_struct *mm, unsigned long address, 3135 pte_t **ptepp, spinlock_t **ptlp) 3136 { 3137 pgd_t *pgd; 3138 pud_t *pud; 3139 pmd_t *pmd; 3140 pte_t *ptep; 3141 3142 pgd = pgd_offset(mm, address); 3143 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 3144 goto out; 3145 3146 pud = pud_offset(pgd, address); 3147 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 3148 goto out; 3149 3150 pmd = pmd_offset(pud, address); 3151 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 3152 goto out; 3153 3154 /* We cannot handle huge page PFN maps. Luckily they don't exist. 
*/ 3155 if (pmd_huge(*pmd)) 3156 goto out; 3157 3158 ptep = pte_offset_map_lock(mm, pmd, address, ptlp); 3159 if (!ptep) 3160 goto out; 3161 if (!pte_present(*ptep)) 3162 goto unlock; 3163 *ptepp = ptep; 3164 return 0; 3165 unlock: 3166 pte_unmap_unlock(ptep, *ptlp); 3167 out: 3168 return -EINVAL; 3169 } 3170 3171 /** 3172 * follow_pfn - look up PFN at a user virtual address 3173 * @vma: memory mapping 3174 * @address: user virtual address 3175 * @pfn: location to store found PFN 3176 * 3177 * Only IO mappings and raw PFN mappings are allowed. 3178 * 3179 * Returns zero and the pfn at @pfn on success, -ve otherwise. 3180 */ 3181 int follow_pfn(struct vm_area_struct *vma, unsigned long address, 3182 unsigned long *pfn) 3183 { 3184 int ret = -EINVAL; 3185 spinlock_t *ptl; 3186 pte_t *ptep; 3187 3188 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 3189 return ret; 3190 3191 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); 3192 if (ret) 3193 return ret; 3194 *pfn = pte_pfn(*ptep); 3195 pte_unmap_unlock(ptep, ptl); 3196 return 0; 3197 } 3198 EXPORT_SYMBOL(follow_pfn); 3199 3200 #ifdef CONFIG_HAVE_IOREMAP_PROT 3201 int follow_phys(struct vm_area_struct *vma, 3202 unsigned long address, unsigned int flags, 3203 unsigned long *prot, resource_size_t *phys) 3204 { 3205 int ret = -EINVAL; 3206 pte_t *ptep, pte; 3207 spinlock_t *ptl; 3208 3209 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 3210 goto out; 3211 3212 if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) 3213 goto out; 3214 pte = *ptep; 3215 3216 if ((flags & FOLL_WRITE) && !pte_write(pte)) 3217 goto unlock; 3218 3219 *prot = pgprot_val(pte_pgprot(pte)); 3220 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; 3221 3222 ret = 0; 3223 unlock: 3224 pte_unmap_unlock(ptep, ptl); 3225 out: 3226 return ret; 3227 } 3228 3229 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, 3230 void *buf, int len, int write) 3231 { 3232 resource_size_t phys_addr; 3233 unsigned long prot = 0; 3234 void __iomem *maddr; 3235 int offset = addr & (PAGE_SIZE-1); 3236 3237 if (follow_phys(vma, addr, write, &prot, &phys_addr)) 3238 return -EINVAL; 3239 3240 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); 3241 if (write) 3242 memcpy_toio(maddr + offset, buf, len); 3243 else 3244 memcpy_fromio(buf, maddr + offset, len); 3245 iounmap(maddr); 3246 3247 return len; 3248 } 3249 #endif 3250 3251 /* 3252 * Access another process' address space. 3253 * Source/target buffer must be kernel space, 3254 * Do not walk the page table directly, use get_user_pages 3255 */ 3256 int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) 3257 { 3258 struct mm_struct *mm; 3259 struct vm_area_struct *vma; 3260 void *old_buf = buf; 3261 3262 mm = get_task_mm(tsk); 3263 if (!mm) 3264 return 0; 3265 3266 down_read(&mm->mmap_sem); 3267 /* ignore errors, just check how much was successfully transferred */ 3268 while (len) { 3269 int bytes, ret, offset; 3270 void *maddr; 3271 struct page *page = NULL; 3272 3273 ret = get_user_pages(tsk, mm, addr, 1, 3274 write, 1, &page, &vma); 3275 if (ret <= 0) { 3276 /* 3277 * Check if this is a VM_IO | VM_PFNMAP VMA, which 3278 * we can access using slightly different code. 
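 * Such mappings have no struct page behind them, so we fall back to the
 * vma's ->access method (e.g. generic_access_phys() above for
 * ioremap-style mappings) rather than get_user_pages().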
3279 */ 3280 #ifdef CONFIG_HAVE_IOREMAP_PROT 3281 vma = find_vma(mm, addr); 3282 if (!vma) 3283 break; 3284 if (vma->vm_ops && vma->vm_ops->access) 3285 ret = vma->vm_ops->access(vma, addr, buf, 3286 len, write); 3287 if (ret <= 0) 3288 #endif 3289 break; 3290 bytes = ret; 3291 } else { 3292 bytes = len; 3293 offset = addr & (PAGE_SIZE-1); 3294 if (bytes > PAGE_SIZE-offset) 3295 bytes = PAGE_SIZE-offset; 3296 3297 maddr = kmap(page); 3298 if (write) { 3299 copy_to_user_page(vma, page, addr, 3300 maddr + offset, buf, bytes); 3301 set_page_dirty_lock(page); 3302 } else { 3303 copy_from_user_page(vma, page, addr, 3304 buf, maddr + offset, bytes); 3305 } 3306 kunmap(page); 3307 page_cache_release(page); 3308 } 3309 len -= bytes; 3310 buf += bytes; 3311 addr += bytes; 3312 } 3313 up_read(&mm->mmap_sem); 3314 mmput(mm); 3315 3316 return buf - old_buf; 3317 } 3318 3319 /* 3320 * Print the name of a VMA. 3321 */ 3322 void print_vma_addr(char *prefix, unsigned long ip) 3323 { 3324 struct mm_struct *mm = current->mm; 3325 struct vm_area_struct *vma; 3326 3327 /* 3328 * Do not print if we are in atomic 3329 * contexts (in exception stacks, etc.): 3330 */ 3331 if (preempt_count()) 3332 return; 3333 3334 down_read(&mm->mmap_sem); 3335 vma = find_vma(mm, ip); 3336 if (vma && vma->vm_file) { 3337 struct file *f = vma->vm_file; 3338 char *buf = (char *)__get_free_page(GFP_KERNEL); 3339 if (buf) { 3340 char *p, *s; 3341 3342 p = d_path(&f->f_path, buf, PAGE_SIZE); 3343 if (IS_ERR(p)) 3344 p = "?"; 3345 s = strrchr(p, '/'); 3346 if (s) 3347 p = s+1; 3348 printk("%s%s[%lx+%lx]", prefix, p, 3349 vma->vm_start, 3350 vma->vm_end - vma->vm_start); 3351 free_page((unsigned long)buf); 3352 } 3353 } 3354 up_read(¤t->mm->mmap_sem); 3355 } 3356 3357 #ifdef CONFIG_PROVE_LOCKING 3358 void might_fault(void) 3359 { 3360 /* 3361 * Some code (nfs/sunrpc) uses socket ops on kernel memory while 3362 * holding the mmap_sem, this is safe because kernel memory doesn't 3363 * get paged out, therefore we'll never actually fault, and the 3364 * below annotations will generate false positives. 3365 */ 3366 if (segment_eq(get_fs(), KERNEL_DS)) 3367 return; 3368 3369 might_sleep(); 3370 /* 3371 * it would be nicer only to annotate paths which are not under 3372 * pagefault_disable, however that requires a larger audit and 3373 * providing helpers like get_user_atomic. 3374 */ 3375 if (!in_atomic() && current->mm) 3376 might_lock_read(¤t->mm->mmap_sem); 3377 } 3378 EXPORT_SYMBOL(might_fault); 3379 #endif 3380