1 /* 2 * mm/mmap.c 3 * 4 * Written by obz. 5 * 6 * Address space accounting code <alan@lxorguk.ukuu.org.uk> 7 */ 8 9 #include <linux/kernel.h> 10 #include <linux/slab.h> 11 #include <linux/backing-dev.h> 12 #include <linux/mm.h> 13 #include <linux/shm.h> 14 #include <linux/mman.h> 15 #include <linux/pagemap.h> 16 #include <linux/swap.h> 17 #include <linux/syscalls.h> 18 #include <linux/capability.h> 19 #include <linux/init.h> 20 #include <linux/file.h> 21 #include <linux/fs.h> 22 #include <linux/personality.h> 23 #include <linux/security.h> 24 #include <linux/hugetlb.h> 25 #include <linux/profile.h> 26 #include <linux/export.h> 27 #include <linux/mount.h> 28 #include <linux/mempolicy.h> 29 #include <linux/rmap.h> 30 #include <linux/mmu_notifier.h> 31 #include <linux/perf_event.h> 32 #include <linux/audit.h> 33 #include <linux/khugepaged.h> 34 #include <linux/uprobes.h> 35 #include <linux/rbtree_augmented.h> 36 #include <linux/sched/sysctl.h> 37 #include <linux/notifier.h> 38 #include <linux/memory.h> 39 40 #include <asm/uaccess.h> 41 #include <asm/cacheflush.h> 42 #include <asm/tlb.h> 43 #include <asm/mmu_context.h> 44 45 #include "internal.h" 46 47 #ifndef arch_mmap_check 48 #define arch_mmap_check(addr, len, flags) (0) 49 #endif 50 51 #ifndef arch_rebalance_pgtables 52 #define arch_rebalance_pgtables(addr, len) (addr) 53 #endif 54 55 static void unmap_region(struct mm_struct *mm, 56 struct vm_area_struct *vma, struct vm_area_struct *prev, 57 unsigned long start, unsigned long end); 58 59 /* description of effects of mapping type and prot in current implementation. 60 * this is due to the limited x86 page protection hardware. The expected 61 * behavior is in parens: 62 * 63 * map_type prot 64 * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC 65 * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes 66 * w: (no) no w: (no) no w: (yes) yes w: (no) no 67 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 68 * 69 * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes 70 * w: (no) no w: (no) no w: (copy) copy w: (no) no 71 * x: (no) no x: (no) yes x: (no) yes x: (yes) yes 72 * 73 */ 74 pgprot_t protection_map[16] = { 75 __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, 76 __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 77 }; 78 79 pgprot_t vm_get_page_prot(unsigned long vm_flags) 80 { 81 return __pgprot(pgprot_val(protection_map[vm_flags & 82 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) | 83 pgprot_val(arch_vm_get_page_prot(vm_flags))); 84 } 85 EXPORT_SYMBOL(vm_get_page_prot); 86 87 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ 88 int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ 89 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 90 unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ 91 unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ 92 /* 93 * Make sure vm_committed_as in one cacheline and not cacheline shared with 94 * other variables. It can be updated by several CPUs frequently. 95 */ 96 struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; 97 98 /* 99 * The global memory commitment made in the system can be a metric 100 * that can be used to drive ballooning decisions when Linux is hosted 101 * as a guest. On Hyper-V, the host implements a policy engine for dynamically 102 * balancing memory across competing virtual machines that are hosted. 
103 * Several metrics drive this policy engine including the guest reported 104 * memory commitment. 105 */ 106 unsigned long vm_memory_committed(void) 107 { 108 return percpu_counter_read_positive(&vm_committed_as); 109 } 110 EXPORT_SYMBOL_GPL(vm_memory_committed); 111 112 /* 113 * Check that a process has enough memory to allocate a new virtual 114 * mapping. 0 means there is enough memory for the allocation to 115 * succeed and -ENOMEM implies there is not. 116 * 117 * We currently support three overcommit policies, which are set via the 118 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting 119 * 120 * Strict overcommit modes added 2002 Feb 26 by Alan Cox. 121 * Additional code 2002 Jul 20 by Robert Love. 122 * 123 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. 124 * 125 * Note this is a helper function intended to be used by LSMs which 126 * wish to use this logic. 127 */ 128 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 129 { 130 unsigned long free, allowed, reserve; 131 132 vm_acct_memory(pages); 133 134 /* 135 * Sometimes we want to use more memory than we have 136 */ 137 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) 138 return 0; 139 140 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 141 free = global_page_state(NR_FREE_PAGES); 142 free += global_page_state(NR_FILE_PAGES); 143 144 /* 145 * shmem pages shouldn't be counted as free in this 146 * case, they can't be purged, only swapped out, and 147 * that won't affect the overall amount of available 148 * memory in the system. 149 */ 150 free -= global_page_state(NR_SHMEM); 151 152 free += get_nr_swap_pages(); 153 154 /* 155 * Any slabs which are created with the 156 * SLAB_RECLAIM_ACCOUNT flag claim to have contents 157 * which are reclaimable, under pressure. The dentry 158 * cache and most inode caches should fall into this 159 */ 160 free += global_page_state(NR_SLAB_RECLAIMABLE); 161 162 /* 163 * Leave reserved pages. The pages are not for anonymous pages. 
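 *
 * As an illustrative recap of the OVERCOMMIT_GUESS estimate built
 * up above (all quantities are in pages):
 *
 *	free = NR_FREE_PAGES + NR_FILE_PAGES - NR_SHMEM
 *	     + get_nr_swap_pages() + NR_SLAB_RECLAIMABLE
 *
 * The request for 'pages' then succeeds only if free, minus
 * totalreserve_pages and (for callers without cap_sys_admin) the
 * admin reserve, is still larger than 'pages'.
 *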
164 */ 165 if (free <= totalreserve_pages) 166 goto error; 167 else 168 free -= totalreserve_pages; 169 170 /* 171 * Reserve some for root 172 */ 173 if (!cap_sys_admin) 174 free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 175 176 if (free > pages) 177 return 0; 178 179 goto error; 180 } 181 182 allowed = vm_commit_limit(); 183 /* 184 * Reserve some for root 185 */ 186 if (!cap_sys_admin) 187 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 188 189 /* 190 * Don't let a single process grow so big a user can't recover 191 */ 192 if (mm) { 193 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); 194 allowed -= min(mm->total_vm / 32, reserve); 195 } 196 197 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 198 return 0; 199 error: 200 vm_unacct_memory(pages); 201 202 return -ENOMEM; 203 } 204 205 /* 206 * Requires inode->i_mapping->i_mmap_mutex 207 */ 208 static void __remove_shared_vm_struct(struct vm_area_struct *vma, 209 struct file *file, struct address_space *mapping) 210 { 211 if (vma->vm_flags & VM_DENYWRITE) 212 atomic_inc(&file_inode(file)->i_writecount); 213 if (vma->vm_flags & VM_SHARED) 214 mapping->i_mmap_writable--; 215 216 flush_dcache_mmap_lock(mapping); 217 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 218 list_del_init(&vma->shared.nonlinear); 219 else 220 vma_interval_tree_remove(vma, &mapping->i_mmap); 221 flush_dcache_mmap_unlock(mapping); 222 } 223 224 /* 225 * Unlink a file-based vm structure from its interval tree, to hide 226 * vma from rmap and vmtruncate before freeing its page tables. 227 */ 228 void unlink_file_vma(struct vm_area_struct *vma) 229 { 230 struct file *file = vma->vm_file; 231 232 if (file) { 233 struct address_space *mapping = file->f_mapping; 234 mutex_lock(&mapping->i_mmap_mutex); 235 __remove_shared_vm_struct(vma, file, mapping); 236 mutex_unlock(&mapping->i_mmap_mutex); 237 } 238 } 239 240 /* 241 * Close a vm structure and free it, returning the next. 242 */ 243 static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) 244 { 245 struct vm_area_struct *next = vma->vm_next; 246 247 might_sleep(); 248 if (vma->vm_ops && vma->vm_ops->close) 249 vma->vm_ops->close(vma); 250 if (vma->vm_file) 251 fput(vma->vm_file); 252 mpol_put(vma_policy(vma)); 253 kmem_cache_free(vm_area_cachep, vma); 254 return next; 255 } 256 257 static unsigned long do_brk(unsigned long addr, unsigned long len); 258 259 SYSCALL_DEFINE1(brk, unsigned long, brk) 260 { 261 unsigned long rlim, retval; 262 unsigned long newbrk, oldbrk; 263 struct mm_struct *mm = current->mm; 264 unsigned long min_brk; 265 bool populate; 266 267 down_write(&mm->mmap_sem); 268 269 #ifdef CONFIG_COMPAT_BRK 270 /* 271 * CONFIG_COMPAT_BRK can still be overridden by setting 272 * randomize_va_space to 2, which will still cause mm->start_brk 273 * to be arbitrarily shifted 274 */ 275 if (current->brk_randomized) 276 min_brk = mm->start_brk; 277 else 278 min_brk = mm->end_data; 279 #else 280 min_brk = mm->start_brk; 281 #endif 282 if (brk < min_brk) 283 goto out; 284 285 /* 286 * Check against rlimit here. 
If this check is done later after the test 287 * of oldbrk with newbrk then it can escape the test and let the data 288 * segment grow beyond its set limit the in case where the limit is 289 * not page aligned -Ram Gupta 290 */ 291 rlim = rlimit(RLIMIT_DATA); 292 if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + 293 (mm->end_data - mm->start_data) > rlim) 294 goto out; 295 296 newbrk = PAGE_ALIGN(brk); 297 oldbrk = PAGE_ALIGN(mm->brk); 298 if (oldbrk == newbrk) 299 goto set_brk; 300 301 /* Always allow shrinking brk. */ 302 if (brk <= mm->brk) { 303 if (!do_munmap(mm, newbrk, oldbrk-newbrk)) 304 goto set_brk; 305 goto out; 306 } 307 308 /* Check against existing mmap mappings. */ 309 if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) 310 goto out; 311 312 /* Ok, looks good - let it rip. */ 313 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) 314 goto out; 315 316 set_brk: 317 mm->brk = brk; 318 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; 319 up_write(&mm->mmap_sem); 320 if (populate) 321 mm_populate(oldbrk, newbrk - oldbrk); 322 return brk; 323 324 out: 325 retval = mm->brk; 326 up_write(&mm->mmap_sem); 327 return retval; 328 } 329 330 static long vma_compute_subtree_gap(struct vm_area_struct *vma) 331 { 332 unsigned long max, subtree_gap; 333 max = vma->vm_start; 334 if (vma->vm_prev) 335 max -= vma->vm_prev->vm_end; 336 if (vma->vm_rb.rb_left) { 337 subtree_gap = rb_entry(vma->vm_rb.rb_left, 338 struct vm_area_struct, vm_rb)->rb_subtree_gap; 339 if (subtree_gap > max) 340 max = subtree_gap; 341 } 342 if (vma->vm_rb.rb_right) { 343 subtree_gap = rb_entry(vma->vm_rb.rb_right, 344 struct vm_area_struct, vm_rb)->rb_subtree_gap; 345 if (subtree_gap > max) 346 max = subtree_gap; 347 } 348 return max; 349 } 350 351 #ifdef CONFIG_DEBUG_VM_RB 352 static int browse_rb(struct rb_root *root) 353 { 354 int i = 0, j, bug = 0; 355 struct rb_node *nd, *pn = NULL; 356 unsigned long prev = 0, pend = 0; 357 358 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 359 struct vm_area_struct *vma; 360 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 361 if (vma->vm_start < prev) { 362 printk("vm_start %lx prev %lx\n", vma->vm_start, prev); 363 bug = 1; 364 } 365 if (vma->vm_start < pend) { 366 printk("vm_start %lx pend %lx\n", vma->vm_start, pend); 367 bug = 1; 368 } 369 if (vma->vm_start > vma->vm_end) { 370 printk("vm_end %lx < vm_start %lx\n", 371 vma->vm_end, vma->vm_start); 372 bug = 1; 373 } 374 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { 375 printk("free gap %lx, correct %lx\n", 376 vma->rb_subtree_gap, 377 vma_compute_subtree_gap(vma)); 378 bug = 1; 379 } 380 i++; 381 pn = nd; 382 prev = vma->vm_start; 383 pend = vma->vm_end; 384 } 385 j = 0; 386 for (nd = pn; nd; nd = rb_prev(nd)) 387 j++; 388 if (i != j) { 389 printk("backwards %d, forwards %d\n", j, i); 390 bug = 1; 391 } 392 return bug ? 
-1 : i; 393 } 394 395 static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) 396 { 397 struct rb_node *nd; 398 399 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 400 struct vm_area_struct *vma; 401 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 402 BUG_ON(vma != ignore && 403 vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); 404 } 405 } 406 407 void validate_mm(struct mm_struct *mm) 408 { 409 int bug = 0; 410 int i = 0; 411 unsigned long highest_address = 0; 412 struct vm_area_struct *vma = mm->mmap; 413 while (vma) { 414 struct anon_vma_chain *avc; 415 vma_lock_anon_vma(vma); 416 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 417 anon_vma_interval_tree_verify(avc); 418 vma_unlock_anon_vma(vma); 419 highest_address = vma->vm_end; 420 vma = vma->vm_next; 421 i++; 422 } 423 if (i != mm->map_count) { 424 printk("map_count %d vm_next %d\n", mm->map_count, i); 425 bug = 1; 426 } 427 if (highest_address != mm->highest_vm_end) { 428 printk("mm->highest_vm_end %lx, found %lx\n", 429 mm->highest_vm_end, highest_address); 430 bug = 1; 431 } 432 i = browse_rb(&mm->mm_rb); 433 if (i != mm->map_count) { 434 printk("map_count %d rb %d\n", mm->map_count, i); 435 bug = 1; 436 } 437 BUG_ON(bug); 438 } 439 #else 440 #define validate_mm_rb(root, ignore) do { } while (0) 441 #define validate_mm(mm) do { } while (0) 442 #endif 443 444 RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, 445 unsigned long, rb_subtree_gap, vma_compute_subtree_gap) 446 447 /* 448 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or 449 * vma->vm_prev->vm_end values changed, without modifying the vma's position 450 * in the rbtree. 451 */ 452 static void vma_gap_update(struct vm_area_struct *vma) 453 { 454 /* 455 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback 456 * function that does exacltly what we want. 457 */ 458 vma_gap_callbacks_propagate(&vma->vm_rb, NULL); 459 } 460 461 static inline void vma_rb_insert(struct vm_area_struct *vma, 462 struct rb_root *root) 463 { 464 /* All rb_subtree_gap values must be consistent prior to insertion */ 465 validate_mm_rb(root, NULL); 466 467 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); 468 } 469 470 static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) 471 { 472 /* 473 * All rb_subtree_gap values must be consistent prior to erase, 474 * with the possible exception of the vma being erased. 475 */ 476 validate_mm_rb(root, vma); 477 478 /* 479 * Note rb_erase_augmented is a fairly large inline function, 480 * so make sure we instantiate it only once with our desired 481 * augmented rbtree callbacks. 482 */ 483 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); 484 } 485 486 /* 487 * vma has some anon_vma assigned, and is already inserted on that 488 * anon_vma's interval trees. 489 * 490 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the 491 * vma must be removed from the anon_vma's interval trees using 492 * anon_vma_interval_tree_pre_update_vma(). 493 * 494 * After the update, the vma will be reinserted using 495 * anon_vma_interval_tree_post_update_vma(). 496 * 497 * The entire update must be protected by exclusive mmap_sem and by 498 * the root anon_vma's mutex. 
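 *
 * A minimal sketch of the calling pattern, assuming the locks
 * described above are already held (new_start and new_pgoff are
 * placeholders, not fields used elsewhere):
 *
 *	anon_vma_interval_tree_pre_update_vma(vma);
 *	vma->vm_start = new_start;
 *	vma->vm_pgoff = new_pgoff;
 *	anon_vma_interval_tree_post_update_vma(vma);
 *
 * vma_adjust() and the stack expansion paths later in this file
 * follow this pattern.
 *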
499 */ 500 static inline void 501 anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) 502 { 503 struct anon_vma_chain *avc; 504 505 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 506 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); 507 } 508 509 static inline void 510 anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) 511 { 512 struct anon_vma_chain *avc; 513 514 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 515 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); 516 } 517 518 static int find_vma_links(struct mm_struct *mm, unsigned long addr, 519 unsigned long end, struct vm_area_struct **pprev, 520 struct rb_node ***rb_link, struct rb_node **rb_parent) 521 { 522 struct rb_node **__rb_link, *__rb_parent, *rb_prev; 523 524 __rb_link = &mm->mm_rb.rb_node; 525 rb_prev = __rb_parent = NULL; 526 527 while (*__rb_link) { 528 struct vm_area_struct *vma_tmp; 529 530 __rb_parent = *__rb_link; 531 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); 532 533 if (vma_tmp->vm_end > addr) { 534 /* Fail if an existing vma overlaps the area */ 535 if (vma_tmp->vm_start < end) 536 return -ENOMEM; 537 __rb_link = &__rb_parent->rb_left; 538 } else { 539 rb_prev = __rb_parent; 540 __rb_link = &__rb_parent->rb_right; 541 } 542 } 543 544 *pprev = NULL; 545 if (rb_prev) 546 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 547 *rb_link = __rb_link; 548 *rb_parent = __rb_parent; 549 return 0; 550 } 551 552 static unsigned long count_vma_pages_range(struct mm_struct *mm, 553 unsigned long addr, unsigned long end) 554 { 555 unsigned long nr_pages = 0; 556 struct vm_area_struct *vma; 557 558 /* Find first overlapping mapping */ 559 vma = find_vma_intersection(mm, addr, end); 560 if (!vma) 561 return 0; 562 563 nr_pages = (min(end, vma->vm_end) - 564 max(addr, vma->vm_start)) >> PAGE_SHIFT; 565 566 /* Iterate over the rest of the overlaps */ 567 for (vma = vma->vm_next; vma; vma = vma->vm_next) { 568 unsigned long overlap_len; 569 570 if (vma->vm_start > end) 571 break; 572 573 overlap_len = min(end, vma->vm_end) - vma->vm_start; 574 nr_pages += overlap_len >> PAGE_SHIFT; 575 } 576 577 return nr_pages; 578 } 579 580 void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 581 struct rb_node **rb_link, struct rb_node *rb_parent) 582 { 583 /* Update tracking information for the gap following the new vma. */ 584 if (vma->vm_next) 585 vma_gap_update(vma->vm_next); 586 else 587 mm->highest_vm_end = vma->vm_end; 588 589 /* 590 * vma->vm_prev wasn't known when we followed the rbtree to find the 591 * correct insertion point for that vma. As a result, we could not 592 * update the vma vm_rb parents rb_subtree_gap values on the way down. 593 * So, we first insert the vma with a zero rb_subtree_gap value 594 * (to be consistent with what we did on the way down), and then 595 * immediately update the gap to the correct value. Finally we 596 * rebalance the rbtree after all augmented values have been set. 
597 */ 598 rb_link_node(&vma->vm_rb, rb_parent, rb_link); 599 vma->rb_subtree_gap = 0; 600 vma_gap_update(vma); 601 vma_rb_insert(vma, &mm->mm_rb); 602 } 603 604 static void __vma_link_file(struct vm_area_struct *vma) 605 { 606 struct file *file; 607 608 file = vma->vm_file; 609 if (file) { 610 struct address_space *mapping = file->f_mapping; 611 612 if (vma->vm_flags & VM_DENYWRITE) 613 atomic_dec(&file_inode(file)->i_writecount); 614 if (vma->vm_flags & VM_SHARED) 615 mapping->i_mmap_writable++; 616 617 flush_dcache_mmap_lock(mapping); 618 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 619 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 620 else 621 vma_interval_tree_insert(vma, &mapping->i_mmap); 622 flush_dcache_mmap_unlock(mapping); 623 } 624 } 625 626 static void 627 __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 628 struct vm_area_struct *prev, struct rb_node **rb_link, 629 struct rb_node *rb_parent) 630 { 631 __vma_link_list(mm, vma, prev, rb_parent); 632 __vma_link_rb(mm, vma, rb_link, rb_parent); 633 } 634 635 static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, 636 struct vm_area_struct *prev, struct rb_node **rb_link, 637 struct rb_node *rb_parent) 638 { 639 struct address_space *mapping = NULL; 640 641 if (vma->vm_file) 642 mapping = vma->vm_file->f_mapping; 643 644 if (mapping) 645 mutex_lock(&mapping->i_mmap_mutex); 646 647 __vma_link(mm, vma, prev, rb_link, rb_parent); 648 __vma_link_file(vma); 649 650 if (mapping) 651 mutex_unlock(&mapping->i_mmap_mutex); 652 653 mm->map_count++; 654 validate_mm(mm); 655 } 656 657 /* 658 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the 659 * mm's list and rbtree. It has already been inserted into the interval tree. 660 */ 661 static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 662 { 663 struct vm_area_struct *prev; 664 struct rb_node **rb_link, *rb_parent; 665 666 if (find_vma_links(mm, vma->vm_start, vma->vm_end, 667 &prev, &rb_link, &rb_parent)) 668 BUG(); 669 __vma_link(mm, vma, prev, rb_link, rb_parent); 670 mm->map_count++; 671 } 672 673 static inline void 674 __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 675 struct vm_area_struct *prev) 676 { 677 struct vm_area_struct *next; 678 679 vma_rb_erase(vma, &mm->mm_rb); 680 prev->vm_next = next = vma->vm_next; 681 if (next) 682 next->vm_prev = prev; 683 if (mm->mmap_cache == vma) 684 mm->mmap_cache = prev; 685 } 686 687 /* 688 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that 689 * is already present in an i_mmap tree without adjusting the tree. 690 * The following helper function should be used when such adjustments 691 * are necessary. The "insert" vma (if any) is to be inserted 692 * before we drop the necessary locks. 693 */ 694 int vma_adjust(struct vm_area_struct *vma, unsigned long start, 695 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) 696 { 697 struct mm_struct *mm = vma->vm_mm; 698 struct vm_area_struct *next = vma->vm_next; 699 struct vm_area_struct *importer = NULL; 700 struct address_space *mapping = NULL; 701 struct rb_root *root = NULL; 702 struct anon_vma *anon_vma = NULL; 703 struct file *file = vma->vm_file; 704 bool start_changed = false, end_changed = false; 705 long adjust_next = 0; 706 int remove_next = 0; 707 708 if (next && !insert) { 709 struct vm_area_struct *exporter = NULL; 710 711 if (end >= next->vm_end) { 712 /* 713 * vma expands, overlapping all the next, and 714 * perhaps the one after too (mprotect case 6). 
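 *
 * (For intuition: the line below sets remove_next to 1 when only
 *  'next' is swallowed, and to 2 when 'end' runs past next->vm_end,
 *  in which case the vma after 'next' is removed on a second pass
 *  through the 'again' label.)
 *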
715 */ 716 again: remove_next = 1 + (end > next->vm_end); 717 end = next->vm_end; 718 exporter = next; 719 importer = vma; 720 } else if (end > next->vm_start) { 721 /* 722 * vma expands, overlapping part of the next: 723 * mprotect case 5 shifting the boundary up. 724 */ 725 adjust_next = (end - next->vm_start) >> PAGE_SHIFT; 726 exporter = next; 727 importer = vma; 728 } else if (end < vma->vm_end) { 729 /* 730 * vma shrinks, and !insert tells it's not 731 * split_vma inserting another: so it must be 732 * mprotect case 4 shifting the boundary down. 733 */ 734 adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); 735 exporter = vma; 736 importer = next; 737 } 738 739 /* 740 * Easily overlooked: when mprotect shifts the boundary, 741 * make sure the expanding vma has anon_vma set if the 742 * shrinking vma had, to cover any anon pages imported. 743 */ 744 if (exporter && exporter->anon_vma && !importer->anon_vma) { 745 if (anon_vma_clone(importer, exporter)) 746 return -ENOMEM; 747 importer->anon_vma = exporter->anon_vma; 748 } 749 } 750 751 if (file) { 752 mapping = file->f_mapping; 753 if (!(vma->vm_flags & VM_NONLINEAR)) { 754 root = &mapping->i_mmap; 755 uprobe_munmap(vma, vma->vm_start, vma->vm_end); 756 757 if (adjust_next) 758 uprobe_munmap(next, next->vm_start, 759 next->vm_end); 760 } 761 762 mutex_lock(&mapping->i_mmap_mutex); 763 if (insert) { 764 /* 765 * Put into interval tree now, so instantiated pages 766 * are visible to arm/parisc __flush_dcache_page 767 * throughout; but we cannot insert into address 768 * space until vma start or end is updated. 769 */ 770 __vma_link_file(insert); 771 } 772 } 773 774 vma_adjust_trans_huge(vma, start, end, adjust_next); 775 776 anon_vma = vma->anon_vma; 777 if (!anon_vma && adjust_next) 778 anon_vma = next->anon_vma; 779 if (anon_vma) { 780 VM_BUG_ON(adjust_next && next->anon_vma && 781 anon_vma != next->anon_vma); 782 anon_vma_lock_write(anon_vma); 783 anon_vma_interval_tree_pre_update_vma(vma); 784 if (adjust_next) 785 anon_vma_interval_tree_pre_update_vma(next); 786 } 787 788 if (root) { 789 flush_dcache_mmap_lock(mapping); 790 vma_interval_tree_remove(vma, root); 791 if (adjust_next) 792 vma_interval_tree_remove(next, root); 793 } 794 795 if (start != vma->vm_start) { 796 vma->vm_start = start; 797 start_changed = true; 798 } 799 if (end != vma->vm_end) { 800 vma->vm_end = end; 801 end_changed = true; 802 } 803 vma->vm_pgoff = pgoff; 804 if (adjust_next) { 805 next->vm_start += adjust_next << PAGE_SHIFT; 806 next->vm_pgoff += adjust_next; 807 } 808 809 if (root) { 810 if (adjust_next) 811 vma_interval_tree_insert(next, root); 812 vma_interval_tree_insert(vma, root); 813 flush_dcache_mmap_unlock(mapping); 814 } 815 816 if (remove_next) { 817 /* 818 * vma_merge has merged next into vma, and needs 819 * us to remove next before dropping the locks. 820 */ 821 __vma_unlink(mm, next, vma); 822 if (file) 823 __remove_shared_vm_struct(next, file, mapping); 824 } else if (insert) { 825 /* 826 * split_vma has split insert from vma, and needs 827 * us to insert it before dropping the locks 828 * (it may either follow vma or precede it). 
829 */ 830 __insert_vm_struct(mm, insert); 831 } else { 832 if (start_changed) 833 vma_gap_update(vma); 834 if (end_changed) { 835 if (!next) 836 mm->highest_vm_end = end; 837 else if (!adjust_next) 838 vma_gap_update(next); 839 } 840 } 841 842 if (anon_vma) { 843 anon_vma_interval_tree_post_update_vma(vma); 844 if (adjust_next) 845 anon_vma_interval_tree_post_update_vma(next); 846 anon_vma_unlock_write(anon_vma); 847 } 848 if (mapping) 849 mutex_unlock(&mapping->i_mmap_mutex); 850 851 if (root) { 852 uprobe_mmap(vma); 853 854 if (adjust_next) 855 uprobe_mmap(next); 856 } 857 858 if (remove_next) { 859 if (file) { 860 uprobe_munmap(next, next->vm_start, next->vm_end); 861 fput(file); 862 } 863 if (next->anon_vma) 864 anon_vma_merge(vma, next); 865 mm->map_count--; 866 mpol_put(vma_policy(next)); 867 kmem_cache_free(vm_area_cachep, next); 868 /* 869 * In mprotect's case 6 (see comments on vma_merge), 870 * we must remove another next too. It would clutter 871 * up the code too much to do both in one go. 872 */ 873 next = vma->vm_next; 874 if (remove_next == 2) 875 goto again; 876 else if (next) 877 vma_gap_update(next); 878 else 879 mm->highest_vm_end = end; 880 } 881 if (insert && file) 882 uprobe_mmap(insert); 883 884 validate_mm(mm); 885 886 return 0; 887 } 888 889 /* 890 * If the vma has a ->close operation then the driver probably needs to release 891 * per-vma resources, so we don't attempt to merge those. 892 */ 893 static inline int is_mergeable_vma(struct vm_area_struct *vma, 894 struct file *file, unsigned long vm_flags) 895 { 896 if (vma->vm_flags ^ vm_flags) 897 return 0; 898 if (vma->vm_file != file) 899 return 0; 900 if (vma->vm_ops && vma->vm_ops->close) 901 return 0; 902 return 1; 903 } 904 905 static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, 906 struct anon_vma *anon_vma2, 907 struct vm_area_struct *vma) 908 { 909 /* 910 * The list_is_singular() test is to avoid merging VMA cloned from 911 * parents. This can improve scalability caused by anon_vma lock. 912 */ 913 if ((!anon_vma1 || !anon_vma2) && (!vma || 914 list_is_singular(&vma->anon_vma_chain))) 915 return 1; 916 return anon_vma1 == anon_vma2; 917 } 918 919 /* 920 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 921 * in front of (at a lower virtual address and file offset than) the vma. 922 * 923 * We cannot merge two vmas if they have differently assigned (non-NULL) 924 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 925 * 926 * We don't check here for the merged mmap wrapping around the end of pagecache 927 * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which 928 * wrap, nor mmaps which cover the final page at index -1UL. 929 */ 930 static int 931 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, 932 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 933 { 934 if (is_mergeable_vma(vma, file, vm_flags) && 935 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 936 if (vma->vm_pgoff == vm_pgoff) 937 return 1; 938 } 939 return 0; 940 } 941 942 /* 943 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) 944 * beyond (at a higher virtual address and file offset than) the vma. 945 * 946 * We cannot merge two vmas if they have differently assigned (non-NULL) 947 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 
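 *
 * A small worked example of the offset check below, assuming a
 * file-backed vma spanning file pages 10..19: vm_pgoff is 10 and
 * vma_pages() is 10, so a new mapping may only be merged after it
 * if its vm_pgoff is exactly 10 + 10 = 20, i.e. the file offsets
 * continue contiguously just as the virtual addresses do.
 *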
948 */ 949 static int 950 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, 951 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 952 { 953 if (is_mergeable_vma(vma, file, vm_flags) && 954 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 955 pgoff_t vm_pglen; 956 vm_pglen = vma_pages(vma); 957 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 958 return 1; 959 } 960 return 0; 961 } 962 963 /* 964 * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out 965 * whether that can be merged with its predecessor or its successor. 966 * Or both (it neatly fills a hole). 967 * 968 * In most cases - when called for mmap, brk or mremap - [addr,end) is 969 * certain not to be mapped by the time vma_merge is called; but when 970 * called for mprotect, it is certain to be already mapped (either at 971 * an offset within prev, or at the start of next), and the flags of 972 * this area are about to be changed to vm_flags - and the no-change 973 * case has already been eliminated. 974 * 975 * The following mprotect cases have to be considered, where AAAA is 976 * the area passed down from mprotect_fixup, never extending beyond one 977 * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: 978 * 979 * AAAA AAAA AAAA AAAA 980 * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX 981 * cannot merge might become might become might become 982 * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or 983 * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or 984 * mremap move: PPPPNNNNNNNN 8 985 * AAAA 986 * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN 987 * might become case 1 below case 2 below case 3 below 988 * 989 * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: 990 * mprotect_fixup updates vm_flags & vm_page_prot on successful return. 991 */ 992 struct vm_area_struct *vma_merge(struct mm_struct *mm, 993 struct vm_area_struct *prev, unsigned long addr, 994 unsigned long end, unsigned long vm_flags, 995 struct anon_vma *anon_vma, struct file *file, 996 pgoff_t pgoff, struct mempolicy *policy) 997 { 998 pgoff_t pglen = (end - addr) >> PAGE_SHIFT; 999 struct vm_area_struct *area, *next; 1000 int err; 1001 1002 /* 1003 * We later require that vma->vm_flags == vm_flags, 1004 * so this tests vma->vm_flags & VM_SPECIAL, too. 1005 */ 1006 if (vm_flags & VM_SPECIAL) 1007 return NULL; 1008 1009 if (prev) 1010 next = prev->vm_next; 1011 else 1012 next = mm->mmap; 1013 area = next; 1014 if (next && next->vm_end == end) /* cases 6, 7, 8 */ 1015 next = next->vm_next; 1016 1017 /* 1018 * Can it merge with the predecessor? 1019 */ 1020 if (prev && prev->vm_end == addr && 1021 mpol_equal(vma_policy(prev), policy) && 1022 can_vma_merge_after(prev, vm_flags, 1023 anon_vma, file, pgoff)) { 1024 /* 1025 * OK, it can. Can we now merge in the successor as well? 1026 */ 1027 if (next && end == next->vm_start && 1028 mpol_equal(policy, vma_policy(next)) && 1029 can_vma_merge_before(next, vm_flags, 1030 anon_vma, file, pgoff+pglen) && 1031 is_mergeable_anon_vma(prev->anon_vma, 1032 next->anon_vma, NULL)) { 1033 /* cases 1, 6 */ 1034 err = vma_adjust(prev, prev->vm_start, 1035 next->vm_end, prev->vm_pgoff, NULL); 1036 } else /* cases 2, 5, 7 */ 1037 err = vma_adjust(prev, prev->vm_start, 1038 end, prev->vm_pgoff, NULL); 1039 if (err) 1040 return NULL; 1041 khugepaged_enter_vma_merge(prev); 1042 return prev; 1043 } 1044 1045 /* 1046 * Can this new request be merged in front of next? 
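 *
 * (That is: prev could not absorb the area, so try to extend next
 *  downwards instead; case 4 trims the tail of prev, while cases 3
 *  and 8 simply grow next down over [addr, end), per the diagram
 *  above.)
 *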
1047 */ 1048 if (next && end == next->vm_start && 1049 mpol_equal(policy, vma_policy(next)) && 1050 can_vma_merge_before(next, vm_flags, 1051 anon_vma, file, pgoff+pglen)) { 1052 if (prev && addr < prev->vm_end) /* case 4 */ 1053 err = vma_adjust(prev, prev->vm_start, 1054 addr, prev->vm_pgoff, NULL); 1055 else /* cases 3, 8 */ 1056 err = vma_adjust(area, addr, next->vm_end, 1057 next->vm_pgoff - pglen, NULL); 1058 if (err) 1059 return NULL; 1060 khugepaged_enter_vma_merge(area); 1061 return area; 1062 } 1063 1064 return NULL; 1065 } 1066 1067 /* 1068 * Rough compatibility check to quickly see if it's even worth looking 1069 * at sharing an anon_vma. 1070 * 1071 * They need to have the same vm_file, and the flags can only differ 1072 * in things that mprotect may change. 1073 * 1074 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that 1075 * we can merge the two vma's. For example, we refuse to merge a vma if 1076 * there is a vm_ops->close() function, because that indicates that the 1077 * driver is doing some kind of reference counting. But that doesn't 1078 * really matter for the anon_vma sharing case. 1079 */ 1080 static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) 1081 { 1082 return a->vm_end == b->vm_start && 1083 mpol_equal(vma_policy(a), vma_policy(b)) && 1084 a->vm_file == b->vm_file && 1085 !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && 1086 b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); 1087 } 1088 1089 /* 1090 * Do some basic sanity checking to see if we can re-use the anon_vma 1091 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be 1092 * the same as 'old', the other will be the new one that is trying 1093 * to share the anon_vma. 1094 * 1095 * NOTE! This runs with mm_sem held for reading, so it is possible that 1096 * the anon_vma of 'old' is concurrently in the process of being set up 1097 * by another page fault trying to merge _that_. But that's ok: if it 1098 * is being set up, that automatically means that it will be a singleton 1099 * acceptable for merging, so we can do all of this optimistically. But 1100 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. 1101 * 1102 * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1103 * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1104 * is to return an anon_vma that is "complex" due to having gone through 1105 * a fork). 1106 * 1107 * We also make sure that the two vma's are compatible (adjacent, 1108 * and with the same memory policies). That's all stable, even with just 1109 * a read lock on the mm_sem. 1110 */ 1111 static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) 1112 { 1113 if (anon_vma_compatible(a, b)) { 1114 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); 1115 1116 if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1117 return anon_vma; 1118 } 1119 return NULL; 1120 } 1121 1122 /* 1123 * find_mergeable_anon_vma is used by anon_vma_prepare, to check 1124 * neighbouring vmas for a suitable anon_vma, before it goes off 1125 * to allocate a new anon_vma. It checks because a repetitive 1126 * sequence of mprotects and faults may otherwise lead to distinct 1127 * anon_vmas being allocated, preventing vma merge in subsequent 1128 * mprotect. 
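 *
 * A hypothetical userspace sequence, for intuition only (names and
 * sizes invented), where this reuse pays off:
 *
 *	p = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
 *		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	mprotect(p + page, page, PROT_READ);	splits into 3 vmas
 *	... write faults then touch the outer pieces ...
 *	mprotect(p + page, page, PROT_READ | PROT_WRITE);
 *
 * The last mprotect can merge everything back into one vma only if
 * the pieces ended up with mergeable anon_vmas, which reusing a
 * neighbour's anon_vma makes likely.
 *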
1129 */ 1130 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) 1131 { 1132 struct anon_vma *anon_vma; 1133 struct vm_area_struct *near; 1134 1135 near = vma->vm_next; 1136 if (!near) 1137 goto try_prev; 1138 1139 anon_vma = reusable_anon_vma(near, vma, near); 1140 if (anon_vma) 1141 return anon_vma; 1142 try_prev: 1143 near = vma->vm_prev; 1144 if (!near) 1145 goto none; 1146 1147 anon_vma = reusable_anon_vma(near, near, vma); 1148 if (anon_vma) 1149 return anon_vma; 1150 none: 1151 /* 1152 * There's no absolute need to look only at touching neighbours: 1153 * we could search further afield for "compatible" anon_vmas. 1154 * But it would probably just be a waste of time searching, 1155 * or lead to too many vmas hanging off the same anon_vma. 1156 * We're trying to allow mprotect remerging later on, 1157 * not trying to minimize memory used for anon_vmas. 1158 */ 1159 return NULL; 1160 } 1161 1162 #ifdef CONFIG_PROC_FS 1163 void vm_stat_account(struct mm_struct *mm, unsigned long flags, 1164 struct file *file, long pages) 1165 { 1166 const unsigned long stack_flags 1167 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 1168 1169 mm->total_vm += pages; 1170 1171 if (file) { 1172 mm->shared_vm += pages; 1173 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 1174 mm->exec_vm += pages; 1175 } else if (flags & stack_flags) 1176 mm->stack_vm += pages; 1177 } 1178 #endif /* CONFIG_PROC_FS */ 1179 1180 /* 1181 * If a hint addr is less than mmap_min_addr change hint to be as 1182 * low as possible but still greater than mmap_min_addr 1183 */ 1184 static inline unsigned long round_hint_to_min(unsigned long hint) 1185 { 1186 hint &= PAGE_MASK; 1187 if (((void *)hint != NULL) && 1188 (hint < mmap_min_addr)) 1189 return PAGE_ALIGN(mmap_min_addr); 1190 return hint; 1191 } 1192 1193 /* 1194 * The caller must hold down_write(¤t->mm->mmap_sem). 1195 */ 1196 1197 unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 1198 unsigned long len, unsigned long prot, 1199 unsigned long flags, unsigned long pgoff, 1200 unsigned long *populate) 1201 { 1202 struct mm_struct * mm = current->mm; 1203 vm_flags_t vm_flags; 1204 1205 *populate = 0; 1206 1207 /* 1208 * Does the application expect PROT_READ to imply PROT_EXEC? 1209 * 1210 * (the exception is when the underlying filesystem is noexec 1211 * mounted, in which case we dont add PROT_EXEC.) 1212 */ 1213 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) 1214 if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) 1215 prot |= PROT_EXEC; 1216 1217 if (!len) 1218 return -EINVAL; 1219 1220 if (!(flags & MAP_FIXED)) 1221 addr = round_hint_to_min(addr); 1222 1223 /* Careful about overflows.. */ 1224 len = PAGE_ALIGN(len); 1225 if (!len) 1226 return -ENOMEM; 1227 1228 /* offset overflow? */ 1229 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 1230 return -EOVERFLOW; 1231 1232 /* Too many mappings? */ 1233 if (mm->map_count > sysctl_max_map_count) 1234 return -ENOMEM; 1235 1236 /* Obtain the address to map to. we verify (or select) it and ensure 1237 * that it represents a valid section of the address space. 1238 */ 1239 addr = get_unmapped_area(file, addr, len, pgoff, flags); 1240 if (addr & ~PAGE_MASK) 1241 return addr; 1242 1243 /* Do simple checking here so the lower-level routines won't have 1244 * to. we assume access permissions have been handled by the open 1245 * of the memory object, so we don't do any here. 
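 *
 * As a rough sketch of the translation done just below: each
 * PROT_* bit maps to the matching VM_* bit, e.g.
 *
 *	PROT_READ  -> VM_READ
 *	PROT_WRITE -> VM_WRITE
 *	PROT_EXEC  -> VM_EXEC
 *
 * while calc_vm_flag_bits() handles MAP_* flags such as MAP_LOCKED,
 * and VM_MAYREAD, VM_MAYWRITE, VM_MAYEXEC plus mm->def_flags are
 * OR-ed in unconditionally.
 *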
1246 */ 1247 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | 1248 mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 1249 1250 if (flags & MAP_LOCKED) 1251 if (!can_do_mlock()) 1252 return -EPERM; 1253 1254 /* mlock MCL_FUTURE? */ 1255 if (vm_flags & VM_LOCKED) { 1256 unsigned long locked, lock_limit; 1257 locked = len >> PAGE_SHIFT; 1258 locked += mm->locked_vm; 1259 lock_limit = rlimit(RLIMIT_MEMLOCK); 1260 lock_limit >>= PAGE_SHIFT; 1261 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 1262 return -EAGAIN; 1263 } 1264 1265 if (file) { 1266 struct inode *inode = file_inode(file); 1267 1268 switch (flags & MAP_TYPE) { 1269 case MAP_SHARED: 1270 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) 1271 return -EACCES; 1272 1273 /* 1274 * Make sure we don't allow writing to an append-only 1275 * file.. 1276 */ 1277 if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) 1278 return -EACCES; 1279 1280 /* 1281 * Make sure there are no mandatory locks on the file. 1282 */ 1283 if (locks_verify_locked(inode)) 1284 return -EAGAIN; 1285 1286 vm_flags |= VM_SHARED | VM_MAYSHARE; 1287 if (!(file->f_mode & FMODE_WRITE)) 1288 vm_flags &= ~(VM_MAYWRITE | VM_SHARED); 1289 1290 /* fall through */ 1291 case MAP_PRIVATE: 1292 if (!(file->f_mode & FMODE_READ)) 1293 return -EACCES; 1294 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { 1295 if (vm_flags & VM_EXEC) 1296 return -EPERM; 1297 vm_flags &= ~VM_MAYEXEC; 1298 } 1299 1300 if (!file->f_op->mmap) 1301 return -ENODEV; 1302 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 1303 return -EINVAL; 1304 break; 1305 1306 default: 1307 return -EINVAL; 1308 } 1309 } else { 1310 switch (flags & MAP_TYPE) { 1311 case MAP_SHARED: 1312 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 1313 return -EINVAL; 1314 /* 1315 * Ignore pgoff. 1316 */ 1317 pgoff = 0; 1318 vm_flags |= VM_SHARED | VM_MAYSHARE; 1319 break; 1320 case MAP_PRIVATE: 1321 /* 1322 * Set pgoff according to addr for anon_vma. 1323 */ 1324 pgoff = addr >> PAGE_SHIFT; 1325 break; 1326 default: 1327 return -EINVAL; 1328 } 1329 } 1330 1331 /* 1332 * Set 'VM_NORESERVE' if we should not account for the 1333 * memory use of this mapping. 
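 *
 * For intuition, a hypothetical caller opting out of commit
 * accounting for a large, sparsely touched private mapping:
 *
 *	p = mmap(NULL, 1UL << 30, PROT_READ | PROT_WRITE,
 *		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
 *
 * Under OVERCOMMIT_GUESS or OVERCOMMIT_ALWAYS the code below sets
 * VM_NORESERVE for such a mapping; under OVERCOMMIT_NEVER the
 * request is still accounted in full.
 *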
1334 */ 1335 if (flags & MAP_NORESERVE) { 1336 /* We honor MAP_NORESERVE if allowed to overcommit */ 1337 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) 1338 vm_flags |= VM_NORESERVE; 1339 1340 /* hugetlb applies strict overcommit unless MAP_NORESERVE */ 1341 if (file && is_file_hugepages(file)) 1342 vm_flags |= VM_NORESERVE; 1343 } 1344 1345 addr = mmap_region(file, addr, len, vm_flags, pgoff); 1346 if (!IS_ERR_VALUE(addr) && 1347 ((vm_flags & VM_LOCKED) || 1348 (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) 1349 *populate = len; 1350 return addr; 1351 } 1352 1353 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1354 unsigned long, prot, unsigned long, flags, 1355 unsigned long, fd, unsigned long, pgoff) 1356 { 1357 struct file *file = NULL; 1358 unsigned long retval = -EBADF; 1359 1360 if (!(flags & MAP_ANONYMOUS)) { 1361 audit_mmap_fd(fd, flags); 1362 file = fget(fd); 1363 if (!file) 1364 goto out; 1365 if (is_file_hugepages(file)) 1366 len = ALIGN(len, huge_page_size(hstate_file(file))); 1367 retval = -EINVAL; 1368 if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) 1369 goto out_fput; 1370 } else if (flags & MAP_HUGETLB) { 1371 struct user_struct *user = NULL; 1372 struct hstate *hs; 1373 1374 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); 1375 if (!hs) 1376 return -EINVAL; 1377 1378 len = ALIGN(len, huge_page_size(hs)); 1379 /* 1380 * VM_NORESERVE is used because the reservations will be 1381 * taken when vm_ops->mmap() is called 1382 * A dummy user value is used because we are not locking 1383 * memory so no accounting is necessary 1384 */ 1385 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, 1386 VM_NORESERVE, 1387 &user, HUGETLB_ANONHUGE_INODE, 1388 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); 1389 if (IS_ERR(file)) 1390 return PTR_ERR(file); 1391 } 1392 1393 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1394 1395 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1396 out_fput: 1397 if (file) 1398 fput(file); 1399 out: 1400 return retval; 1401 } 1402 1403 #ifdef __ARCH_WANT_SYS_OLD_MMAP 1404 struct mmap_arg_struct { 1405 unsigned long addr; 1406 unsigned long len; 1407 unsigned long prot; 1408 unsigned long flags; 1409 unsigned long fd; 1410 unsigned long offset; 1411 }; 1412 1413 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) 1414 { 1415 struct mmap_arg_struct a; 1416 1417 if (copy_from_user(&a, arg, sizeof(a))) 1418 return -EFAULT; 1419 if (a.offset & ~PAGE_MASK) 1420 return -EINVAL; 1421 1422 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, 1423 a.offset >> PAGE_SHIFT); 1424 } 1425 #endif /* __ARCH_WANT_SYS_OLD_MMAP */ 1426 1427 /* 1428 * Some shared mappigns will want the pages marked read-only 1429 * to track write events. If so, we'll downgrade vm_page_prot 1430 * to the private version (using protection_map[] without the 1431 * VM_SHARED bit). 1432 */ 1433 int vma_wants_writenotify(struct vm_area_struct *vma) 1434 { 1435 vm_flags_t vm_flags = vma->vm_flags; 1436 1437 /* If it was private or non-writable, the write bit is already clear */ 1438 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) 1439 return 0; 1440 1441 /* The backer wishes to know when pages are first written to? */ 1442 if (vma->vm_ops && vma->vm_ops->page_mkwrite) 1443 return 1; 1444 1445 /* The open routine did something to the protections already? */ 1446 if (pgprot_val(vma->vm_page_prot) != 1447 pgprot_val(vm_get_page_prot(vm_flags))) 1448 return 0; 1449 1450 /* Specialty mapping? 
*/ 1451 if (vm_flags & VM_PFNMAP) 1452 return 0; 1453 1454 /* Can the mapping track the dirty pages? */ 1455 return vma->vm_file && vma->vm_file->f_mapping && 1456 mapping_cap_account_dirty(vma->vm_file->f_mapping); 1457 } 1458 1459 /* 1460 * We account for memory if it's a private writeable mapping, 1461 * not hugepages and VM_NORESERVE wasn't set. 1462 */ 1463 static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) 1464 { 1465 /* 1466 * hugetlb has its own accounting separate from the core VM 1467 * VM_HUGETLB may not be set yet so we cannot check for that flag. 1468 */ 1469 if (file && is_file_hugepages(file)) 1470 return 0; 1471 1472 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; 1473 } 1474 1475 unsigned long mmap_region(struct file *file, unsigned long addr, 1476 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) 1477 { 1478 struct mm_struct *mm = current->mm; 1479 struct vm_area_struct *vma, *prev; 1480 int error; 1481 struct rb_node **rb_link, *rb_parent; 1482 unsigned long charged = 0; 1483 1484 /* Check against address space limit. */ 1485 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { 1486 unsigned long nr_pages; 1487 1488 /* 1489 * MAP_FIXED may remove pages of mappings that intersects with 1490 * requested mapping. Account for the pages it would unmap. 1491 */ 1492 if (!(vm_flags & MAP_FIXED)) 1493 return -ENOMEM; 1494 1495 nr_pages = count_vma_pages_range(mm, addr, addr + len); 1496 1497 if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) 1498 return -ENOMEM; 1499 } 1500 1501 /* Clear old maps */ 1502 error = -ENOMEM; 1503 munmap_back: 1504 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 1505 if (do_munmap(mm, addr, len)) 1506 return -ENOMEM; 1507 goto munmap_back; 1508 } 1509 1510 /* 1511 * Private writable mapping: check memory availability 1512 */ 1513 if (accountable_mapping(file, vm_flags)) { 1514 charged = len >> PAGE_SHIFT; 1515 if (security_vm_enough_memory_mm(mm, charged)) 1516 return -ENOMEM; 1517 vm_flags |= VM_ACCOUNT; 1518 } 1519 1520 /* 1521 * Can we just expand an old mapping? 1522 */ 1523 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); 1524 if (vma) 1525 goto out; 1526 1527 /* 1528 * Determine the object being mapped and call the appropriate 1529 * specific mapper. the address has already been validated, but 1530 * not unmapped, but the maps are removed from the list. 1531 */ 1532 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 1533 if (!vma) { 1534 error = -ENOMEM; 1535 goto unacct_error; 1536 } 1537 1538 vma->vm_mm = mm; 1539 vma->vm_start = addr; 1540 vma->vm_end = addr + len; 1541 vma->vm_flags = vm_flags; 1542 vma->vm_page_prot = vm_get_page_prot(vm_flags); 1543 vma->vm_pgoff = pgoff; 1544 INIT_LIST_HEAD(&vma->anon_vma_chain); 1545 1546 if (file) { 1547 if (vm_flags & VM_DENYWRITE) { 1548 error = deny_write_access(file); 1549 if (error) 1550 goto free_vma; 1551 } 1552 vma->vm_file = get_file(file); 1553 error = file->f_op->mmap(file, vma); 1554 if (error) 1555 goto unmap_and_free_vma; 1556 1557 /* Can addr have changed?? 1558 * 1559 * Answer: Yes, several device drivers can do it in their 1560 * f_op->mmap method. 
-DaveM 1561 * Bug: If addr is changed, prev, rb_link, rb_parent should 1562 * be updated for vma_link() 1563 */ 1564 WARN_ON_ONCE(addr != vma->vm_start); 1565 1566 addr = vma->vm_start; 1567 vm_flags = vma->vm_flags; 1568 } else if (vm_flags & VM_SHARED) { 1569 error = shmem_zero_setup(vma); 1570 if (error) 1571 goto free_vma; 1572 } 1573 1574 if (vma_wants_writenotify(vma)) { 1575 pgprot_t pprot = vma->vm_page_prot; 1576 1577 /* Can vma->vm_page_prot have changed?? 1578 * 1579 * Answer: Yes, drivers may have changed it in their 1580 * f_op->mmap method. 1581 * 1582 * Ensures that vmas marked as uncached stay that way. 1583 */ 1584 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1585 if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) 1586 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 1587 } 1588 1589 vma_link(mm, vma, prev, rb_link, rb_parent); 1590 /* Once vma denies write, undo our temporary denial count */ 1591 if (vm_flags & VM_DENYWRITE) 1592 allow_write_access(file); 1593 file = vma->vm_file; 1594 out: 1595 perf_event_mmap(vma); 1596 1597 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1598 if (vm_flags & VM_LOCKED) { 1599 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || 1600 vma == get_gate_vma(current->mm))) 1601 mm->locked_vm += (len >> PAGE_SHIFT); 1602 else 1603 vma->vm_flags &= ~VM_LOCKED; 1604 } 1605 1606 if (file) 1607 uprobe_mmap(vma); 1608 1609 /* 1610 * New (or expanded) vma always get soft dirty status. 1611 * Otherwise user-space soft-dirty page tracker won't 1612 * be able to distinguish situation when vma area unmapped, 1613 * then new mapped in-place (which must be aimed as 1614 * a completely new data area). 1615 */ 1616 vma->vm_flags |= VM_SOFTDIRTY; 1617 1618 return addr; 1619 1620 unmap_and_free_vma: 1621 if (vm_flags & VM_DENYWRITE) 1622 allow_write_access(file); 1623 vma->vm_file = NULL; 1624 fput(file); 1625 1626 /* Undo any partial mapping done by a device driver. */ 1627 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); 1628 charged = 0; 1629 free_vma: 1630 kmem_cache_free(vm_area_cachep, vma); 1631 unacct_error: 1632 if (charged) 1633 vm_unacct_memory(charged); 1634 return error; 1635 } 1636 1637 unsigned long unmapped_area(struct vm_unmapped_area_info *info) 1638 { 1639 /* 1640 * We implement the search by looking for an rbtree node that 1641 * immediately follows a suitable gap. 
That is, 1642 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; 1643 * - gap_end = vma->vm_start >= info->low_limit + length; 1644 * - gap_end - gap_start >= length 1645 */ 1646 1647 struct mm_struct *mm = current->mm; 1648 struct vm_area_struct *vma; 1649 unsigned long length, low_limit, high_limit, gap_start, gap_end; 1650 1651 /* Adjust search length to account for worst case alignment overhead */ 1652 length = info->length + info->align_mask; 1653 if (length < info->length) 1654 return -ENOMEM; 1655 1656 /* Adjust search limits by the desired length */ 1657 if (info->high_limit < length) 1658 return -ENOMEM; 1659 high_limit = info->high_limit - length; 1660 1661 if (info->low_limit > high_limit) 1662 return -ENOMEM; 1663 low_limit = info->low_limit + length; 1664 1665 /* Check if rbtree root looks promising */ 1666 if (RB_EMPTY_ROOT(&mm->mm_rb)) 1667 goto check_highest; 1668 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); 1669 if (vma->rb_subtree_gap < length) 1670 goto check_highest; 1671 1672 while (true) { 1673 /* Visit left subtree if it looks promising */ 1674 gap_end = vma->vm_start; 1675 if (gap_end >= low_limit && vma->vm_rb.rb_left) { 1676 struct vm_area_struct *left = 1677 rb_entry(vma->vm_rb.rb_left, 1678 struct vm_area_struct, vm_rb); 1679 if (left->rb_subtree_gap >= length) { 1680 vma = left; 1681 continue; 1682 } 1683 } 1684 1685 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; 1686 check_current: 1687 /* Check if current node has a suitable gap */ 1688 if (gap_start > high_limit) 1689 return -ENOMEM; 1690 if (gap_end >= low_limit && gap_end - gap_start >= length) 1691 goto found; 1692 1693 /* Visit right subtree if it looks promising */ 1694 if (vma->vm_rb.rb_right) { 1695 struct vm_area_struct *right = 1696 rb_entry(vma->vm_rb.rb_right, 1697 struct vm_area_struct, vm_rb); 1698 if (right->rb_subtree_gap >= length) { 1699 vma = right; 1700 continue; 1701 } 1702 } 1703 1704 /* Go back up the rbtree to find next candidate node */ 1705 while (true) { 1706 struct rb_node *prev = &vma->vm_rb; 1707 if (!rb_parent(prev)) 1708 goto check_highest; 1709 vma = rb_entry(rb_parent(prev), 1710 struct vm_area_struct, vm_rb); 1711 if (prev == vma->vm_rb.rb_left) { 1712 gap_start = vma->vm_prev->vm_end; 1713 gap_end = vma->vm_start; 1714 goto check_current; 1715 } 1716 } 1717 } 1718 1719 check_highest: 1720 /* Check highest gap, which does not precede any rbtree node */ 1721 gap_start = mm->highest_vm_end; 1722 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ 1723 if (gap_start > high_limit) 1724 return -ENOMEM; 1725 1726 found: 1727 /* We found a suitable gap. Clip it with the original low_limit. */ 1728 if (gap_start < info->low_limit) 1729 gap_start = info->low_limit; 1730 1731 /* Adjust gap address to the desired alignment */ 1732 gap_start += (info->align_offset - gap_start) & info->align_mask; 1733 1734 VM_BUG_ON(gap_start + info->length > info->high_limit); 1735 VM_BUG_ON(gap_start + info->length > gap_end); 1736 return gap_start; 1737 } 1738 1739 unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) 1740 { 1741 struct mm_struct *mm = current->mm; 1742 struct vm_area_struct *vma; 1743 unsigned long length, low_limit, high_limit, gap_start, gap_end; 1744 1745 /* Adjust search length to account for worst case alignment overhead */ 1746 length = info->length + info->align_mask; 1747 if (length < info->length) 1748 return -ENOMEM; 1749 1750 /* 1751 * Adjust search limits by the desired length. 
1752 * See implementation comment at top of unmapped_area(). 1753 */ 1754 gap_end = info->high_limit; 1755 if (gap_end < length) 1756 return -ENOMEM; 1757 high_limit = gap_end - length; 1758 1759 if (info->low_limit > high_limit) 1760 return -ENOMEM; 1761 low_limit = info->low_limit + length; 1762 1763 /* Check highest gap, which does not precede any rbtree node */ 1764 gap_start = mm->highest_vm_end; 1765 if (gap_start <= high_limit) 1766 goto found_highest; 1767 1768 /* Check if rbtree root looks promising */ 1769 if (RB_EMPTY_ROOT(&mm->mm_rb)) 1770 return -ENOMEM; 1771 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); 1772 if (vma->rb_subtree_gap < length) 1773 return -ENOMEM; 1774 1775 while (true) { 1776 /* Visit right subtree if it looks promising */ 1777 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; 1778 if (gap_start <= high_limit && vma->vm_rb.rb_right) { 1779 struct vm_area_struct *right = 1780 rb_entry(vma->vm_rb.rb_right, 1781 struct vm_area_struct, vm_rb); 1782 if (right->rb_subtree_gap >= length) { 1783 vma = right; 1784 continue; 1785 } 1786 } 1787 1788 check_current: 1789 /* Check if current node has a suitable gap */ 1790 gap_end = vma->vm_start; 1791 if (gap_end < low_limit) 1792 return -ENOMEM; 1793 if (gap_start <= high_limit && gap_end - gap_start >= length) 1794 goto found; 1795 1796 /* Visit left subtree if it looks promising */ 1797 if (vma->vm_rb.rb_left) { 1798 struct vm_area_struct *left = 1799 rb_entry(vma->vm_rb.rb_left, 1800 struct vm_area_struct, vm_rb); 1801 if (left->rb_subtree_gap >= length) { 1802 vma = left; 1803 continue; 1804 } 1805 } 1806 1807 /* Go back up the rbtree to find next candidate node */ 1808 while (true) { 1809 struct rb_node *prev = &vma->vm_rb; 1810 if (!rb_parent(prev)) 1811 return -ENOMEM; 1812 vma = rb_entry(rb_parent(prev), 1813 struct vm_area_struct, vm_rb); 1814 if (prev == vma->vm_rb.rb_right) { 1815 gap_start = vma->vm_prev ? 1816 vma->vm_prev->vm_end : 0; 1817 goto check_current; 1818 } 1819 } 1820 } 1821 1822 found: 1823 /* We found a suitable gap. Clip it with the original high_limit. */ 1824 if (gap_end > info->high_limit) 1825 gap_end = info->high_limit; 1826 1827 found_highest: 1828 /* Compute highest gap address at the desired alignment */ 1829 gap_end -= info->length; 1830 gap_end -= (gap_end - info->align_offset) & info->align_mask; 1831 1832 VM_BUG_ON(gap_end < info->low_limit); 1833 VM_BUG_ON(gap_end < gap_start); 1834 return gap_end; 1835 } 1836 1837 /* Get an address range which is currently unmapped. 1838 * For shmat() with addr=0. 1839 * 1840 * Ugly calling convention alert: 1841 * Return value with the low bits set means error value, 1842 * ie 1843 * if (ret & ~PAGE_MASK) 1844 * error = ret; 1845 * 1846 * This function "knows" that -ENOMEM has the bits set. 
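 *
 * A minimal sketch of how the arch helpers below drive the gap
 * search (bottom-up case, mirroring arch_get_unmapped_area()):
 *
 *	struct vm_unmapped_area_info info;
 *
 *	info.flags = 0;
 *	info.length = len;
 *	info.low_limit = mm->mmap_base;
 *	info.high_limit = TASK_SIZE;
 *	info.align_mask = 0;
 *	addr = vm_unmapped_area(&info);
 *
 * and the caller then applies the 'addr & ~PAGE_MASK' error test
 * described above.
 *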
1847 */ 1848 #ifndef HAVE_ARCH_UNMAPPED_AREA 1849 unsigned long 1850 arch_get_unmapped_area(struct file *filp, unsigned long addr, 1851 unsigned long len, unsigned long pgoff, unsigned long flags) 1852 { 1853 struct mm_struct *mm = current->mm; 1854 struct vm_area_struct *vma; 1855 struct vm_unmapped_area_info info; 1856 1857 if (len > TASK_SIZE - mmap_min_addr) 1858 return -ENOMEM; 1859 1860 if (flags & MAP_FIXED) 1861 return addr; 1862 1863 if (addr) { 1864 addr = PAGE_ALIGN(addr); 1865 vma = find_vma(mm, addr); 1866 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && 1867 (!vma || addr + len <= vma->vm_start)) 1868 return addr; 1869 } 1870 1871 info.flags = 0; 1872 info.length = len; 1873 info.low_limit = mm->mmap_base; 1874 info.high_limit = TASK_SIZE; 1875 info.align_mask = 0; 1876 return vm_unmapped_area(&info); 1877 } 1878 #endif 1879 1880 /* 1881 * This mmap-allocator allocates new areas top-down from below the 1882 * stack's low limit (the base): 1883 */ 1884 #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 1885 unsigned long 1886 arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, 1887 const unsigned long len, const unsigned long pgoff, 1888 const unsigned long flags) 1889 { 1890 struct vm_area_struct *vma; 1891 struct mm_struct *mm = current->mm; 1892 unsigned long addr = addr0; 1893 struct vm_unmapped_area_info info; 1894 1895 /* requested length too big for entire address space */ 1896 if (len > TASK_SIZE - mmap_min_addr) 1897 return -ENOMEM; 1898 1899 if (flags & MAP_FIXED) 1900 return addr; 1901 1902 /* requesting a specific address */ 1903 if (addr) { 1904 addr = PAGE_ALIGN(addr); 1905 vma = find_vma(mm, addr); 1906 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && 1907 (!vma || addr + len <= vma->vm_start)) 1908 return addr; 1909 } 1910 1911 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 1912 info.length = len; 1913 info.low_limit = max(PAGE_SIZE, mmap_min_addr); 1914 info.high_limit = mm->mmap_base; 1915 info.align_mask = 0; 1916 addr = vm_unmapped_area(&info); 1917 1918 /* 1919 * A failed mmap() very likely causes application failure, 1920 * so fall back to the bottom-up function here. This scenario 1921 * can happen with large stack limits and large mmap() 1922 * allocations. 1923 */ 1924 if (addr & ~PAGE_MASK) { 1925 VM_BUG_ON(addr != -ENOMEM); 1926 info.flags = 0; 1927 info.low_limit = TASK_UNMAPPED_BASE; 1928 info.high_limit = TASK_SIZE; 1929 addr = vm_unmapped_area(&info); 1930 } 1931 1932 return addr; 1933 } 1934 #endif 1935 1936 unsigned long 1937 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 1938 unsigned long pgoff, unsigned long flags) 1939 { 1940 unsigned long (*get_area)(struct file *, unsigned long, 1941 unsigned long, unsigned long, unsigned long); 1942 1943 unsigned long error = arch_mmap_check(addr, len, flags); 1944 if (error) 1945 return error; 1946 1947 /* Careful about overflows.. */ 1948 if (len > TASK_SIZE) 1949 return -ENOMEM; 1950 1951 get_area = current->mm->get_unmapped_area; 1952 if (file && file->f_op->get_unmapped_area) 1953 get_area = file->f_op->get_unmapped_area; 1954 addr = get_area(file, addr, len, pgoff, flags); 1955 if (IS_ERR_VALUE(addr)) 1956 return addr; 1957 1958 if (addr > TASK_SIZE - len) 1959 return -ENOMEM; 1960 if (addr & ~PAGE_MASK) 1961 return -EINVAL; 1962 1963 addr = arch_rebalance_pgtables(addr, len); 1964 error = security_mmap_addr(addr); 1965 return error ? 
error : addr; 1966 } 1967 1968 EXPORT_SYMBOL(get_unmapped_area); 1969 1970 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 1971 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 1972 { 1973 struct vm_area_struct *vma = NULL; 1974 1975 /* Check the cache first. */ 1976 /* (Cache hit rate is typically around 35%.) */ 1977 vma = ACCESS_ONCE(mm->mmap_cache); 1978 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 1979 struct rb_node *rb_node; 1980 1981 rb_node = mm->mm_rb.rb_node; 1982 vma = NULL; 1983 1984 while (rb_node) { 1985 struct vm_area_struct *vma_tmp; 1986 1987 vma_tmp = rb_entry(rb_node, 1988 struct vm_area_struct, vm_rb); 1989 1990 if (vma_tmp->vm_end > addr) { 1991 vma = vma_tmp; 1992 if (vma_tmp->vm_start <= addr) 1993 break; 1994 rb_node = rb_node->rb_left; 1995 } else 1996 rb_node = rb_node->rb_right; 1997 } 1998 if (vma) 1999 mm->mmap_cache = vma; 2000 } 2001 return vma; 2002 } 2003 2004 EXPORT_SYMBOL(find_vma); 2005 2006 /* 2007 * Same as find_vma, but also return a pointer to the previous VMA in *pprev. 2008 */ 2009 struct vm_area_struct * 2010 find_vma_prev(struct mm_struct *mm, unsigned long addr, 2011 struct vm_area_struct **pprev) 2012 { 2013 struct vm_area_struct *vma; 2014 2015 vma = find_vma(mm, addr); 2016 if (vma) { 2017 *pprev = vma->vm_prev; 2018 } else { 2019 struct rb_node *rb_node = mm->mm_rb.rb_node; 2020 *pprev = NULL; 2021 while (rb_node) { 2022 *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); 2023 rb_node = rb_node->rb_right; 2024 } 2025 } 2026 return vma; 2027 } 2028 2029 /* 2030 * Verify that the stack growth is acceptable and 2031 * update accounting. This is shared with both the 2032 * grow-up and grow-down cases. 2033 */ 2034 static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) 2035 { 2036 struct mm_struct *mm = vma->vm_mm; 2037 struct rlimit *rlim = current->signal->rlim; 2038 unsigned long new_start; 2039 2040 /* address space limit tests */ 2041 if (!may_expand_vm(mm, grow)) 2042 return -ENOMEM; 2043 2044 /* Stack limit test */ 2045 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) 2046 return -ENOMEM; 2047 2048 /* mlock limit tests */ 2049 if (vma->vm_flags & VM_LOCKED) { 2050 unsigned long locked; 2051 unsigned long limit; 2052 locked = mm->locked_vm + grow; 2053 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); 2054 limit >>= PAGE_SHIFT; 2055 if (locked > limit && !capable(CAP_IPC_LOCK)) 2056 return -ENOMEM; 2057 } 2058 2059 /* Check to ensure the stack will not grow into a hugetlb-only region */ 2060 new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : 2061 vma->vm_end - size; 2062 if (is_hugepage_only_range(vma->vm_mm, new_start, size)) 2063 return -EFAULT; 2064 2065 /* 2066 * Overcommit.. This must be the final test, as it will 2067 * update security statistics. 2068 */ 2069 if (security_vm_enough_memory_mm(mm, grow)) 2070 return -ENOMEM; 2071 2072 /* Ok, everything looks good - let it rip */ 2073 if (vma->vm_flags & VM_LOCKED) 2074 mm->locked_vm += grow; 2075 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 2076 return 0; 2077 } 2078 2079 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) 2080 /* 2081 * PA-RISC uses this for its stack; IA64 for its Register Backing Store. 2082 * vma is the last one with address > vma->vm_end. Have to extend vma. 
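 *
 * Typical call path (sketch): a stack fault reaches expand_stack(), which
 * on CONFIG_STACK_GROWSUP kernels checks that the next vma leaves room for
 * a guard page and then calls expand_upwards(vma, address); the caller
 * holds mmap_sem for read throughout. See expand_stack() and
 * find_extend_vma() below.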
2083 */ 2084 int expand_upwards(struct vm_area_struct *vma, unsigned long address) 2085 { 2086 int error; 2087 2088 if (!(vma->vm_flags & VM_GROWSUP)) 2089 return -EFAULT; 2090 2091 /* 2092 * We must make sure the anon_vma is allocated 2093 * so that the anon_vma locking is not a noop. 2094 */ 2095 if (unlikely(anon_vma_prepare(vma))) 2096 return -ENOMEM; 2097 vma_lock_anon_vma(vma); 2098 2099 /* 2100 * vma->vm_start/vm_end cannot change under us because the caller 2101 * is required to hold the mmap_sem in read mode. We need the 2102 * anon_vma lock to serialize against concurrent expand_stacks. 2103 * Also guard against wrapping around to address 0. 2104 */ 2105 if (address < PAGE_ALIGN(address+4)) 2106 address = PAGE_ALIGN(address+4); 2107 else { 2108 vma_unlock_anon_vma(vma); 2109 return -ENOMEM; 2110 } 2111 error = 0; 2112 2113 /* Somebody else might have raced and expanded it already */ 2114 if (address > vma->vm_end) { 2115 unsigned long size, grow; 2116 2117 size = address - vma->vm_start; 2118 grow = (address - vma->vm_end) >> PAGE_SHIFT; 2119 2120 error = -ENOMEM; 2121 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 2122 error = acct_stack_growth(vma, size, grow); 2123 if (!error) { 2124 /* 2125 * vma_gap_update() doesn't support concurrent 2126 * updates, but we only hold a shared mmap_sem 2127 * lock here, so we need to protect against 2128 * concurrent vma expansions. 2129 * vma_lock_anon_vma() doesn't help here, as 2130 * we don't guarantee that all growable vmas 2131 * in a mm share the same root anon vma. 2132 * So, we reuse mm->page_table_lock to guard 2133 * against concurrent vma expansions. 2134 */ 2135 spin_lock(&vma->vm_mm->page_table_lock); 2136 anon_vma_interval_tree_pre_update_vma(vma); 2137 vma->vm_end = address; 2138 anon_vma_interval_tree_post_update_vma(vma); 2139 if (vma->vm_next) 2140 vma_gap_update(vma->vm_next); 2141 else 2142 vma->vm_mm->highest_vm_end = address; 2143 spin_unlock(&vma->vm_mm->page_table_lock); 2144 2145 perf_event_mmap(vma); 2146 } 2147 } 2148 } 2149 vma_unlock_anon_vma(vma); 2150 khugepaged_enter_vma_merge(vma); 2151 validate_mm(vma->vm_mm); 2152 return error; 2153 } 2154 #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 2155 2156 /* 2157 * vma is the first one with address < vma->vm_start. Have to extend vma. 2158 */ 2159 int expand_downwards(struct vm_area_struct *vma, 2160 unsigned long address) 2161 { 2162 int error; 2163 2164 /* 2165 * We must make sure the anon_vma is allocated 2166 * so that the anon_vma locking is not a noop. 2167 */ 2168 if (unlikely(anon_vma_prepare(vma))) 2169 return -ENOMEM; 2170 2171 address &= PAGE_MASK; 2172 error = security_mmap_addr(address); 2173 if (error) 2174 return error; 2175 2176 vma_lock_anon_vma(vma); 2177 2178 /* 2179 * vma->vm_start/vm_end cannot change under us because the caller 2180 * is required to hold the mmap_sem in read mode. We need the 2181 * anon_vma lock to serialize against concurrent expand_stacks. 2182 */ 2183 2184 /* Somebody else might have raced and expanded it already */ 2185 if (address < vma->vm_start) { 2186 unsigned long size, grow; 2187 2188 size = vma->vm_end - address; 2189 grow = (vma->vm_start - address) >> PAGE_SHIFT; 2190 2191 error = -ENOMEM; 2192 if (grow <= vma->vm_pgoff) { 2193 error = acct_stack_growth(vma, size, grow); 2194 if (!error) { 2195 /* 2196 * vma_gap_update() doesn't support concurrent 2197 * updates, but we only hold a shared mmap_sem 2198 * lock here, so we need to protect against 2199 * concurrent vma expansions. 
2200 * vma_lock_anon_vma() doesn't help here, as 2201 * we don't guarantee that all growable vmas 2202 * in a mm share the same root anon vma. 2203 * So, we reuse mm->page_table_lock to guard 2204 * against concurrent vma expansions. 2205 */ 2206 spin_lock(&vma->vm_mm->page_table_lock); 2207 anon_vma_interval_tree_pre_update_vma(vma); 2208 vma->vm_start = address; 2209 vma->vm_pgoff -= grow; 2210 anon_vma_interval_tree_post_update_vma(vma); 2211 vma_gap_update(vma); 2212 spin_unlock(&vma->vm_mm->page_table_lock); 2213 2214 perf_event_mmap(vma); 2215 } 2216 } 2217 } 2218 vma_unlock_anon_vma(vma); 2219 khugepaged_enter_vma_merge(vma); 2220 validate_mm(vma->vm_mm); 2221 return error; 2222 } 2223 2224 /* 2225 * Note how expand_stack() refuses to expand the stack all the way to 2226 * abut the next virtual mapping, *unless* that mapping itself is also 2227 * a stack mapping. We want to leave room for a guard page, after all 2228 * (the guard page itself is not added here, that is done by the 2229 * actual page faulting logic) 2230 * 2231 * This matches the behavior of the guard page logic (see mm/memory.c: 2232 * check_stack_guard_page()), which only allows the guard page to be 2233 * removed under these circumstances. 2234 */ 2235 #ifdef CONFIG_STACK_GROWSUP 2236 int expand_stack(struct vm_area_struct *vma, unsigned long address) 2237 { 2238 struct vm_area_struct *next; 2239 2240 address &= PAGE_MASK; 2241 next = vma->vm_next; 2242 if (next && next->vm_start == address + PAGE_SIZE) { 2243 if (!(next->vm_flags & VM_GROWSUP)) 2244 return -ENOMEM; 2245 } 2246 return expand_upwards(vma, address); 2247 } 2248 2249 struct vm_area_struct * 2250 find_extend_vma(struct mm_struct *mm, unsigned long addr) 2251 { 2252 struct vm_area_struct *vma, *prev; 2253 2254 addr &= PAGE_MASK; 2255 vma = find_vma_prev(mm, addr, &prev); 2256 if (vma && (vma->vm_start <= addr)) 2257 return vma; 2258 if (!prev || expand_stack(prev, addr)) 2259 return NULL; 2260 if (prev->vm_flags & VM_LOCKED) 2261 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); 2262 return prev; 2263 } 2264 #else 2265 int expand_stack(struct vm_area_struct *vma, unsigned long address) 2266 { 2267 struct vm_area_struct *prev; 2268 2269 address &= PAGE_MASK; 2270 prev = vma->vm_prev; 2271 if (prev && prev->vm_end == address) { 2272 if (!(prev->vm_flags & VM_GROWSDOWN)) 2273 return -ENOMEM; 2274 } 2275 return expand_downwards(vma, address); 2276 } 2277 2278 struct vm_area_struct * 2279 find_extend_vma(struct mm_struct * mm, unsigned long addr) 2280 { 2281 struct vm_area_struct * vma; 2282 unsigned long start; 2283 2284 addr &= PAGE_MASK; 2285 vma = find_vma(mm,addr); 2286 if (!vma) 2287 return NULL; 2288 if (vma->vm_start <= addr) 2289 return vma; 2290 if (!(vma->vm_flags & VM_GROWSDOWN)) 2291 return NULL; 2292 start = vma->vm_start; 2293 if (expand_stack(vma, addr)) 2294 return NULL; 2295 if (vma->vm_flags & VM_LOCKED) 2296 __mlock_vma_pages_range(vma, addr, start, NULL); 2297 return vma; 2298 } 2299 #endif 2300 2301 /* 2302 * Ok - we have the memory areas we should free on the vma list, 2303 * so release them, and do the vma updates. 2304 * 2305 * Called with the mm semaphore held. 
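 *
 * This is the final step of do_munmap() below: the vmas have already been
 * detached from the mm and their page tables freed by unmap_region(), so
 * only per-vma teardown and accounting remain.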
2306 */ 2307 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) 2308 { 2309 unsigned long nr_accounted = 0; 2310 2311 /* Update high watermark before we lower total_vm */ 2312 update_hiwater_vm(mm); 2313 do { 2314 long nrpages = vma_pages(vma); 2315 2316 if (vma->vm_flags & VM_ACCOUNT) 2317 nr_accounted += nrpages; 2318 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 2319 vma = remove_vma(vma); 2320 } while (vma); 2321 vm_unacct_memory(nr_accounted); 2322 validate_mm(mm); 2323 } 2324 2325 /* 2326 * Get rid of page table information in the indicated region. 2327 * 2328 * Called with the mm semaphore held. 2329 */ 2330 static void unmap_region(struct mm_struct *mm, 2331 struct vm_area_struct *vma, struct vm_area_struct *prev, 2332 unsigned long start, unsigned long end) 2333 { 2334 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; 2335 struct mmu_gather tlb; 2336 2337 lru_add_drain(); 2338 tlb_gather_mmu(&tlb, mm, start, end); 2339 update_hiwater_rss(mm); 2340 unmap_vmas(&tlb, vma, start, end); 2341 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 2342 next ? next->vm_start : USER_PGTABLES_CEILING); 2343 tlb_finish_mmu(&tlb, start, end); 2344 } 2345 2346 /* 2347 * Create a list of vma's touched by the unmap, removing them from the mm's 2348 * vma list as we go.. 2349 */ 2350 static void 2351 detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, 2352 struct vm_area_struct *prev, unsigned long end) 2353 { 2354 struct vm_area_struct **insertion_point; 2355 struct vm_area_struct *tail_vma = NULL; 2356 2357 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2358 vma->vm_prev = NULL; 2359 do { 2360 vma_rb_erase(vma, &mm->mm_rb); 2361 mm->map_count--; 2362 tail_vma = vma; 2363 vma = vma->vm_next; 2364 } while (vma && vma->vm_start < end); 2365 *insertion_point = vma; 2366 if (vma) { 2367 vma->vm_prev = prev; 2368 vma_gap_update(vma); 2369 } else 2370 mm->highest_vm_end = prev ? prev->vm_end : 0; 2371 tail_vma->vm_next = NULL; 2372 mm->mmap_cache = NULL; /* Kill the cache. */ 2373 } 2374 2375 /* 2376 * __split_vma() bypasses sysctl_max_map_count checking. We use this on the 2377 * munmap path where it doesn't make sense to fail. 2378 */ 2379 static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 2380 unsigned long addr, int new_below) 2381 { 2382 struct vm_area_struct *new; 2383 int err = -ENOMEM; 2384 2385 if (is_vm_hugetlb_page(vma) && (addr & 2386 ~(huge_page_mask(hstate_vma(vma))))) 2387 return -EINVAL; 2388 2389 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2390 if (!new) 2391 goto out_err; 2392 2393 /* most fields are the same, copy all, and then fixup */ 2394 *new = *vma; 2395 2396 INIT_LIST_HEAD(&new->anon_vma_chain); 2397 2398 if (new_below) 2399 new->vm_end = addr; 2400 else { 2401 new->vm_start = addr; 2402 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 2403 } 2404 2405 err = vma_dup_policy(vma, new); 2406 if (err) 2407 goto out_free_vma; 2408 2409 if (anon_vma_clone(new, vma)) 2410 goto out_free_mpol; 2411 2412 if (new->vm_file) 2413 get_file(new->vm_file); 2414 2415 if (new->vm_ops && new->vm_ops->open) 2416 new->vm_ops->open(new); 2417 2418 if (new_below) 2419 err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + 2420 ((addr - new->vm_start) >> PAGE_SHIFT), new); 2421 else 2422 err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); 2423 2424 /* Success. */ 2425 if (!err) 2426 return 0; 2427 2428 /* Clean everything up if vma_adjust failed. 
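 * Undo the setup above in reverse order: close the new vma, drop its file
 * reference, unlink its anon_vmas, then free its mempolicy and the vma
 * itself.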
*/ 2429 if (new->vm_ops && new->vm_ops->close) 2430 new->vm_ops->close(new); 2431 if (new->vm_file) 2432 fput(new->vm_file); 2433 unlink_anon_vmas(new); 2434 out_free_mpol: 2435 mpol_put(vma_policy(new)); 2436 out_free_vma: 2437 kmem_cache_free(vm_area_cachep, new); 2438 out_err: 2439 return err; 2440 } 2441 2442 /* 2443 * Split a vma into two pieces at address 'addr', a new vma is allocated 2444 * either for the first part or the tail. 2445 */ 2446 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, 2447 unsigned long addr, int new_below) 2448 { 2449 if (mm->map_count >= sysctl_max_map_count) 2450 return -ENOMEM; 2451 2452 return __split_vma(mm, vma, addr, new_below); 2453 } 2454 2455 /* Munmap is split into 2 main parts -- this part which finds 2456 * what needs doing, and the areas themselves, which do the 2457 * work. This now handles partial unmappings. 2458 * Jeremy Fitzhardinge <jeremy@goop.org> 2459 */ 2460 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 2461 { 2462 unsigned long end; 2463 struct vm_area_struct *vma, *prev, *last; 2464 2465 if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) 2466 return -EINVAL; 2467 2468 if ((len = PAGE_ALIGN(len)) == 0) 2469 return -EINVAL; 2470 2471 /* Find the first overlapping VMA */ 2472 vma = find_vma(mm, start); 2473 if (!vma) 2474 return 0; 2475 prev = vma->vm_prev; 2476 /* we have start < vma->vm_end */ 2477 2478 /* if it doesn't overlap, we have nothing.. */ 2479 end = start + len; 2480 if (vma->vm_start >= end) 2481 return 0; 2482 2483 /* 2484 * If we need to split any vma, do it now to save pain later. 2485 * 2486 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially 2487 * unmapped vm_area_struct will remain in use: so lower split_vma 2488 * places tmp vma above, and higher split_vma places tmp vma below. 2489 */ 2490 if (start > vma->vm_start) { 2491 int error; 2492 2493 /* 2494 * Make sure that map_count on return from munmap() will 2495 * not exceed its limit; but let map_count go just above 2496 * its limit temporarily, to help free resources as expected. 2497 */ 2498 if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) 2499 return -ENOMEM; 2500 2501 error = __split_vma(mm, vma, start, 0); 2502 if (error) 2503 return error; 2504 prev = vma; 2505 } 2506 2507 /* Does it split the last one? */ 2508 last = find_vma(mm, end); 2509 if (last && end > last->vm_start) { 2510 int error = __split_vma(mm, last, end, 1); 2511 if (error) 2512 return error; 2513 } 2514 vma = prev? 
prev->vm_next: mm->mmap; 2515 2516 /* 2517 * unlock any mlock()ed ranges before detaching vmas 2518 */ 2519 if (mm->locked_vm) { 2520 struct vm_area_struct *tmp = vma; 2521 while (tmp && tmp->vm_start < end) { 2522 if (tmp->vm_flags & VM_LOCKED) { 2523 mm->locked_vm -= vma_pages(tmp); 2524 munlock_vma_pages_all(tmp); 2525 } 2526 tmp = tmp->vm_next; 2527 } 2528 } 2529 2530 /* 2531 * Remove the vma's, and unmap the actual pages 2532 */ 2533 detach_vmas_to_be_unmapped(mm, vma, prev, end); 2534 unmap_region(mm, vma, prev, start, end); 2535 2536 /* Fix up all other VM information */ 2537 remove_vma_list(mm, vma); 2538 2539 return 0; 2540 } 2541 2542 int vm_munmap(unsigned long start, size_t len) 2543 { 2544 int ret; 2545 struct mm_struct *mm = current->mm; 2546 2547 down_write(&mm->mmap_sem); 2548 ret = do_munmap(mm, start, len); 2549 up_write(&mm->mmap_sem); 2550 return ret; 2551 } 2552 EXPORT_SYMBOL(vm_munmap); 2553 2554 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) 2555 { 2556 profile_munmap(addr); 2557 return vm_munmap(addr, len); 2558 } 2559 2560 static inline void verify_mm_writelocked(struct mm_struct *mm) 2561 { 2562 #ifdef CONFIG_DEBUG_VM 2563 if (unlikely(down_read_trylock(&mm->mmap_sem))) { 2564 WARN_ON(1); 2565 up_read(&mm->mmap_sem); 2566 } 2567 #endif 2568 } 2569 2570 /* 2571 * this is really a simplified "do_mmap". it only handles 2572 * anonymous maps. eventually we may be able to do some 2573 * brk-specific accounting here. 2574 */ 2575 static unsigned long do_brk(unsigned long addr, unsigned long len) 2576 { 2577 struct mm_struct * mm = current->mm; 2578 struct vm_area_struct * vma, * prev; 2579 unsigned long flags; 2580 struct rb_node ** rb_link, * rb_parent; 2581 pgoff_t pgoff = addr >> PAGE_SHIFT; 2582 int error; 2583 2584 len = PAGE_ALIGN(len); 2585 if (!len) 2586 return addr; 2587 2588 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2589 2590 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 2591 if (error & ~PAGE_MASK) 2592 return error; 2593 2594 /* 2595 * mlock MCL_FUTURE? 2596 */ 2597 if (mm->def_flags & VM_LOCKED) { 2598 unsigned long locked, lock_limit; 2599 locked = len >> PAGE_SHIFT; 2600 locked += mm->locked_vm; 2601 lock_limit = rlimit(RLIMIT_MEMLOCK); 2602 lock_limit >>= PAGE_SHIFT; 2603 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 2604 return -EAGAIN; 2605 } 2606 2607 /* 2608 * mm->mmap_sem is required to protect against another thread 2609 * changing the mappings in case we sleep. 2610 */ 2611 verify_mm_writelocked(mm); 2612 2613 /* 2614 * Clear old maps. this also does some error checking for us 2615 */ 2616 munmap_back: 2617 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 2618 if (do_munmap(mm, addr, len)) 2619 return -ENOMEM; 2620 goto munmap_back; 2621 } 2622 2623 /* Check against address space limits *after* clearing old maps... */ 2624 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 2625 return -ENOMEM; 2626 2627 if (mm->map_count > sysctl_max_map_count) 2628 return -ENOMEM; 2629 2630 if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) 2631 return -ENOMEM; 2632 2633 /* Can we just expand an old private anonymous mapping? 
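 * vma_merge() will simply extend a compatible predecessor (same flags,
 * mergeable anon_vma) in place, so no new vm_area_struct is needed.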
*/ 2634 vma = vma_merge(mm, prev, addr, addr + len, flags, 2635 NULL, NULL, pgoff, NULL); 2636 if (vma) 2637 goto out; 2638 2639 /* 2640 * create a vma struct for an anonymous mapping 2641 */ 2642 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 2643 if (!vma) { 2644 vm_unacct_memory(len >> PAGE_SHIFT); 2645 return -ENOMEM; 2646 } 2647 2648 INIT_LIST_HEAD(&vma->anon_vma_chain); 2649 vma->vm_mm = mm; 2650 vma->vm_start = addr; 2651 vma->vm_end = addr + len; 2652 vma->vm_pgoff = pgoff; 2653 vma->vm_flags = flags; 2654 vma->vm_page_prot = vm_get_page_prot(flags); 2655 vma_link(mm, vma, prev, rb_link, rb_parent); 2656 out: 2657 perf_event_mmap(vma); 2658 mm->total_vm += len >> PAGE_SHIFT; 2659 if (flags & VM_LOCKED) 2660 mm->locked_vm += (len >> PAGE_SHIFT); 2661 vma->vm_flags |= VM_SOFTDIRTY; 2662 return addr; 2663 } 2664 2665 unsigned long vm_brk(unsigned long addr, unsigned long len) 2666 { 2667 struct mm_struct *mm = current->mm; 2668 unsigned long ret; 2669 bool populate; 2670 2671 down_write(&mm->mmap_sem); 2672 ret = do_brk(addr, len); 2673 populate = ((mm->def_flags & VM_LOCKED) != 0); 2674 up_write(&mm->mmap_sem); 2675 if (populate) 2676 mm_populate(addr, len); 2677 return ret; 2678 } 2679 EXPORT_SYMBOL(vm_brk); 2680 2681 /* Release all mmaps. */ 2682 void exit_mmap(struct mm_struct *mm) 2683 { 2684 struct mmu_gather tlb; 2685 struct vm_area_struct *vma; 2686 unsigned long nr_accounted = 0; 2687 2688 /* mm's last user has gone, and its about to be pulled down */ 2689 mmu_notifier_release(mm); 2690 2691 if (mm->locked_vm) { 2692 vma = mm->mmap; 2693 while (vma) { 2694 if (vma->vm_flags & VM_LOCKED) 2695 munlock_vma_pages_all(vma); 2696 vma = vma->vm_next; 2697 } 2698 } 2699 2700 arch_exit_mmap(mm); 2701 2702 vma = mm->mmap; 2703 if (!vma) /* Can happen if dup_mmap() received an OOM */ 2704 return; 2705 2706 lru_add_drain(); 2707 flush_cache_mm(mm); 2708 tlb_gather_mmu(&tlb, mm, 0, -1); 2709 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2710 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2711 unmap_vmas(&tlb, vma, 0, -1); 2712 2713 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); 2714 tlb_finish_mmu(&tlb, 0, -1); 2715 2716 /* 2717 * Walk the list again, actually closing and freeing it, 2718 * with preemption enabled, without holding any MM locks. 2719 */ 2720 while (vma) { 2721 if (vma->vm_flags & VM_ACCOUNT) 2722 nr_accounted += vma_pages(vma); 2723 vma = remove_vma(vma); 2724 } 2725 vm_unacct_memory(nr_accounted); 2726 2727 WARN_ON(atomic_long_read(&mm->nr_ptes) > 2728 (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 2729 } 2730 2731 /* Insert vm structure into process list sorted by address 2732 * and into the inode's i_mmap tree. If vm_file is non-NULL 2733 * then i_mmap_mutex is taken here. 2734 */ 2735 int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 2736 { 2737 struct vm_area_struct *prev; 2738 struct rb_node **rb_link, *rb_parent; 2739 2740 /* 2741 * The vm_pgoff of a purely anonymous vma should be irrelevant 2742 * until its first write fault, when page's anon_vma and index 2743 * are set. But now set the vm_pgoff it will almost certainly 2744 * end up with (unless mremap moves it elsewhere before that 2745 * first wfault), so /proc/pid/maps tells a consistent story. 2746 * 2747 * By setting it to reflect the virtual start address of the 2748 * vma, merges and splits can happen in a seamless way, just 2749 * using the existing file pgoff checks and manipulations. 
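 * (For example, with 4K pages an anonymous vma whose vm_start is
 * 0x00400000 gets vm_pgoff 0x400.)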
2750 * Similarly in do_mmap_pgoff and in do_brk. 2751 */ 2752 if (!vma->vm_file) { 2753 BUG_ON(vma->anon_vma); 2754 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 2755 } 2756 if (find_vma_links(mm, vma->vm_start, vma->vm_end, 2757 &prev, &rb_link, &rb_parent)) 2758 return -ENOMEM; 2759 if ((vma->vm_flags & VM_ACCOUNT) && 2760 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2761 return -ENOMEM; 2762 2763 vma_link(mm, vma, prev, rb_link, rb_parent); 2764 return 0; 2765 } 2766 2767 /* 2768 * Copy the vma structure to a new location in the same mm, 2769 * prior to moving page table entries, to effect an mremap move. 2770 */ 2771 struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 2772 unsigned long addr, unsigned long len, pgoff_t pgoff, 2773 bool *need_rmap_locks) 2774 { 2775 struct vm_area_struct *vma = *vmap; 2776 unsigned long vma_start = vma->vm_start; 2777 struct mm_struct *mm = vma->vm_mm; 2778 struct vm_area_struct *new_vma, *prev; 2779 struct rb_node **rb_link, *rb_parent; 2780 bool faulted_in_anon_vma = true; 2781 2782 /* 2783 * If anonymous vma has not yet been faulted, update new pgoff 2784 * to match new location, to increase its chance of merging. 2785 */ 2786 if (unlikely(!vma->vm_file && !vma->anon_vma)) { 2787 pgoff = addr >> PAGE_SHIFT; 2788 faulted_in_anon_vma = false; 2789 } 2790 2791 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) 2792 return NULL; /* should never get here */ 2793 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 2794 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 2795 if (new_vma) { 2796 /* 2797 * Source vma may have been merged into new_vma 2798 */ 2799 if (unlikely(vma_start >= new_vma->vm_start && 2800 vma_start < new_vma->vm_end)) { 2801 /* 2802 * The only way we can get a vma_merge with 2803 * self during an mremap is if the vma hasn't 2804 * been faulted in yet and we were allowed to 2805 * reset the dst vma->vm_pgoff to the 2806 * destination address of the mremap to allow 2807 * the merge to happen. mremap must change the 2808 * vm_pgoff linearity between src and dst vmas 2809 * (in turn preventing a vma_merge) to be 2810 * safe. It is only safe to keep the vm_pgoff 2811 * linear if there are no pages mapped yet. 
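 * Hence the VM_BUG_ON below: merging back onto the source vma is only
 * legitimate when it has never been faulted in.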
2812 */ 2813 VM_BUG_ON(faulted_in_anon_vma); 2814 *vmap = vma = new_vma; 2815 } 2816 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); 2817 } else { 2818 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2819 if (new_vma) { 2820 *new_vma = *vma; 2821 new_vma->vm_start = addr; 2822 new_vma->vm_end = addr + len; 2823 new_vma->vm_pgoff = pgoff; 2824 if (vma_dup_policy(vma, new_vma)) 2825 goto out_free_vma; 2826 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2827 if (anon_vma_clone(new_vma, vma)) 2828 goto out_free_mempol; 2829 if (new_vma->vm_file) 2830 get_file(new_vma->vm_file); 2831 if (new_vma->vm_ops && new_vma->vm_ops->open) 2832 new_vma->vm_ops->open(new_vma); 2833 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2834 *need_rmap_locks = false; 2835 } 2836 } 2837 return new_vma; 2838 2839 out_free_mempol: 2840 mpol_put(vma_policy(new_vma)); 2841 out_free_vma: 2842 kmem_cache_free(vm_area_cachep, new_vma); 2843 return NULL; 2844 } 2845 2846 /* 2847 * Return true if the calling process may expand its vm space by the passed 2848 * number of pages 2849 */ 2850 int may_expand_vm(struct mm_struct *mm, unsigned long npages) 2851 { 2852 unsigned long cur = mm->total_vm; /* pages */ 2853 unsigned long lim; 2854 2855 lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; 2856 2857 if (cur + npages > lim) 2858 return 0; 2859 return 1; 2860 } 2861 2862 2863 static int special_mapping_fault(struct vm_area_struct *vma, 2864 struct vm_fault *vmf) 2865 { 2866 pgoff_t pgoff; 2867 struct page **pages; 2868 2869 /* 2870 * special mappings have no vm_file, and in that case, the mm 2871 * uses vm_pgoff internally. So we have to subtract it from here. 2872 * We are allowed to do this because we are the mm; do not copy 2873 * this code into drivers! 2874 */ 2875 pgoff = vmf->pgoff - vma->vm_pgoff; 2876 2877 for (pages = vma->vm_private_data; pgoff && *pages; ++pages) 2878 pgoff--; 2879 2880 if (*pages) { 2881 struct page *page = *pages; 2882 get_page(page); 2883 vmf->page = page; 2884 return 0; 2885 } 2886 2887 return VM_FAULT_SIGBUS; 2888 } 2889 2890 /* 2891 * Having a close hook prevents vma merging regardless of flags. 2892 */ 2893 static void special_mapping_close(struct vm_area_struct *vma) 2894 { 2895 } 2896 2897 static const struct vm_operations_struct special_mapping_vmops = { 2898 .close = special_mapping_close, 2899 .fault = special_mapping_fault, 2900 }; 2901 2902 /* 2903 * Called with mm->mmap_sem held for writing. 2904 * Insert a new vma covering the given region, with the given flags. 2905 * Its pages are supplied by the given array of struct page *. 2906 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. 2907 * The region past the last page supplied will always produce SIGBUS. 2908 * The array pointer and the pages it points to are assumed to stay alive 2909 * for as long as this mapping might exist. 
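 *
 * A minimal usage sketch (illustrative only; my_page and the exact flag
 * choice are hypothetical, in the spirit of the vDSO-style mappings that
 * use this interface):
 *
 *	static struct page *pages[2] = { my_page, NULL };
 *
 *	err = install_special_mapping(mm, addr, PAGE_SIZE,
 *				      VM_READ | VM_EXEC |
 *				      VM_MAYREAD | VM_MAYEXEC,
 *				      pages);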
2910 */ 2911 int install_special_mapping(struct mm_struct *mm, 2912 unsigned long addr, unsigned long len, 2913 unsigned long vm_flags, struct page **pages) 2914 { 2915 int ret; 2916 struct vm_area_struct *vma; 2917 2918 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 2919 if (unlikely(vma == NULL)) 2920 return -ENOMEM; 2921 2922 INIT_LIST_HEAD(&vma->anon_vma_chain); 2923 vma->vm_mm = mm; 2924 vma->vm_start = addr; 2925 vma->vm_end = addr + len; 2926 2927 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; 2928 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 2929 2930 vma->vm_ops = &special_mapping_vmops; 2931 vma->vm_private_data = pages; 2932 2933 ret = insert_vm_struct(mm, vma); 2934 if (ret) 2935 goto out; 2936 2937 mm->total_vm += len >> PAGE_SHIFT; 2938 2939 perf_event_mmap(vma); 2940 2941 return 0; 2942 2943 out: 2944 kmem_cache_free(vm_area_cachep, vma); 2945 return ret; 2946 } 2947 2948 static DEFINE_MUTEX(mm_all_locks_mutex); 2949 2950 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2951 { 2952 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { 2953 /* 2954 * The LSB of head.next can't change from under us 2955 * because we hold the mm_all_locks_mutex. 2956 */ 2957 down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem); 2958 /* 2959 * We can safely modify head.next after taking the 2960 * anon_vma->root->rwsem. If some other vma in this mm shares 2961 * the same anon_vma we won't take it again. 2962 * 2963 * No need of atomic instructions here, head.next 2964 * can't change from under us thanks to the 2965 * anon_vma->root->rwsem. 2966 */ 2967 if (__test_and_set_bit(0, (unsigned long *) 2968 &anon_vma->root->rb_root.rb_node)) 2969 BUG(); 2970 } 2971 } 2972 2973 static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) 2974 { 2975 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 2976 /* 2977 * AS_MM_ALL_LOCKS can't change from under us because 2978 * we hold the mm_all_locks_mutex. 2979 * 2980 * Operations on ->flags have to be atomic because 2981 * even if AS_MM_ALL_LOCKS is stable thanks to the 2982 * mm_all_locks_mutex, there may be other cpus 2983 * changing other bitflags in parallel to us. 2984 */ 2985 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 2986 BUG(); 2987 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem); 2988 } 2989 } 2990 2991 /* 2992 * This operation locks against the VM for all pte/vma/mm related 2993 * operations that could ever happen on a certain mm. This includes 2994 * vmtruncate, try_to_unmap, and all page faults. 2995 * 2996 * The caller must take the mmap_sem in write mode before calling 2997 * mm_take_all_locks(). The caller isn't allowed to release the 2998 * mmap_sem until mm_drop_all_locks() returns. 2999 * 3000 * mmap_sem in write mode is required in order to block all operations 3001 * that could modify pagetables and free pages without need of 3002 * altering the vma layout (for example populate_range() with 3003 * nonlinear vmas). It's also needed in write mode to avoid new 3004 * anon_vmas to be associated with existing vmas. 3005 * 3006 * A single task can't take more than one mm_take_all_locks() in a row 3007 * or it would deadlock. 3008 * 3009 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in 3010 * mapping->flags avoid to take the same lock twice, if more than one 3011 * vma in this mm is backed by the same anon_vma or address_space. 
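 *
 * The expected calling pattern is roughly (illustrative sketch):
 *
 *	down_write(&mm->mmap_sem);
 *	if (mm_take_all_locks(mm))
 *		goto out;	(failed: -EINTR, locks already dropped)
 *	... operate on every vma in the mm ...
 *	mm_drop_all_locks(mm);
 * out:
 *	up_write(&mm->mmap_sem);
 *
 * This is, for instance, how mmu_notifier registration serializes against
 * the whole address space.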
3012 * 3013 * We can take all the locks in random order because the VM code 3014 * taking i_mmap_mutex or anon_vma->rwsem outside the mmap_sem never 3015 * takes more than one of them in a row. Secondly we're protected 3016 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 3017 * 3018 * mm_take_all_locks() and mm_drop_all_locks are expensive operations 3019 * that may have to take thousand of locks. 3020 * 3021 * mm_take_all_locks() can fail if it's interrupted by signals. 3022 */ 3023 int mm_take_all_locks(struct mm_struct *mm) 3024 { 3025 struct vm_area_struct *vma; 3026 struct anon_vma_chain *avc; 3027 3028 BUG_ON(down_read_trylock(&mm->mmap_sem)); 3029 3030 mutex_lock(&mm_all_locks_mutex); 3031 3032 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3033 if (signal_pending(current)) 3034 goto out_unlock; 3035 if (vma->vm_file && vma->vm_file->f_mapping) 3036 vm_lock_mapping(mm, vma->vm_file->f_mapping); 3037 } 3038 3039 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3040 if (signal_pending(current)) 3041 goto out_unlock; 3042 if (vma->anon_vma) 3043 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 3044 vm_lock_anon_vma(mm, avc->anon_vma); 3045 } 3046 3047 return 0; 3048 3049 out_unlock: 3050 mm_drop_all_locks(mm); 3051 return -EINTR; 3052 } 3053 3054 static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 3055 { 3056 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { 3057 /* 3058 * The LSB of head.next can't change to 0 from under 3059 * us because we hold the mm_all_locks_mutex. 3060 * 3061 * We must however clear the bitflag before unlocking 3062 * the vma so the users using the anon_vma->rb_root will 3063 * never see our bitflag. 3064 * 3065 * No need of atomic instructions here, head.next 3066 * can't change from under us until we release the 3067 * anon_vma->root->rwsem. 3068 */ 3069 if (!__test_and_clear_bit(0, (unsigned long *) 3070 &anon_vma->root->rb_root.rb_node)) 3071 BUG(); 3072 anon_vma_unlock_write(anon_vma); 3073 } 3074 } 3075 3076 static void vm_unlock_mapping(struct address_space *mapping) 3077 { 3078 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { 3079 /* 3080 * AS_MM_ALL_LOCKS can't change to 0 from under us 3081 * because we hold the mm_all_locks_mutex. 3082 */ 3083 mutex_unlock(&mapping->i_mmap_mutex); 3084 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 3085 &mapping->flags)) 3086 BUG(); 3087 } 3088 } 3089 3090 /* 3091 * The mmap_sem cannot be released by the caller until 3092 * mm_drop_all_locks() returns. 3093 */ 3094 void mm_drop_all_locks(struct mm_struct *mm) 3095 { 3096 struct vm_area_struct *vma; 3097 struct anon_vma_chain *avc; 3098 3099 BUG_ON(down_read_trylock(&mm->mmap_sem)); 3100 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); 3101 3102 for (vma = mm->mmap; vma; vma = vma->vm_next) { 3103 if (vma->anon_vma) 3104 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 3105 vm_unlock_anon_vma(avc->anon_vma); 3106 if (vma->vm_file && vma->vm_file->f_mapping) 3107 vm_unlock_mapping(vma->vm_file->f_mapping); 3108 } 3109 3110 mutex_unlock(&mm_all_locks_mutex); 3111 } 3112 3113 /* 3114 * initialise the VMA slab 3115 */ 3116 void __init mmap_init(void) 3117 { 3118 int ret; 3119 3120 ret = percpu_counter_init(&vm_committed_as, 0); 3121 VM_BUG_ON(ret); 3122 } 3123 3124 /* 3125 * Initialise sysctl_user_reserve_kbytes. 3126 * 3127 * This is intended to prevent a user from starting a single memory hogging 3128 * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER 3129 * mode. 
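 *
 * For example, a machine booting with 1GB free gets a reserve of
 * 1048576kB / 32 = 32768kB (32MB); with 4GB or more free, the 128MB cap
 * below applies.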
3130 *
3131 * The default value is min(3% of free memory, 128MB)
3132 * 128MB is enough to recover with sshd/login, bash, and top/kill.
3133 */
3134 static int init_user_reserve(void)
3135 {
3136 	unsigned long free_kbytes;
3137
3138 	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3139
3140 	sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
3141 	return 0;
3142 }
3143 module_init(init_user_reserve)
3144
3145 /*
3146 * Initialise sysctl_admin_reserve_kbytes.
3147 *
3148 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
3149 * to log in and kill a memory hogging process.
3150 *
3151 * Systems with more than 256MB will reserve 8MB, enough to recover
3152 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
3153 * only reserve 3% of free pages by default.
3154 */
3155 static int init_admin_reserve(void)
3156 {
3157 	unsigned long free_kbytes;
3158
3159 	free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3160
3161 	sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
3162 	return 0;
3163 }
3164 module_init(init_admin_reserve)
3165
3166 /*
3167 * Reinitialise user and admin reserves if memory is added or removed.
3168 *
3169 * The default user reserve max is 128MB, and the default max for the
3170 * admin reserve is 8MB. These are usually, but not always, enough to
3171 * enable recovery from a memory hogging process using login/sshd, a shell,
3172 * and tools like top. It may make sense to increase or even disable the
3173 * reserve depending on the existence of swap or variations in the recovery
3174 * tools. So, the admin may have changed them.
3175 *
3176 * If memory is added and the reserves have been eliminated or increased above
3177 * the default max, then we'll trust the admin.
3178 *
3179 * If memory is removed and there isn't enough free memory, then we
3180 * need to reset the reserves.
3181 *
3182 * Otherwise keep the reserve set by the admin.
3183 */
3184 static int reserve_mem_notifier(struct notifier_block *nb,
3185 			unsigned long action, void *data)
3186 {
3187 	unsigned long tmp, free_kbytes;
3188
3189 	switch (action) {
3190 	case MEM_ONLINE:
3191 		/* Default max is 128MB. Leave alone if modified by operator. */
3192 		tmp = sysctl_user_reserve_kbytes;
3193 		if (0 < tmp && tmp < (1UL << 17))
3194 			init_user_reserve();
3195
3196 		/* Default max is 8MB. Leave alone if modified by operator. */
3197 		tmp = sysctl_admin_reserve_kbytes;
3198 		if (0 < tmp && tmp < (1UL << 13))
3199 			init_admin_reserve();
3200
3201 		break;
3202 	case MEM_OFFLINE:
3203 		free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
3204
3205 		if (sysctl_user_reserve_kbytes > free_kbytes) {
3206 			init_user_reserve();
3207 			pr_info("vm.user_reserve_kbytes reset to %lu\n",
3208 				sysctl_user_reserve_kbytes);
3209 		}
3210
3211 		if (sysctl_admin_reserve_kbytes > free_kbytes) {
3212 			init_admin_reserve();
3213 			pr_info("vm.admin_reserve_kbytes reset to %lu\n",
3214 				sysctl_admin_reserve_kbytes);
3215 		}
3216 		break;
3217 	default:
3218 		break;
3219 	}
3220 	return NOTIFY_OK;
3221 }
3222
3223 static struct notifier_block reserve_mem_nb = {
3224 	.notifier_call = reserve_mem_notifier,
3225 };
3226
3227 static int __meminit init_reserve_notifier(void)
3228 {
3229 	if (register_hotmemory_notifier(&reserve_mem_nb))
3230 		printk(KERN_ERR "Failed registering memory add/remove notifier for admin reserve\n");
3231
3232 	return 0;
3233 }
3234 module_init(init_reserve_notifier)
3235
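
/*
 * The reserves computed above are exposed as the vm.user_reserve_kbytes
 * and vm.admin_reserve_kbytes sysctls. An administrator can inspect or
 * override them at run time, e.g. (illustrative):
 *
 *	# cat /proc/sys/vm/user_reserve_kbytes
 *	# sysctl -w vm.admin_reserve_kbytes=16384
 *
 * subject to the hotplug reset policy in reserve_mem_notifier() above.
 */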