/*
 *	mm/mremap.c
 *
 *	(C) Copyright 1996 Linus Torvalds
 *
 *	Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *	(C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>

#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include "internal.h"

static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	if (pgd_none_or_clear_bad(pgd))
		return NULL;

	pud = pud_offset(pgd, addr);
	if (pud_none_or_clear_bad(pud))
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;

	return pmd;
}

static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
			    unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (!pud)
		return NULL;

	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;

	VM_BUG_ON(pmd_trans_huge(*pmd));

	return pmd;
}
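/*
 * move_ptes - move a range of ptes (at most one pmd's worth) from the old
 * mapping to the new one.  Both page table locks are held for the duration,
 * and the rmap locks are taken as well when the caller asks for them, so
 * that rmap walkers always observe either the old or the new pte for every
 * page in the range.
 */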
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
		unsigned long old_addr, unsigned long old_end,
		struct vm_area_struct *new_vma, pmd_t *new_pmd,
		unsigned long new_addr, bool need_rmap_locks)
{
	struct address_space *mapping = NULL;
	struct anon_vma *anon_vma = NULL;
	struct mm_struct *mm = vma->vm_mm;
	pte_t *old_pte, *new_pte, pte;
	spinlock_t *old_ptl, *new_ptl;

	/*
	 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
	 * locks to ensure that rmap will always observe either the old or the
	 * new ptes. This is the easiest way to avoid races with
	 * truncate_pagecache(), page migration, etc...
	 *
	 * When need_rmap_locks is false, we use other ways to avoid
	 * such races:
	 *
	 * - During exec() shift_arg_pages(), we use a specially tagged vma
	 *   which rmap call sites look for using is_vma_temporary_stack().
	 *
	 * - During mremap(), new_vma is often known to be placed after vma
	 *   in rmap traversal order. This ensures rmap will always observe
	 *   either the old pte, or the new pte, or both (the page table locks
	 *   serialize access to individual ptes, but only rmap traversal
	 *   order guarantees that we won't miss both the old and new ptes).
	 */
	if (need_rmap_locks) {
		if (vma->vm_file) {
			mapping = vma->vm_file->f_mapping;
			mutex_lock(&mapping->i_mmap_mutex);
		}
		if (vma->anon_vma) {
			anon_vma = vma->anon_vma;
			anon_vma_lock_write(anon_vma);
		}
	}

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_sem prevents deadlock.
	 */
	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
	new_pte = pte_offset_map(new_pmd, new_addr);
	new_ptl = pte_lockptr(mm, new_pmd);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	arch_enter_lazy_mmu_mode();

	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
				   new_pte++, new_addr += PAGE_SIZE) {
		if (pte_none(*old_pte))
			continue;
		pte = ptep_get_and_clear(mm, old_addr, old_pte);
		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
		set_pte_at(mm, new_addr, new_pte, pte);
	}

	arch_leave_lazy_mmu_mode();
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	pte_unmap(new_pte - 1);
	pte_unmap_unlock(old_pte - 1, old_ptl);
	if (anon_vma)
		anon_vma_unlock(anon_vma);
	if (mapping)
		mutex_unlock(&mapping->i_mmap_mutex);
}

#define LATENCY_LIMIT	(64 * PAGE_SIZE)

unsigned long move_page_tables(struct vm_area_struct *vma,
		unsigned long old_addr, struct vm_area_struct *new_vma,
		unsigned long new_addr, unsigned long len,
		bool need_rmap_locks)
{
	unsigned long extent, next, old_end;
	pmd_t *old_pmd, *new_pmd;
	bool need_flush = false;
	unsigned long mmun_start;	/* For mmu_notifiers */
	unsigned long mmun_end;		/* For mmu_notifiers */

	old_end = old_addr + len;
	flush_cache_range(vma, old_addr, old_end);

	mmun_start = old_addr;
	mmun_end = old_end;
	mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);

	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
		cond_resched();
		next = (old_addr + PMD_SIZE) & PMD_MASK;
		/* even if next overflowed, extent below will be ok */
		extent = next - old_addr;
		if (extent > old_end - old_addr)
			extent = old_end - old_addr;
		old_pmd = get_old_pmd(vma->vm_mm, old_addr);
		if (!old_pmd)
			continue;
		new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
		if (!new_pmd)
			break;
		if (pmd_trans_huge(*old_pmd)) {
			int err = 0;
			if (extent == HPAGE_PMD_SIZE)
				err = move_huge_pmd(vma, new_vma, old_addr,
						    new_addr, old_end,
						    old_pmd, new_pmd);
			if (err > 0) {
				need_flush = true;
				continue;
			} else if (!err) {
				split_huge_page_pmd(vma, old_addr, old_pmd);
			}
			VM_BUG_ON(pmd_trans_huge(*old_pmd));
		}
		if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
						      new_pmd, new_addr))
			break;
		next = (new_addr + PMD_SIZE) & PMD_MASK;
		if (extent > next - new_addr)
			extent = next - new_addr;
		if (extent > LATENCY_LIMIT)
			extent = LATENCY_LIMIT;
		move_ptes(vma, old_pmd, old_addr, old_addr + extent,
			  new_vma, new_pmd, new_addr, need_rmap_locks);
		need_flush = true;
	}
	if (likely(need_flush))
		flush_tlb_range(vma, old_end-len, old_addr);

	mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);

	return len + old_addr - old_end;	/* how much done */
}
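/*
 * move_vma - relocate a vma: set up new_vma with copy_vma(), move the page
 * tables across with move_page_tables(), then unmap the old range and fix
 * up the accounting.  If the page-table move fails part way, the entries
 * already moved are put back and the new area is unmapped instead, so the
 * caller sees -ENOMEM with the old mapping intact.
 */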
static unsigned long move_vma(struct vm_area_struct *vma,
		unsigned long old_addr, unsigned long old_len,
		unsigned long new_len, unsigned long new_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct vm_area_struct *new_vma;
	unsigned long vm_flags = vma->vm_flags;
	unsigned long new_pgoff;
	unsigned long moved_len;
	unsigned long excess = 0;
	unsigned long hiwater_vm;
	int split = 0;
	int err;
	bool need_rmap_locks;

	/*
	 * We'd prefer to avoid failure later on in do_munmap:
	 * which may split one vma into three before unmapping.
	 */
	if (mm->map_count >= sysctl_max_map_count - 3)
		return -ENOMEM;

	/*
	 * Advise KSM to break any KSM pages in the area to be moved:
	 * it would be confusing if they were to turn up at the new
	 * location, where they happen to coincide with different KSM
	 * pages recently unmapped.  But leave vma->vm_flags as it was,
	 * so KSM can come around to merge on vma and new_vma afterwards.
	 */
	err = ksm_madvise(vma, old_addr, old_addr + old_len,
			  MADV_UNMERGEABLE, &vm_flags);
	if (err)
		return err;

	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
			   &need_rmap_locks);
	if (!new_vma)
		return -ENOMEM;

	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
				     need_rmap_locks);
	if (moved_len < old_len) {
		/*
		 * On error, move entries back from new area to old,
		 * which will succeed since page tables still there,
		 * and then proceed to unmap new area instead of old.
		 */
		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
				 true);
		vma = new_vma;
		old_len = new_len;
		old_addr = new_addr;
		new_addr = -ENOMEM;
	}

	/* Conceal VM_ACCOUNT so old reservation is not undone */
	if (vm_flags & VM_ACCOUNT) {
		vma->vm_flags &= ~VM_ACCOUNT;
		excess = vma->vm_end - vma->vm_start - old_len;
		if (old_addr > vma->vm_start &&
		    old_addr + old_len < vma->vm_end)
			split = 1;
	}

	/*
	 * If we failed to move page tables we still do total_vm increment
	 * since do_munmap() will decrement it by old_len == new_len.
	 *
	 * Since total_vm is about to be raised artificially high for a
	 * moment, we need to restore high watermark afterwards: if stats
	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
	 * If this were a serious issue, we'd add a flag to do_munmap().
	 */
	hiwater_vm = mm->hiwater_vm;
	vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);

	if (do_munmap(mm, old_addr, old_len) < 0) {
		/* OOM: unable to split vma, just get accounts right */
		vm_unacct_memory(excess >> PAGE_SHIFT);
		excess = 0;
	}
	mm->hiwater_vm = hiwater_vm;

	/* Restore VM_ACCOUNT if one or two pieces of vma left */
	if (excess) {
		vma->vm_flags |= VM_ACCOUNT;
		if (split)
			vma->vm_next->vm_flags |= VM_ACCOUNT;
	}

	if (vm_flags & VM_LOCKED) {
		mm->locked_vm += new_len >> PAGE_SHIFT;
		if (new_len > old_len)
			mlock_vma_pages_range(new_vma, new_addr + old_len,
					      new_addr + new_len);
	}

	return new_addr;
}
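/*
 * vma_to_resize - find the vma covering [addr, addr+old_len) and check that
 * it may be resized to new_len: the range must lie within a single,
 * non-hugetlb vma, and a growing mapping must respect VM_DONTEXPAND/
 * VM_PFNMAP, vm_pgoff wraparound, RLIMIT_MEMLOCK and the commit limit.
 * Pages charged against VM_ACCOUNT are returned through *p; errors are
 * returned as ERR_PTR() values.
 */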
static struct vm_area_struct *vma_to_resize(unsigned long addr,
	unsigned long old_len, unsigned long new_len, unsigned long *p)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma = find_vma(mm, addr);

	if (!vma || vma->vm_start > addr)
		goto Efault;

	if (is_vm_hugetlb_page(vma))
		goto Einval;

	/* We can't remap across vm area boundaries */
	if (old_len > vma->vm_end - addr)
		goto Efault;

	/* Need to be careful about a growing mapping */
	if (new_len > old_len) {
		unsigned long pgoff;

		if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
			goto Efault;
		pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
		pgoff += vma->vm_pgoff;
		if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
			goto Einval;
	}

	if (vma->vm_flags & VM_LOCKED) {
		unsigned long locked, lock_limit;
		locked = mm->locked_vm << PAGE_SHIFT;
		lock_limit = rlimit(RLIMIT_MEMLOCK);
		locked += new_len - old_len;
		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
			goto Eagain;
	}

	if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
		goto Enomem;

	if (vma->vm_flags & VM_ACCOUNT) {
		unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
		if (security_vm_enough_memory_mm(mm, charged))
			goto Efault;
		*p = charged;
	}

	return vma;

Efault:	/* very odd choice for most of the cases, but... */
	return ERR_PTR(-EFAULT);
Einval:
	return ERR_PTR(-EINVAL);
Enomem:
	return ERR_PTR(-ENOMEM);
Eagain:
	return ERR_PTR(-EAGAIN);
}
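/*
 * mremap_to - handle the MREMAP_FIXED case: validate the caller-supplied
 * new_addr, make sure the new range does not overlap the old one, unmap
 * whatever currently lives at new_addr, then move (and possibly shrink)
 * the mapping there via move_vma().
 */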
static unsigned long mremap_to(unsigned long addr,
	unsigned long old_len, unsigned long new_addr,
	unsigned long new_len)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	unsigned long charged = 0;
	unsigned long map_flags;

	if (new_addr & ~PAGE_MASK)
		goto out;

	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
		goto out;

	/* Check if the location we're moving into overlaps the
	 * old location at all, and fail if it does.
	 */
	if ((new_addr <= addr) && (new_addr+new_len) > addr)
		goto out;

	if ((addr <= new_addr) && (addr+old_len) > new_addr)
		goto out;

	ret = do_munmap(mm, new_addr, new_len);
	if (ret)
		goto out;

	if (old_len >= new_len) {
		ret = do_munmap(mm, addr+new_len, old_len - new_len);
		if (ret && old_len != new_len)
			goto out;
		old_len = new_len;
	}

	vma = vma_to_resize(addr, old_len, new_len, &charged);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	map_flags = MAP_FIXED;
	if (vma->vm_flags & VM_MAYSHARE)
		map_flags |= MAP_SHARED;

	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
				((addr - vma->vm_start) >> PAGE_SHIFT),
				map_flags);
	if (ret & ~PAGE_MASK)
		goto out1;

	ret = move_vma(vma, addr, old_len, new_len, new_addr);
	if (!(ret & ~PAGE_MASK))
		goto out;
out1:
	vm_unacct_memory(charged);

out:
	return ret;
}
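/*
 * vma_expandable - check whether vma can grow in place by delta bytes: the
 * new end must not overflow, must not run into the next vma, and the
 * enlarged range must still be acceptable to get_unmapped_area().
 */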
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
	unsigned long end = vma->vm_end + delta;
	if (end < vma->vm_end) /* overflow */
		return 0;
	if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
		return 0;
	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
			      0, MAP_FIXED) & ~PAGE_MASK)
		return 0;
	return 1;
}

/*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
 *
 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
 * This option implies MREMAP_MAYMOVE.
 */
SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
		unsigned long, new_len, unsigned long, flags,
		unsigned long, new_addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long ret = -EINVAL;
	unsigned long charged = 0;

	down_write(&current->mm->mmap_sem);

	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
		goto out;

	if (addr & ~PAGE_MASK)
		goto out;

	old_len = PAGE_ALIGN(old_len);
	new_len = PAGE_ALIGN(new_len);

	/*
	 * We allow a zero old-len as a special case
	 * for DOS-emu "duplicate shm area" thing. But
	 * a zero new-len is nonsensical.
	 */
	if (!new_len)
		goto out;

	if (flags & MREMAP_FIXED) {
		if (flags & MREMAP_MAYMOVE)
			ret = mremap_to(addr, old_len, new_addr, new_len);
		goto out;
	}

	/*
	 * Always allow a shrinking remap: that just unmaps
	 * the unnecessary pages..
	 * do_munmap does all the needed commit accounting
	 */
	if (old_len >= new_len) {
		ret = do_munmap(mm, addr+new_len, old_len - new_len);
		if (ret && old_len != new_len)
			goto out;
		ret = addr;
		goto out;
	}

	/*
	 * Ok, we need to grow..
	 */
	vma = vma_to_resize(addr, old_len, new_len, &charged);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	/* old_len exactly to the end of the area..
	 */
	if (old_len == vma->vm_end - addr) {
		/* can we just expand the current mapping? */
		if (vma_expandable(vma, new_len - old_len)) {
			int pages = (new_len - old_len) >> PAGE_SHIFT;

			if (vma_adjust(vma, vma->vm_start, addr + new_len,
				       vma->vm_pgoff, NULL)) {
				ret = -ENOMEM;
				goto out;
			}

			vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
			if (vma->vm_flags & VM_LOCKED) {
				mm->locked_vm += pages;
				mlock_vma_pages_range(vma, addr + old_len,
						      addr + new_len);
			}
			ret = addr;
			goto out;
		}
	}

	/*
	 * We weren't able to just expand or shrink the area,
	 * we need to create a new one and move it..
	 */
	ret = -ENOMEM;
	if (flags & MREMAP_MAYMOVE) {
		unsigned long map_flags = 0;
		if (vma->vm_flags & VM_MAYSHARE)
			map_flags |= MAP_SHARED;

		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
					vma->vm_pgoff +
					((addr - vma->vm_start) >> PAGE_SHIFT),
					map_flags);
		if (new_addr & ~PAGE_MASK) {
			ret = new_addr;
			goto out;
		}

		ret = move_vma(vma, addr, old_len, new_len, new_addr);
	}
out:
	if (ret & ~PAGE_MASK)
		vm_unacct_memory(charged);
	up_write(&current->mm->mmap_sem);
	return ret;
}
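/*
 * Illustrative userspace sketch (not part of this file; kept under #if 0 so
 * it does not affect the kernel build): it exercises the syscall implemented
 * above through the glibc mremap() wrapper, first growing an anonymous
 * mapping with MREMAP_MAYMOVE, then moving it to a caller-chosen address
 * with MREMAP_FIXED.  The sizes and the PROT_NONE placeholder used to pick a
 * target address are arbitrary choices for the example.
 */
#if 0
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t old_len = 4096 * 4, new_len = 4096 * 16;

	/* Anonymous private mapping to be resized. */
	void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	/* Grow the mapping; the kernel may move it if it cannot expand. */
	void *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
	if (q == MAP_FAILED) {
		perror("mremap(MREMAP_MAYMOVE)");
		return EXIT_FAILURE;
	}
	printf("grown:  %p -> %p (%zu bytes)\n", p, q, new_len);

	/* Reserve a free address range, then move the mapping exactly there;
	 * mremap_to() unmaps whatever occupies the target range first. */
	void *target = mmap(NULL, new_len, PROT_NONE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (target == MAP_FAILED) {
		perror("mmap(target)");
		return EXIT_FAILURE;
	}
	void *r = mremap(q, new_len, new_len,
			 MREMAP_MAYMOVE | MREMAP_FIXED, target);
	if (r == MAP_FAILED) {
		perror("mremap(MREMAP_FIXED)");
		return EXIT_FAILURE;
	}
	printf("moved:  %p -> %p\n", q, r);

	munmap(r, new_len);
	return EXIT_SUCCESS;
}
#endif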