/*
 * mm/mprotect.c
 *
 * (C) Copyright 1994 Linus Torvalds
 * (C) Copyright 2002 Christoph Hellwig
 *
 * Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte, oldpte;
	spinlock_t *ptl;
	unsigned long pages = 0;
	int target_node = NUMA_NO_NODE;

	/*
	 * Can be called with only the mmap_sem for reading by
	 * prot_numa so we must check the pmd isn't constantly
	 * changing from under us from pmd_none to pmd_trans_huge
	 * and/or the other way around.
	 */
	if (pmd_trans_unstable(pmd))
		return 0;

	/*
	 * The pmd points to a regular pte so the pmd can't change
	 * from under us even if the mmap_sem is only held for
	 * reading.
	 */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);

	/* Get target node for single threaded private VMAs */
	if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
	    atomic_read(&vma->vm_mm->mm_users) == 1)
		target_node = numa_node_id();

	arch_enter_lazy_mmu_mode();
	do {
		oldpte = *pte;
		if (pte_present(oldpte)) {
			pte_t ptent;
			bool preserve_write = prot_numa && pte_write(oldpte);

			/*
			 * Avoid trapping faults against the zero or KSM
			 * pages. See similar comment in change_huge_pmd.
			 */
			if (prot_numa) {
				struct page *page;

				page = vm_normal_page(vma, addr, oldpte);
				if (!page || PageKsm(page))
					continue;

				/* Avoid TLB flush if possible */
				if (pte_protnone(oldpte))
					continue;

				/*
				 * Don't mess with PTEs if page is already on the node
				 * a single-threaded process is running on.
				 */
				if (target_node == page_to_nid(page))
					continue;
			}

			ptent = ptep_modify_prot_start(mm, addr, pte);
			ptent = pte_modify(ptent, newprot);
			if (preserve_write)
				ptent = pte_mk_savedwrite(ptent);

			/* Avoid taking write faults for known dirty pages */
			if (dirty_accountable && pte_dirty(ptent) &&
					(pte_soft_dirty(ptent) ||
					 !(vma->vm_flags & VM_SOFTDIRTY))) {
				ptent = pte_mkwrite(ptent);
			}
			ptep_modify_prot_commit(mm, addr, pte, ptent);
			pages++;
		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
			swp_entry_t entry = pte_to_swp_entry(oldpte);

			if (is_write_migration_entry(entry)) {
				pte_t newpte;
				/*
				 * A protection check is difficult so
				 * just be safe and disable write
				 */
				make_migration_entry_read(&entry);
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				set_pte_at(mm, addr, pte, newpte);

				pages++;
			}
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return pages;
}

static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
		pud_t *pud, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pmd_t *pmd;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long next;
	unsigned long pages = 0;
	unsigned long nr_huge_updates = 0;
	unsigned long mni_start = 0;

	pmd = pmd_offset(pud, addr);
	do {
		unsigned long this_pages;

		next = pmd_addr_end(addr, end);
		if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
				&& pmd_none_or_clear_bad(pmd))
			continue;

		/* invoke the mmu notifier if the pmd is populated */
		if (!mni_start) {
			mni_start = addr;
			mmu_notifier_invalidate_range_start(mm, mni_start, end);
		}

		if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if (next - addr != HPAGE_PMD_SIZE) {
				__split_huge_pmd(vma, pmd, addr, false, NULL);
			} else {
				int nr_ptes = change_huge_pmd(vma, pmd, addr,
						newprot, prot_numa);

				if (nr_ptes) {
					if (nr_ptes == HPAGE_PMD_NR) {
						pages += HPAGE_PMD_NR;
						nr_huge_updates++;
					}

					/* huge pmd was handled */
					continue;
				}
			}
			/* fall through, the trans huge pmd just split */
		}
		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
				dirty_accountable, prot_numa);
		pages += this_pages;
	} while (pmd++, addr = next, addr != end);

	if (mni_start)
		mmu_notifier_invalidate_range_end(mm, mni_start, end);

	if (nr_huge_updates)
		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
	return pages;
}

static inline unsigned long change_pud_range(struct vm_area_struct *vma,
		p4d_t *p4d, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	pud_t *pud;
	unsigned long next;
	unsigned long pages = 0;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		pages += change_pmd_range(vma, pud, addr, next, newprot,
				dirty_accountable, prot_numa);
	} while (pud++, addr = next, addr != end);

	return pages;
}

static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
		pgd_t *pgd, unsigned long addr, unsigned long end,
		pgprot_t newprot, int dirty_accountable, int prot_numa)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long pages = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		pages += change_pud_range(vma, p4d, addr, next, newprot,
				dirty_accountable, prot_numa);
	} while (p4d++, addr = next, addr != end);

	return pages;
}

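/*
 * Walk the page tables for [addr, end): pgd -> p4d -> pud -> pmd -> pte,
 * applying newprot at the lowest populated level (huge pmds are either
 * changed in place or split first).  Returns the number of pages whose
 * entries were updated; the TLB is flushed only when that count is
 * non-zero.
 */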
static unsigned long change_protection_range(struct vm_area_struct *vma,
		unsigned long addr, unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	unsigned long start = addr;
	unsigned long pages = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	flush_cache_range(vma, addr, end);
	set_tlb_flush_pending(mm);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		pages += change_p4d_range(vma, pgd, addr, next, newprot,
				dirty_accountable, prot_numa);
	} while (pgd++, addr = next, addr != end);

	/* Only flush the TLB if we actually modified any entries: */
	if (pages)
		flush_tlb_range(vma, start, end);
	clear_tlb_flush_pending(mm);

	return pages;
}

unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, pgprot_t newprot,
		int dirty_accountable, int prot_numa)
{
	unsigned long pages;

	if (is_vm_hugetlb_page(vma))
		pages = hugetlb_change_protection(vma, start, end, newprot);
	else
		pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);

	return pages;
}

int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
	unsigned long start, unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned long charged = 0;
	pgoff_t pgoff;
	int error;
	int dirty_accountable = 0;

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
	 * make it unwritable again. hugetlb mappings were accounted for
	 * even if read-only, so there is no need to account for them here.
	 */
	if (newflags & VM_WRITE) {
		/* Check space limits when area turns into data. */
		if (!may_expand_vm(mm, newflags, nrpages) &&
				may_expand_vm(mm, oldflags, nrpages))
			return -ENOMEM;
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
						VM_SHARED|VM_NORESERVE))) {
			charged = nrpages;
			if (security_vm_enough_memory_mm(mm, charged))
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	}

	/*
	 * First try to merge with previous and/or next vma.
	 */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(mm, *pprev, start, end, newflags,
			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			   vma->vm_userfaultfd_ctx);
	if (*pprev) {
		vma = *pprev;
		VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
		goto success;
	}

	*pprev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto fail;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto fail;
	}

success:
	/*
	 * vm_flags and vm_page_prot are protected by the mmap_sem
	 * held in write mode.
	 */
	vma->vm_flags = newflags;
	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
	vma_set_page_prot(vma);

	change_protection(vma, start, end, vma->vm_page_prot,
			dirty_accountable, 0);

	/*
	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
	 * fault on access.
	 */
	if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
			(newflags & VM_WRITE)) {
		populate_vma_page_range(vma, start, end, NULL);
	}

	vm_stat_account(mm, oldflags, -nrpages);
	vm_stat_account(mm, newflags, nrpages);
	perf_event_mmap(vma);
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}

/*
 * pkey==-1 when doing a legacy mprotect()
 */
static int do_mprotect_pkey(unsigned long start, size_t len,
		unsigned long prot, int pkey)
{
	unsigned long nstart, end, tmp, reqprot;
	struct vm_area_struct *vma, *prev;
	int error = -EINVAL;
	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
				(prot & PROT_READ);

	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
		return -EINVAL;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return 0;
	len = PAGE_ALIGN(len);
	end = start + len;
	if (end <= start)
		return -ENOMEM;
	if (!arch_validate_prot(prot))
		return -EINVAL;

	reqprot = prot;

	if (down_write_killable(&current->mm->mmap_sem))
		return -EINTR;

	/*
	 * If userspace did not allocate the pkey, do not let
	 * them use it here.
	 */
	error = -EINVAL;
	if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
		goto out;

	vma = find_vma(current->mm, start);
	error = -ENOMEM;
	if (!vma)
		goto out;
	prev = vma->vm_prev;
	if (unlikely(grows & PROT_GROWSDOWN)) {
		if (vma->vm_start >= end)
			goto out;
		start = vma->vm_start;
		error = -EINVAL;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out;
	} else {
		if (vma->vm_start > start)
			goto out;
		if (unlikely(grows & PROT_GROWSUP)) {
			end = vma->vm_end;
			error = -EINVAL;
			if (!(vma->vm_flags & VM_GROWSUP))
				goto out;
		}
	}
	if (start > vma->vm_start)
		prev = vma;

	for (nstart = start ; ; ) {
		unsigned long mask_off_old_flags;
		unsigned long newflags;
		int new_vma_pkey;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */

		/* Does the application expect PROT_READ to imply PROT_EXEC */
		if (rier && (vma->vm_flags & VM_MAYEXEC))
			prot |= PROT_EXEC;

		/*
		 * Each mprotect() call explicitly passes r/w/x permissions.
		 * If a permission is not passed to mprotect(), it must be
		 * cleared from the VMA.
		 */
		mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
					ARCH_VM_PKEY_FLAGS;

		new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
		newflags = calc_vm_prot_bits(prot, new_vma_pkey);
		newflags |= (vma->vm_flags & ~mask_off_old_flags);

		/* newflags >> 4 shifts VM_MAY% in place of VM_% */
		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
			error = -EACCES;
			goto out;
		}

		error = security_file_mprotect(vma, reqprot, prot);
		if (error)
			goto out;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
		if (error)
			goto out;
		nstart = tmp;

		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			goto out;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			goto out;
		}
		prot = reqprot;
	}
out:
	up_write(&current->mm->mmap_sem);
	return error;
}

SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
		unsigned long, prot)
{
	return do_mprotect_pkey(start, len, prot, -1);
}

#ifdef CONFIG_ARCH_HAS_PKEYS

SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
		unsigned long, prot, int, pkey)
{
	return do_mprotect_pkey(start, len, prot, pkey);
}

SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
	int pkey;
	int ret;

	/* No flags supported yet. */
	if (flags)
		return -EINVAL;
	/* check for unsupported init values */
	if (init_val & ~PKEY_ACCESS_MASK)
		return -EINVAL;

	down_write(&current->mm->mmap_sem);
	pkey = mm_pkey_alloc(current->mm);

	ret = -ENOSPC;
	if (pkey == -1)
		goto out;

	ret = arch_set_user_pkey_access(current, pkey, init_val);
	if (ret) {
		mm_pkey_free(current->mm, pkey);
		goto out;
	}
	ret = pkey;
out:
	up_write(&current->mm->mmap_sem);
	return ret;
}

SYSCALL_DEFINE1(pkey_free, int, pkey)
{
	int ret;

	down_write(&current->mm->mmap_sem);
	ret = mm_pkey_free(current->mm, pkey);
	up_write(&current->mm->mmap_sem);

	/*
	 * We could provide warnings or errors if any VMA still
	 * has the pkey set here.
	 */
	return ret;
}

#endif /* CONFIG_ARCH_HAS_PKEYS */