// SPDX-License-Identifier: GPL-2.0
/*
 *  mm/mprotect.c
 *
 *  (C) Copyright 1994 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 *
 *  Address space accounting code	<alan@lxorguk.ukuu.org.uk>
 *  (C) Copyright 2002 Red Hat Inc, All Rights Reserved
 */

#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/security.h>
#include <linux/mempolicy.h>
#include <linux/personality.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
#include <linux/pgtable.h>
#include <linux/sched/sysctl.h>
#include <linux/userfaultfd_k.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>

#include "internal.h"

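/*
 * Update the protection bits of every pte in [addr, end) under one pmd,
 * returning the number of ptes that were changed.  The pte page table lock
 * is taken here; for NUMA hinting updates (MM_CP_PROT_NUMA) only ptes that
 * are worth migrating are touched.
 */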
static unsigned long change_pte_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	pte_t *pte, oldpte;
	spinlock_t *ptl;
	unsigned long pages = 0;
	int target_node = NUMA_NO_NODE;
	bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;

	tlb_change_page_size(tlb, PAGE_SIZE);

	/*
	 * Can be called with only the mmap_lock held for reading by
	 * prot_numa, so we must check that the pmd isn't constantly
	 * changing from under us from pmd_none to pmd_trans_huge
	 * and/or the other way around.
	 */
	if (pmd_trans_unstable(pmd))
		return 0;

	/*
	 * The pmd points to a regular pte, so the pmd can't change
	 * from under us even if the mmap_lock is only held for
	 * reading.
	 */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);

	/* Get the target node for single-threaded private VMAs */
	if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
	    atomic_read(&vma->vm_mm->mm_users) == 1)
		target_node = numa_node_id();

	flush_tlb_batched_pending(vma->vm_mm);
	arch_enter_lazy_mmu_mode();
	do {
		oldpte = *pte;
		if (pte_present(oldpte)) {
			pte_t ptent;
			bool preserve_write = prot_numa && pte_write(oldpte);

			/*
			 * Avoid trapping faults against the zero or KSM
			 * pages. See similar comment in change_huge_pmd.
			 */
			if (prot_numa) {
				struct page *page;
				int nid;

				/* Avoid TLB flush if possible */
				if (pte_protnone(oldpte))
					continue;

				page = vm_normal_page(vma, addr, oldpte);
				if (!page || PageKsm(page))
					continue;

				/* Also skip shared copy-on-write pages */
				if (is_cow_mapping(vma->vm_flags) &&
				    page_count(page) != 1)
					continue;

				/*
				 * While migration can move some dirty pages,
				 * it cannot move them all from MIGRATE_ASYNC
				 * context.
				 */
				if (page_is_file_lru(page) && PageDirty(page))
					continue;

				/*
				 * Don't mess with PTEs if the page is already
				 * on the node a single-threaded process is
				 * running on.
				 */
				nid = page_to_nid(page);
				if (target_node == nid)
					continue;

				/*
				 * Skip scanning the top-tier node if normal
				 * NUMA balancing is disabled.
				 */
				if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
				    node_is_toptier(nid))
					continue;
			}

			oldpte = ptep_modify_prot_start(vma, addr, pte);
			ptent = pte_modify(oldpte, newprot);
			if (preserve_write)
				ptent = pte_mk_savedwrite(ptent);

			if (uffd_wp) {
				ptent = pte_wrprotect(ptent);
				ptent = pte_mkuffd_wp(ptent);
			} else if (uffd_wp_resolve) {
				/*
				 * Leave the write bit to be handled by the
				 * page fault handler, so that things like
				 * COW can be handled properly.
				 */
				ptent = pte_clear_uffd_wp(ptent);
			}

			/* Avoid taking write faults for known dirty pages */
			if (dirty_accountable && pte_dirty(ptent) &&
					(pte_soft_dirty(ptent) ||
					 !(vma->vm_flags & VM_SOFTDIRTY))) {
				ptent = pte_mkwrite(ptent);
			}
			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
			if (pte_needs_flush(oldpte, ptent))
				tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
			pages++;
		} else if (is_swap_pte(oldpte)) {
			swp_entry_t entry = pte_to_swp_entry(oldpte);
			struct page *page = pfn_swap_entry_to_page(entry);
			pte_t newpte;

			if (is_writable_migration_entry(entry)) {
				/*
				 * A protection check is difficult, so just
				 * be safe and disable write access.
				 */
				if (PageAnon(page))
					entry = make_readable_exclusive_migration_entry(
							     swp_offset(entry));
				else
					entry = make_readable_migration_entry(swp_offset(entry));
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				if (pte_swp_uffd_wp(oldpte))
					newpte = pte_swp_mkuffd_wp(newpte);
			} else if (is_writable_device_private_entry(entry)) {
				/*
				 * We do not preserve soft-dirtiness. See
				 * copy_one_pte() for an explanation.
				 */
				entry = make_readable_device_private_entry(
							swp_offset(entry));
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_uffd_wp(oldpte))
					newpte = pte_swp_mkuffd_wp(newpte);
			} else if (is_writable_device_exclusive_entry(entry)) {
				entry = make_readable_device_exclusive_entry(
							swp_offset(entry));
				newpte = swp_entry_to_pte(entry);
				if (pte_swp_soft_dirty(oldpte))
					newpte = pte_swp_mksoft_dirty(newpte);
				if (pte_swp_uffd_wp(oldpte))
					newpte = pte_swp_mkuffd_wp(newpte);
			} else if (pte_marker_entry_uffd_wp(entry)) {
				/*
				 * If this is a uffd-wp pte marker and we'd
				 * like to unprotect it, drop it; the next
				 * page fault will trigger without uffd
				 * trapping.
				 */
				if (uffd_wp_resolve) {
					pte_clear(vma->vm_mm, addr, pte);
					pages++;
				}
				continue;
			} else {
				newpte = oldpte;
			}

			if (uffd_wp)
				newpte = pte_swp_mkuffd_wp(newpte);
			else if (uffd_wp_resolve)
				newpte = pte_swp_clear_uffd_wp(newpte);

			if (!pte_same(oldpte, newpte)) {
				set_pte_at(vma->vm_mm, addr, pte, newpte);
				pages++;
			}
		} else {
			/* It must be a none pte; what else could it be? */
			WARN_ON_ONCE(!pte_none(oldpte));
			if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
				/*
				 * For file-backed memory we need to be able
				 * to wr-protect a none pte, because even if
				 * the pte is none, the page/swap cache could
				 * exist.  Do that by installing a marker.
				 */
				set_pte_at(vma->vm_mm, addr, pte,
					   make_pte_marker(PTE_MARKER_UFFD_WP));
				pages++;
			}
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(pte - 1, ptl);

	return pages;
}

/*
 * Used when setting automatic NUMA hinting protection where it is
 * critical that a numa hinting PMD is not confused with a bad PMD.
 */
static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
{
	pmd_t pmdval = pmd_read_atomic(pmd);

	/* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	barrier();
#endif

	if (pmd_none(pmdval))
		return 1;
	if (pmd_trans_huge(pmdval))
		return 0;
	if (unlikely(pmd_bad(pmdval))) {
		pmd_clear_bad(pmd);
		return 1;
	}

	return 0;
}

/* Return true if we're uffd wr-protecting file-backed memory, or false */
static inline bool
uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
{
	return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
}

/*
 * If wr-protecting the range for a file-backed mapping, populate the page
 * table for the case where it is empty but the page cache exists.  If
 * {pte|pmd|...}_alloc() fails it means we are out of memory, and there is
 * no better option than to stop.
 */
#define change_pmd_prepare(vma, pmd, cp_flags)				\
	do {								\
		if (unlikely(uffd_wp_protect_file(vma, cp_flags))) {	\
			if (WARN_ON_ONCE(pte_alloc(vma->vm_mm, pmd)))	\
				break;					\
		}							\
	} while (0)
/*
 * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need
 * a separate change_pmd_prepare() because pte_alloc() returns 0 on success,
 * while {pmd|pud|p4d}_alloc() returns a valid pointer on success.
 */
#define change_prepare(vma, high, low, addr, cp_flags)			\
	do {								\
		if (unlikely(uffd_wp_protect_file(vma, cp_flags))) {	\
			low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
			if (WARN_ON_ONCE(p == NULL))			\
				break;					\
		}							\
	} while (0)

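/*
 * Walk the pmd entries covering [addr, end) under one pud.  Huge pmds are
 * either changed in place via change_huge_pmd() or split and then handled
 * at the pte level by change_pte_range().  Returns the number of pages
 * whose protection was updated.
 */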
static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long pages = 0;
	unsigned long nr_huge_updates = 0;
	struct mmu_notifier_range range;

	range.start = 0;

	pmd = pmd_offset(pud, addr);
	do {
		unsigned long this_pages;

		next = pmd_addr_end(addr, end);

		change_pmd_prepare(vma, pmd, cp_flags);
		/*
		 * Automatic NUMA balancing walks the tables with mmap_lock
		 * held for read.  It's possible for a parallel update to
		 * occur between pmd_trans_huge() and a pmd_none_or_clear_bad()
		 * check, leading to a false positive and an erroneous clear.
		 * Hence, it's necessary to atomically read the PMD value
		 * for all the checks.
		 */
		if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
		     pmd_none_or_clear_bad_unless_trans_huge(pmd))
			goto next;

		/* invoke the mmu notifier if the pmd is populated */
		if (!range.start) {
			mmu_notifier_range_init(&range,
				MMU_NOTIFY_PROTECTION_VMA, 0,
				vma, vma->vm_mm, addr, end);
			mmu_notifier_invalidate_range_start(&range);
		}

		if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
			if ((next - addr != HPAGE_PMD_SIZE) ||
			    uffd_wp_protect_file(vma, cp_flags)) {
				__split_huge_pmd(vma, pmd, addr, false, NULL);
				/*
				 * For file-backed mappings, the pmd could
				 * have been cleared; make sure the pmd is
				 * populated if necessary, then fall through
				 * to the pte level.
				 */
				change_pmd_prepare(vma, pmd, cp_flags);
			} else {
				/*
				 * change_huge_pmd() does not defer TLB flushes,
				 * so no need to propagate the tlb argument.
				 */
				int nr_ptes = change_huge_pmd(tlb, vma, pmd,
						addr, newprot, cp_flags);

				if (nr_ptes) {
					if (nr_ptes == HPAGE_PMD_NR) {
						pages += HPAGE_PMD_NR;
						nr_huge_updates++;
					}

					/* huge pmd was handled */
					goto next;
				}
			}
			/* fall through; the trans huge pmd was just split */
		}
		this_pages = change_pte_range(tlb, vma, pmd, addr, next,
					      newprot, cp_flags);
		pages += this_pages;
next:
		cond_resched();
	} while (pmd++, addr = next, addr != end);

	if (range.start)
		mmu_notifier_invalidate_range_end(&range);

	if (nr_huge_updates)
		count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
	return pages;
}

static inline unsigned long change_pud_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	pud_t *pud;
	unsigned long next;
	unsigned long pages = 0;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		change_prepare(vma, pud, pmd, addr, cp_flags);
		if (pud_none_or_clear_bad(pud))
			continue;
		pages += change_pmd_range(tlb, vma, pud, addr, next, newprot,
					  cp_flags);
	} while (pud++, addr = next, addr != end);

	return pages;
}

static inline unsigned long change_p4d_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	p4d_t *p4d;
	unsigned long next;
	unsigned long pages = 0;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		change_prepare(vma, p4d, pud, addr, cp_flags);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		pages += change_pud_range(tlb, vma, p4d, addr, next, newprot,
					  cp_flags);
	} while (p4d++, addr = next, addr != end);

	return pages;
}

static unsigned long change_protection_range(struct mmu_gather *tlb,
		struct vm_area_struct *vma, unsigned long addr,
		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	unsigned long next;
	unsigned long pages = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset(mm, addr);
	tlb_start_vma(tlb, vma);
	do {
		next = pgd_addr_end(addr, end);
		change_prepare(vma, pgd, p4d, addr, cp_flags);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot,
					  cp_flags);
	} while (pgd++, addr = next, addr != end);

	tlb_end_vma(tlb, vma);

	return pages;
}

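/*
 * Change the protection of every page in [start, end) of @vma to @newprot.
 * Hugetlb VMAs are handled by hugetlb_change_protection(); everything else
 * goes through the generic page-table walk above.  Returns the number of
 * pages whose protection was changed.
 */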
unsigned long change_protection(struct mmu_gather *tlb,
		       struct vm_area_struct *vma, unsigned long start,
		       unsigned long end, pgprot_t newprot,
		       unsigned long cp_flags)
{
	unsigned long pages;

	BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);

	if (is_vm_hugetlb_page(vma))
		pages = hugetlb_change_protection(vma, start, end, newprot,
						  cp_flags);
	else
		pages = change_protection_range(tlb, vma, start, end, newprot,
						cp_flags);

	return pages;
}

static int prot_none_pte_entry(pte_t *pte, unsigned long addr,
			       unsigned long next, struct mm_walk *walk)
{
	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
		0 : -EACCES;
}

static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
				   unsigned long addr, unsigned long next,
				   struct mm_walk *walk)
{
	return pfn_modify_allowed(pte_pfn(*pte), *(pgprot_t *)(walk->private)) ?
		0 : -EACCES;
}

static int prot_none_test(unsigned long addr, unsigned long next,
			  struct mm_walk *walk)
{
	return 0;
}

static const struct mm_walk_ops prot_none_walk_ops = {
	.pte_entry	= prot_none_pte_entry,
	.hugetlb_entry	= prot_none_hugetlb_entry,
	.test_walk	= prot_none_test,
};

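/*
 * Apply @newflags to the range [start, end) of @vma: merge with neighbouring
 * VMAs or split @vma as needed, update vm_flags and vm_page_prot, then
 * rewrite the page tables via change_protection().  On success *pprev points
 * to the VMA covering the range.  Called with mmap_lock held for writing.
 */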
int
mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
	       struct vm_area_struct **pprev, unsigned long start,
	       unsigned long end, unsigned long newflags)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long oldflags = vma->vm_flags;
	long nrpages = (end - start) >> PAGE_SHIFT;
	unsigned long charged = 0;
	pgoff_t pgoff;
	int error;
	int dirty_accountable = 0;

	if (newflags == oldflags) {
		*pprev = vma;
		return 0;
	}

	/*
	 * Do PROT_NONE PFN permission checks here when we can still
	 * bail out without undoing a lot of state. This is a rather
	 * uncommon case, so it doesn't need to be heavily optimized.
	 */
	if (arch_has_pfn_modify_check() &&
	    (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
	    (newflags & VM_ACCESS_FLAGS) == 0) {
		pgprot_t new_pgprot = vm_get_page_prot(newflags);

		error = walk_page_range(current->mm, start, end,
				&prot_none_walk_ops, &new_pgprot);
		if (error)
			return error;
	}

	/*
	 * If we make a private mapping writable we increase our commit;
	 * but (without finer accounting) cannot reduce our commit if we
	 * make it unwritable again.  hugetlb mappings were accounted for
	 * even if read-only, so there is no need to account for them here.
	 */
	if (newflags & VM_WRITE) {
		/* Check space limits when the area turns into data. */
		if (!may_expand_vm(mm, newflags, nrpages) &&
				may_expand_vm(mm, oldflags, nrpages))
			return -ENOMEM;
		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
						VM_SHARED|VM_NORESERVE))) {
			charged = nrpages;
			if (security_vm_enough_memory_mm(mm, charged))
				return -ENOMEM;
			newflags |= VM_ACCOUNT;
		}
	}

	/*
	 * First try to merge with previous and/or next vma.
	 */
	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*pprev = vma_merge(mm, *pprev, start, end, newflags,
			   vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
			   vma->vm_userfaultfd_ctx, anon_vma_name(vma));
	if (*pprev) {
		vma = *pprev;
		VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
		goto success;
	}

	*pprev = vma;

	if (start != vma->vm_start) {
		error = split_vma(mm, vma, start, 1);
		if (error)
			goto fail;
	}

	if (end != vma->vm_end) {
		error = split_vma(mm, vma, end, 0);
		if (error)
			goto fail;
	}

success:
	/*
	 * vm_flags and vm_page_prot are protected by the mmap_lock
	 * held in write mode.
	 */
	vma->vm_flags = newflags;
	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
	vma_set_page_prot(vma);

	change_protection(tlb, vma, start, end, vma->vm_page_prot,
			  dirty_accountable ? MM_CP_DIRTY_ACCT : 0);

	/*
	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid a
	 * major fault on access.
	 */
	if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
			(newflags & VM_WRITE)) {
		populate_vma_page_range(vma, start, end, NULL);
	}

	vm_stat_account(mm, oldflags, -nrpages);
	vm_stat_account(mm, newflags, nrpages);
	perf_event_mmap(vma);
	return 0;

fail:
	vm_unacct_memory(charged);
	return error;
}

/*
 * pkey == -1 when doing a legacy mprotect()
 */
static int do_mprotect_pkey(unsigned long start, size_t len,
		unsigned long prot, int pkey)
{
	unsigned long nstart, end, tmp, reqprot;
	struct vm_area_struct *vma, *prev;
	int error = -EINVAL;
	const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
	const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
				(prot & PROT_READ);
	struct mmu_gather tlb;

	start = untagged_addr(start);

	prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
	if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
		return -EINVAL;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return 0;
	len = PAGE_ALIGN(len);
	end = start + len;
	if (end <= start)
		return -ENOMEM;
	if (!arch_validate_prot(prot, start))
		return -EINVAL;

	reqprot = prot;

	if (mmap_write_lock_killable(current->mm))
		return -EINTR;

	/*
	 * If userspace did not allocate the pkey, do not let
	 * them use it here.
	 */
	error = -EINVAL;
	if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
		goto out;

	vma = find_vma(current->mm, start);
	error = -ENOMEM;
	if (!vma)
		goto out;

	if (unlikely(grows & PROT_GROWSDOWN)) {
		if (vma->vm_start >= end)
			goto out;
		start = vma->vm_start;
		error = -EINVAL;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out;
	} else {
		if (vma->vm_start > start)
			goto out;
		if (unlikely(grows & PROT_GROWSUP)) {
			end = vma->vm_end;
			error = -EINVAL;
			if (!(vma->vm_flags & VM_GROWSUP))
				goto out;
		}
	}

	if (start > vma->vm_start)
		prev = vma;
	else
		prev = vma->vm_prev;

	tlb_gather_mmu(&tlb, current->mm);
	for (nstart = start ; ; ) {
		unsigned long mask_off_old_flags;
		unsigned long newflags;
		int new_vma_pkey;

		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */

		/* Does the application expect PROT_READ to imply PROT_EXEC? */
		if (rier && (vma->vm_flags & VM_MAYEXEC))
			prot |= PROT_EXEC;

		/*
		 * Each mprotect() call explicitly passes r/w/x permissions.
		 * If a permission is not passed to mprotect(), it must be
		 * cleared from the VMA.
		 */
		mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
					VM_FLAGS_CLEAR;

		new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
		newflags = calc_vm_prot_bits(prot, new_vma_pkey);
		newflags |= (vma->vm_flags & ~mask_off_old_flags);

		/* newflags >> 4 shifts VM_MAY% in place of VM_% */
		if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
			error = -EACCES;
			break;
		}

		/* Allow architectures to sanity-check the new flags */
		if (!arch_validate_flags(newflags)) {
			error = -EINVAL;
			break;
		}

		error = security_file_mprotect(vma, reqprot, prot);
		if (error)
			break;

		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;

		if (vma->vm_ops && vma->vm_ops->mprotect) {
			error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
			if (error)
				break;
		}

		error = mprotect_fixup(&tlb, vma, &prev, nstart, tmp, newflags);
		if (error)
			break;

		nstart = tmp;

		if (nstart < prev->vm_end)
			nstart = prev->vm_end;
		if (nstart >= end)
			break;

		vma = prev->vm_next;
		if (!vma || vma->vm_start != nstart) {
			error = -ENOMEM;
			break;
		}
		prot = reqprot;
	}
	tlb_finish_mmu(&tlb);
out:
	mmap_write_unlock(current->mm);
	return error;
}

SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
		unsigned long, prot)
{
	return do_mprotect_pkey(start, len, prot, -1);
}

#ifdef CONFIG_ARCH_HAS_PKEYS

SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
		unsigned long, prot, int, pkey)
{
	return do_mprotect_pkey(start, len, prot, pkey);
}

SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
{
	int pkey;
	int ret;

	/* No flags supported yet. */
	if (flags)
		return -EINVAL;
	/* check for unsupported init values */
	if (init_val & ~PKEY_ACCESS_MASK)
		return -EINVAL;

	mmap_write_lock(current->mm);
	pkey = mm_pkey_alloc(current->mm);

	ret = -ENOSPC;
	if (pkey == -1)
		goto out;

	ret = arch_set_user_pkey_access(current, pkey, init_val);
	if (ret) {
		mm_pkey_free(current->mm, pkey);
		goto out;
	}
	ret = pkey;
out:
	mmap_write_unlock(current->mm);
	return ret;
}

SYSCALL_DEFINE1(pkey_free, int, pkey)
{
	int ret;

	mmap_write_lock(current->mm);
	ret = mm_pkey_free(current->mm, pkey);
	mmap_write_unlock(current->mm);

	/*
	 * We could provide warnings or errors if any VMA still
	 * has the pkey set here.
	 */
	return ret;
}

#endif /* CONFIG_ARCH_HAS_PKEYS */