// SPDX-License-Identifier: GPL-2.0-only
/*
 *  mm/userfaultfd.c
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include "internal.h"

static int mcopy_atomic_pte(struct mm_struct *dst_mm,
			    pmd_t *dst_pmd,
			    struct vm_area_struct *dst_vma,
			    unsigned long dst_addr,
			    unsigned long src_addr,
			    struct page **pagep)
{
	struct mem_cgroup *memcg;
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	void *page_kaddr;
	int ret;
	struct page *page;
	pgoff_t offset, max_off;
	struct inode *inode;

	if (!*pagep) {
		ret = -ENOMEM;
		page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
		if (!page)
			goto out;

		page_kaddr = kmap_atomic(page);
		ret = copy_from_user(page_kaddr,
				     (const void __user *) src_addr,
				     PAGE_SIZE);
		kunmap_atomic(page_kaddr);

		/* fallback to copy_from_user outside mmap_sem */
		if (unlikely(ret)) {
			ret = -ENOENT;
			*pagep = page;
			/* don't free the page */
			goto out;
		}
	} else {
		page = *pagep;
		*pagep = NULL;
	}

	/*
	 * The memory barrier inside __SetPageUptodate makes sure that
	 * preceding stores to the page contents become visible before
	 * the set_pte_at() write.
	 */
	__SetPageUptodate(page);

	ret = -ENOMEM;
	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
		goto out_release;

	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
	if (dst_vma->vm_flags & VM_WRITE)
		_dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));

	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (dst_vma->vm_file) {
		/* the shmem MAP_PRIVATE case requires checking the i_size */
		inode = dst_vma->vm_file->f_inode;
		offset = linear_page_index(dst_vma, dst_addr);
		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
		ret = -EFAULT;
		if (unlikely(offset >= max_off))
			goto out_release_uncharge_unlock;
	}
	ret = -EEXIST;
	if (!pte_none(*dst_pte))
		goto out_release_uncharge_unlock;

	inc_mm_counter(dst_mm, MM_ANONPAGES);
	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, dst_vma);

	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);

	pte_unmap_unlock(dst_pte, ptl);
	ret = 0;
out:
	return ret;
out_release_uncharge_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	mem_cgroup_cancel_charge(page, memcg, false);
out_release:
	put_page(page);
	goto out;
}

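/*
 * Zeropage fill for a private (non-VM_SHARED) destination mapping: map
 * the architecture zero page read-only at dst_addr, so no page
 * allocation or data copy is needed.  The only failure modes are a PTE
 * that got populated under us (-EEXIST) and, for the MAP_PRIVATE shmem
 * case, an offset beyond i_size (-EFAULT).
 */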
static int mfill_zeropage_pte(struct mm_struct *dst_mm,
			      pmd_t *dst_pmd,
			      struct vm_area_struct *dst_vma,
			      unsigned long dst_addr)
{
	pte_t _dst_pte, *dst_pte;
	spinlock_t *ptl;
	int ret;
	pgoff_t offset, max_off;
	struct inode *inode;

	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
					 dst_vma->vm_page_prot));
	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
	if (dst_vma->vm_file) {
		/* the shmem MAP_PRIVATE case requires checking the i_size */
		inode = dst_vma->vm_file->f_inode;
		offset = linear_page_index(dst_vma, dst_addr);
		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
		ret = -EFAULT;
		if (unlikely(offset >= max_off))
			goto out_unlock;
	}
	ret = -EEXIST;
	if (!pte_none(*dst_pte))
		goto out_unlock;
	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(dst_vma, dst_addr, dst_pte);
	ret = 0;
out_unlock:
	pte_unmap_unlock(dst_pte, ptl);
	return ret;
}

static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, address);
	if (!pud)
		return NULL;
	/*
	 * Note that this is not necessarily run because the pmd was
	 * missing: *pmd may already be established, and it may even be
	 * a trans_huge_pmd.
	 */
	return pmd_alloc(mm, pud, address);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with mmap_sem held; it will release mmap_sem before returning.
 */
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
					      struct vm_area_struct *dst_vma,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage)
{
	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
	int vm_shared = dst_vma->vm_flags & VM_SHARED;
	ssize_t err;
	pte_t *dst_pte;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;
	struct hstate *h;
	unsigned long vma_hpagesize;
	pgoff_t idx;
	u32 hash;
	struct address_space *mapping;

	/*
	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge page may exist as used
	 * by THP.  Since we can not reliably insert a zero page, this
	 * feature is not supported.
	 */
	if (zeropage) {
		up_read(&dst_mm->mmap_sem);
		return -EINVAL;
	}

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
	vma_hpagesize = vma_kernel_pagesize(dst_vma);

	/*
	 * Validate alignment based on huge page size
	 */
	err = -EINVAL;
	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
		goto out_unlock;

retry:
	/*
	 * On routine entry dst_vma is set.  If we had to drop mmap_sem and
	 * retry, dst_vma will be set to NULL and we must look it up again.
	 */
	if (!dst_vma) {
		err = -ENOENT;
		dst_vma = find_vma(dst_mm, dst_start);
		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
			goto out_unlock;
		/*
		 * Check that the vma is registered in uffd; this is
		 * required to enforce the VM_MAYWRITE check done at
		 * uffd registration time.
		 */
		if (!dst_vma->vm_userfaultfd_ctx.ctx)
			goto out_unlock;

		if (dst_start < dst_vma->vm_start ||
		    dst_start + len > dst_vma->vm_end)
			goto out_unlock;

		err = -EINVAL;
		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
			goto out_unlock;

		vm_shared = dst_vma->vm_flags & VM_SHARED;
	}

	if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
		    (len - copied) & (vma_hpagesize - 1)))
		goto out_unlock;

	/*
	 * If not shared, ensure the dst_vma has an anon_vma.
	 */
	err = -ENOMEM;
	if (!vm_shared) {
		if (unlikely(anon_vma_prepare(dst_vma)))
			goto out_unlock;
	}

	h = hstate_vma(dst_vma);

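	/*
	 * Copy one huge page per iteration.  Each step takes the hugetlb
	 * fault mutex to serialize against concurrent hugetlb faults on
	 * the same index, allocates the huge PTE, and bails out with
	 * -EEXIST if something is already mapped there.  A -ENOENT from
	 * hugetlb_mcopy_atomic_pte means the source page must be copied
	 * from userspace with mmap_sem dropped, after which dst_vma is
	 * stale and the lookup above is retried.
	 */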
	while (src_addr < src_start + len) {
		pte_t dst_pteval;

		BUG_ON(dst_addr >= dst_start + len);
		VM_BUG_ON(dst_addr & ~huge_page_mask(h));

		/*
		 * Serialize via hugetlb_fault_mutex
		 */
		idx = linear_page_index(dst_vma, dst_addr);
		mapping = dst_vma->vm_file->f_mapping;
		hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		err = -ENOMEM;
		dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
		if (!dst_pte) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = -EEXIST;
		dst_pteval = huge_ptep_get(dst_pte);
		if (!huge_pte_none(dst_pteval)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out_unlock;
		}

		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
						dst_addr, src_addr, &page);

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		vm_alloc_shared = vm_shared;

		cond_resched();

		if (unlikely(err == -ENOENT)) {
			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			err = copy_huge_page_from_user(page,
						(const void __user *)src_addr,
						pages_per_huge_page(h), true);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			down_read(&dst_mm->mmap_sem);

			dst_vma = NULL;
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += vma_hpagesize;
			src_addr += vma_hpagesize;
			copied += vma_hpagesize;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page) {
		/*
		 * We encountered an error and are about to free a newly
		 * allocated huge page.
		 *
		 * Reservation handling is very subtle, and is different for
		 * private and shared mappings.  See the routine
		 * restore_reserve_on_error for details.  Unfortunately, we
		 * can not call restore_reserve_on_error now as it would
		 * require holding mmap_sem.
		 *
		 * If a reservation for the page existed in the reservation
		 * map of a private mapping, the map was modified to indicate
		 * the reservation was consumed when the page was allocated.
		 * We clear the PagePrivate flag now so that the global
		 * reserve count will not be incremented in free_huge_page.
		 * The reservation map will still indicate the reservation
		 * was consumed and possibly prevent later page allocation.
		 * This is better than leaking a global reservation.  If no
		 * reservation existed, it is still safe to clear PagePrivate
		 * as no adjustments to reservation counts were made during
		 * allocation.
		 *
		 * The reservation map for shared mappings indicates which
		 * pages have reservations.  When a huge page is allocated
		 * for an address with a reservation, no change is made to
		 * the reserve map.  In this case PagePrivate will be set
		 * to indicate that the global reservation count should be
		 * incremented when the page is freed.  This is the desired
		 * behavior.  However, when a huge page is allocated for an
		 * address without a reservation, a reservation entry is added
		 * to the reservation map, and PagePrivate will not be set.
		 * When the page is freed, the global reserve count will NOT
		 * be incremented and it will appear as though we have leaked
		 * a reserved page.  In this case, set PagePrivate so that the
		 * global reserve count will be incremented to match the
		 * reservation map entry which was created.
		 *
		 * Note that vm_alloc_shared is based on the flags of the vma
		 * for which the page was originally allocated.  dst_vma could
		 * be different or NULL on error.
		 */
		if (vm_alloc_shared)
			SetPagePrivate(page);
		else
			ClearPagePrivate(page);
		put_page(page);
	}
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
				      struct vm_area_struct *dst_vma,
				      unsigned long dst_start,
				      unsigned long src_start,
				      unsigned long len,
				      bool zeropage);
#endif /* CONFIG_HUGETLB_PAGE */

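/*
 * Fill a single PTE-mapped page: dispatch to the anonymous helpers above
 * for private mappings and to the shmem helpers for VM_SHARED shmem.
 * Returns 0 on success, -ENOENT if the caller must finish the copy
 * outside mmap_sem and retry, or another negative errno.
 */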
static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
						pmd_t *dst_pmd,
						struct vm_area_struct *dst_vma,
						unsigned long dst_addr,
						unsigned long src_addr,
						struct page **page,
						bool zeropage)
{
	ssize_t err;

	/*
	 * The normal page fault path for a shmem will invoke the
	 * fault, fill the hole in the file and COW it right away.  The
	 * result generates plain anonymous memory.  So when we are
	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
	 * generate anonymous memory directly without actually filling
	 * the hole.  For the MAP_PRIVATE case the robustness check
	 * only happens in the pagetable (to verify it's still none)
	 * and not in the radix tree.
	 */
	if (!(dst_vma->vm_flags & VM_SHARED)) {
		if (!zeropage)
			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
					       dst_addr, src_addr, page);
		else
			err = mfill_zeropage_pte(dst_mm, dst_pmd,
						 dst_vma, dst_addr);
	} else {
		if (!zeropage)
			err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
						     dst_vma, dst_addr,
						     src_addr, page);
		else
			err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
						       dst_vma, dst_addr);
	}

	return err;
}

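/*
 * Common implementation behind mcopy_atomic() and mfill_zeropage() below.
 * Takes mmap_sem for reading, validates that the destination range lies
 * within a single userfaultfd-registered vma, and fills one page per
 * iteration with mfill_atomic_pte().  On -ENOENT the missing source page
 * is copied with mmap_sem dropped and the vma lookup is retried.  Returns
 * the number of bytes filled, or a negative errno if nothing was filled.
 */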
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
					      unsigned long dst_start,
					      unsigned long src_start,
					      unsigned long len,
					      bool zeropage,
					      bool *mmap_changing)
{
	struct vm_area_struct *dst_vma;
	ssize_t err;
	pmd_t *dst_pmd;
	unsigned long src_addr, dst_addr;
	long copied;
	struct page *page;

	/*
	 * Sanitize the command parameters:
	 */
	BUG_ON(dst_start & ~PAGE_MASK);
	BUG_ON(len & ~PAGE_MASK);

	/* Does the address range wrap, or is the span zero-sized? */
	BUG_ON(src_start + len <= src_start);
	BUG_ON(dst_start + len <= dst_start);

	src_addr = src_start;
	dst_addr = dst_start;
	copied = 0;
	page = NULL;
retry:
	down_read(&dst_mm->mmap_sem);

	/*
	 * If memory mappings are changing because of non-cooperative
	 * operation (e.g. mremap) running in parallel, bail out and
	 * request the user to retry later
	 */
	err = -EAGAIN;
	if (mmap_changing && READ_ONCE(*mmap_changing))
		goto out_unlock;

	/*
	 * Make sure the vma is not shared, and that the dst range is
	 * both valid and fully within a single existing vma.
	 */
	err = -ENOENT;
	dst_vma = find_vma(dst_mm, dst_start);
	if (!dst_vma)
		goto out_unlock;
	/*
	 * Check that the vma is registered in uffd; this is required to
	 * enforce the VM_MAYWRITE check done at uffd registration
	 * time.
	 */
	if (!dst_vma->vm_userfaultfd_ctx.ctx)
		goto out_unlock;

	if (dst_start < dst_vma->vm_start ||
	    dst_start + len > dst_vma->vm_end)
		goto out_unlock;

	err = -EINVAL;
	/*
	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
	 */
	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
	    dst_vma->vm_flags & VM_SHARED))
		goto out_unlock;

	/*
	 * If this is a HUGETLB vma, pass off to appropriate routine
	 */
	if (is_vm_hugetlb_page(dst_vma))
		return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
					      src_start, len, zeropage);

	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
		goto out_unlock;

	/*
	 * Ensure the dst_vma has an anon_vma or this page
	 * would get a NULL anon_vma when moved into the
	 * dst_vma.
	 */
	err = -ENOMEM;
	if (!(dst_vma->vm_flags & VM_SHARED) &&
	    unlikely(anon_vma_prepare(dst_vma)))
		goto out_unlock;

	while (src_addr < src_start + len) {
		pmd_t dst_pmdval;

		BUG_ON(dst_addr >= dst_start + len);

		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
		if (unlikely(!dst_pmd)) {
			err = -ENOMEM;
			break;
		}

		dst_pmdval = pmd_read_atomic(dst_pmd);
		/*
		 * If the dst_pmd is mapped as THP don't
		 * override it and just be strict.
		 */
		if (unlikely(pmd_trans_huge(dst_pmdval))) {
			err = -EEXIST;
			break;
		}
		if (unlikely(pmd_none(dst_pmdval)) &&
		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
			err = -ENOMEM;
			break;
		}
		/* If a huge pmd materialized from under us, fail */
		if (unlikely(pmd_trans_huge(*dst_pmd))) {
			err = -EFAULT;
			break;
		}

		BUG_ON(pmd_none(*dst_pmd));
		BUG_ON(pmd_trans_huge(*dst_pmd));

		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
				       src_addr, &page, zeropage);
		cond_resched();

		if (unlikely(err == -ENOENT)) {
			void *page_kaddr;

			up_read(&dst_mm->mmap_sem);
			BUG_ON(!page);

			page_kaddr = kmap(page);
			err = copy_from_user(page_kaddr,
					     (const void __user *) src_addr,
					     PAGE_SIZE);
			kunmap(page);
			if (unlikely(err)) {
				err = -EFAULT;
				goto out;
			}
			goto retry;
		} else
			BUG_ON(page);

		if (!err) {
			dst_addr += PAGE_SIZE;
			src_addr += PAGE_SIZE;
			copied += PAGE_SIZE;

			if (fatal_signal_pending(current))
				err = -EINTR;
		}
		if (err)
			break;
	}

out_unlock:
	up_read(&dst_mm->mmap_sem);
out:
	if (page)
		put_page(page);
	BUG_ON(copied < 0);
	BUG_ON(err > 0);
	BUG_ON(!copied && !err);
	return copied ? copied : err;
}

ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
		     unsigned long src_start, unsigned long len,
		     bool *mmap_changing)
{
	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
			      mmap_changing);
}

ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
		       unsigned long len, bool *mmap_changing)
{
	return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
}
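
/*
 * Usage sketch (informational only): mcopy_atomic() and mfill_zeropage()
 * back the UFFDIO_COPY and UFFDIO_ZEROPAGE ioctls handled in
 * fs/userfaultfd.c.  From userspace the copy path is driven roughly as
 * follows:
 *
 *	struct uffdio_copy copy = {
 *		.dst = (unsigned long) dst_page,
 *		.src = (unsigned long) src_buf,
 *		.len = page_size,
 *		.mode = 0,
 *	};
 *	if (ioctl(uffd, UFFDIO_COPY, &copy))
 *		handle the error (e.g. EAGAIN while the address space is
 *		changing); copy.copy reports the bytes copied, or a
 *		negative error code.
 */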