// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/userfaultfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *  Copyright (C) 2008-2009 Red Hat, Inc.
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  Some part derived from fs/eventfd.c (anon inode setup) and
 *  mm/ksm.c (mm hashing).
 */

#include <linux/list.h>
#include <linux/hashtable.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>

int sysctl_unprivileged_userfaultfd __read_mostly;

static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;

/*
 * Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
 *
 * Locking order:
 *	fd_wqh.lock
 *		fault_pending_wqh.lock
 *			fault_wqh.lock
 *		event_wqh.lock
 *
 * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
 * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
 * also taken in IRQ context.
 */
struct userfaultfd_ctx {
	/* waitqueue head for the pending (i.e. not read) userfaults */
	wait_queue_head_t fault_pending_wqh;
	/* waitqueue head for the userfaults */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd to wakeup poll/read */
	wait_queue_head_t fd_wqh;
	/* waitqueue head for events */
	wait_queue_head_t event_wqh;
	/* a refile sequence protected by fault_pending_wqh lock */
	seqcount_spinlock_t refile_seq;
	/* pseudo fd refcounting */
	refcount_t refcount;
	/* userfaultfd syscall flags */
	unsigned int flags;
	/* features requested from the userspace */
	unsigned int features;
	/* released */
	bool released;
	/* memory mappings are changing because of non-cooperative event */
	atomic_t mmap_changing;
	/* mm with one or more vmas attached to this userfaultfd_ctx */
	struct mm_struct *mm;
};

struct userfaultfd_fork_ctx {
	struct userfaultfd_ctx *orig;
	struct userfaultfd_ctx *new;
	struct list_head list;
};

struct userfaultfd_unmap_ctx {
	struct userfaultfd_ctx *ctx;
	unsigned long start;
	unsigned long end;
	struct list_head list;
};

struct userfaultfd_wait_queue {
	struct uffd_msg msg;
	wait_queue_entry_t wq;
	struct userfaultfd_ctx *ctx;
	bool waken;
};

struct userfaultfd_wake_range {
	unsigned long start;
	unsigned long len;
};

/* internal indication that UFFD_API ioctl was successfully executed */
#define UFFD_FEATURE_INITIALIZED		(1u << 31)

static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
{
	return ctx->features & UFFD_FEATURE_INITIALIZED;
}
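
/*
 * For illustration only: the lock nesting documented above is what the
 * read path later in this file relies on. A minimal sketch of that
 * pattern (assuming a valid ctx) looks like:
 *
 *	spin_lock_irq(&ctx->fd_wqh.lock);
 *	spin_lock(&ctx->fault_pending_wqh.lock);
 *	...				// inspect/refile pending userfaults
 *	spin_unlock(&ctx->fault_pending_wqh.lock);
 *	spin_unlock_irq(&ctx->fd_wqh.lock);
 *
 * IRQs stay disabled across the whole nested section, which is what keeps
 * the aio_poll() interaction described above deadlock free.
 */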

static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				     int wake_flags, void *key)
{
	struct userfaultfd_wake_range *range = key;
	int ret;
	struct userfaultfd_wait_queue *uwq;
	unsigned long start, len;

	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
	ret = 0;
	/* len == 0 means wake all */
	start = range->start;
	len = range->len;
	if (len && (start > uwq->msg.arg.pagefault.address ||
		    start + len <= uwq->msg.arg.pagefault.address))
		goto out;
	WRITE_ONCE(uwq->waken, true);
	/*
	 * The Program-Order guarantees provided by the scheduler
	 * ensure uwq->waken is visible before the task is woken.
	 */
	ret = wake_up_state(wq->private, mode);
	if (ret) {
		/*
		 * Wake only once, autoremove behavior.
		 *
		 * After the effect of list_del_init is visible to the other
		 * CPUs, the waitqueue may disappear from under us, see the
		 * !list_empty_careful() in handle_userfault().
		 *
		 * try_to_wake_up() has an implicit smp_mb(), and the
		 * wq->private is read before calling the extern function
		 * "wake_up_state" (which in turn calls try_to_wake_up).
		 */
		list_del_init(&wq->entry);
	}
out:
	return ret;
}

/**
 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
	refcount_inc(&ctx->refcount);
}

/**
 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The userfaultfd context reference must have been previously acquired either
 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
	if (refcount_dec_and_test(&ctx->refcount)) {
		VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
}

static inline void msg_init(struct uffd_msg *msg)
{
	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
	/*
	 * Must use memset to zero out the paddings or kernel data is
	 * leaked to userland.
	 */
	memset(msg, 0, sizeof(struct uffd_msg));
}

static inline struct uffd_msg userfault_msg(unsigned long address,
					    unsigned int flags,
					    unsigned long reason,
					    unsigned int features)
{
	struct uffd_msg msg;

	msg_init(&msg);
	msg.event = UFFD_EVENT_PAGEFAULT;
	msg.arg.pagefault.address = address;
	/*
	 * These flags indicate why the userfault occurred:
	 * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
	 * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
	 * - Neither of these flags being set indicates a MISSING fault.
	 *
	 * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
	 * fault. Otherwise, it was a read fault.
	 */
	if (flags & FAULT_FLAG_WRITE)
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
	if (reason & VM_UFFD_WP)
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
	if (reason & VM_UFFD_MINOR)
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
	if (features & UFFD_FEATURE_THREAD_ID)
		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
	return msg;
}
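
/*
 * For illustration, a monitor receiving the message built above would
 * typically classify the fault roughly like this (a sketch, assuming a
 * struct uffd_msg "msg" just read from the userfaultfd file descriptor):
 *
 *	if (msg.event == UFFD_EVENT_PAGEFAULT) {
 *		int wp    = msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP;
 *		int minor = msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR;
 *		int write = msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE;
 *
 *		if (!wp && !minor)
 *			;	// MISSING fault: resolve with UFFDIO_COPY/ZEROPAGE
 *	}
 */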

#ifdef CONFIG_HUGETLB_PAGE
/*
 * Same functionality as userfaultfd_must_wait below with modifications for
 * hugepmd ranges.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *vma,
					      unsigned long address,
					      unsigned long flags,
					      unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pte_t *ptep, pte;
	bool ret = true;

	mmap_assert_locked(mm);

	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));

	if (!ptep)
		goto out;

	ret = false;
	pte = huge_ptep_get(ptep);

	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
	if (huge_pte_none(pte))
		ret = true;
	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
		ret = true;
out:
	return ret;
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *vma,
					      unsigned long address,
					      unsigned long flags,
					      unsigned long reason)
{
	return false;	/* should never get here */
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Verify the pagetables are still not ok after having registered into
 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
 * userfault that has already been resolved, if userfaultfd_read and
 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
 * threads.
 */
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
					 unsigned long address,
					 unsigned long flags,
					 unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd, _pmd;
	pte_t *pte;
	bool ret = true;

	mmap_assert_locked(mm);

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		goto out;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		goto out;
	pmd = pmd_offset(pud, address);
	/*
	 * READ_ONCE must function as a barrier with narrower scope
	 * and it must be equivalent to:
	 *	_pmd = *pmd; barrier();
	 *
	 * This is to deal with the instability (as in
	 * pmd_trans_unstable) of the pmd.
	 */
	_pmd = READ_ONCE(*pmd);
	if (pmd_none(_pmd))
		goto out;

	ret = false;
	if (!pmd_present(_pmd))
		goto out;

	if (pmd_trans_huge(_pmd)) {
		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
			ret = true;
		goto out;
	}

	/*
	 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
	 * and use the standard pte_offset_map() instead of parsing _pmd.
	 */
	pte = pte_offset_map(pmd, address);
	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
	if (pte_none(*pte))
		ret = true;
	if (!pte_write(*pte) && (reason & VM_UFFD_WP))
		ret = true;
	pte_unmap(pte);

out:
	return ret;
}

static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
{
	if (flags & FAULT_FLAG_INTERRUPTIBLE)
		return TASK_INTERRUPTIBLE;

	if (flags & FAULT_FLAG_KILLABLE)
		return TASK_KILLABLE;

	return TASK_UNINTERRUPTIBLE;
}

/*
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there are
 * fatal_signal_pending()s, and the mmap_lock must be released before
 * returning it.
 */
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
	struct mm_struct *mm = vmf->vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue uwq;
	vm_fault_t ret = VM_FAULT_SIGBUS;
	bool must_wait;
	unsigned int blocking_state;

	/*
	 * We don't do userfault handling for the final child pid update.
	 *
	 * We also don't do userfault handling during
	 * coredumping. hugetlbfs has the special
	 * follow_hugetlb_page() to skip missing pages in the
	 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
	 * the no_page_table() helper in follow_page_mask(), but the
	 * shmem_vm_ops->fault method is invoked even during
	 * coredumping without mmap_lock and it ends up here.
	 */
	if (current->flags & (PF_EXITING|PF_DUMPCORE))
		goto out;

	/*
	 * Coredumping runs without mmap_lock so we can only check that
	 * the mmap_lock is held, if PF_DUMPCORE was not set.
	 */
	mmap_assert_locked(mm);

	ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
	if (!ctx)
		goto out;

	BUG_ON(ctx->mm != mm);

	/* Any unrecognized flag is a bug. */
	VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
	/* 0 or > 1 flags set is a bug; we expect exactly 1. */
	VM_BUG_ON(!reason || (reason & (reason - 1)));

	if (ctx->features & UFFD_FEATURE_SIGBUS)
		goto out;
	if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
	    ctx->flags & UFFD_USER_MODE_ONLY) {
		printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
			"sysctl knob to 1 if kernel faults must be handled "
			"without obtaining CAP_SYS_PTRACE capability\n");
		goto out;
	}

	/*
	 * If it's already released don't get it. This avoids looping
	 * in __get_user_pages if userfaultfd_release waits on the
	 * caller of handle_userfault to release the mmap_lock.
	 */
	if (unlikely(READ_ONCE(ctx->released))) {
		/*
		 * Don't return VM_FAULT_SIGBUS in this case, so a non
		 * cooperative manager can close the uffd after the
		 * last UFFDIO_COPY, without the risk of triggering an
		 * involuntary SIGBUS if the process was starting the
		 * userfaultfd while the userfaultfd was still armed
		 * (but after the last UFFDIO_COPY). If the uffd
		 * wasn't already closed when the userfault reached
		 * this point, that would normally be solved by
		 * userfaultfd_must_wait returning 'false'.
		 *
		 * If we were to return VM_FAULT_SIGBUS here, the non
		 * cooperative manager would be instead forced to
		 * always call UFFDIO_UNREGISTER before it can safely
		 * close the uffd.
		 */
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/*
	 * Check that we can return VM_FAULT_RETRY.
	 *
	 * NOTE: it should become possible to return VM_FAULT_RETRY
	 * even if FAULT_FLAG_TRIED is set without leading to gup()
	 * -EBUSY failures, if the userfaultfd is to be extended for
	 * VM_UFFD_WP tracking and we intend to arm the userfault
	 * without first stopping userland access to the memory. For
	 * VM_UFFD_MISSING userfaults this is enough for now.
	 */
	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
		/*
		 * Validate the invariant that nowait must allow retry
		 * to be sure not to return SIGBUS erroneously on
		 * nowait invocations.
		 */
		BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
		if (printk_ratelimit()) {
			printk(KERN_WARNING
			       "FAULT_FLAG_ALLOW_RETRY missing %x\n",
			       vmf->flags);
			dump_stack();
		}
#endif
		goto out;
	}

	/*
	 * Handle nowait, not much to do other than tell it to retry
	 * and wait.
	 */
	ret = VM_FAULT_RETRY;
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		goto out;

	/* take the reference before dropping the mmap_lock */
	userfaultfd_ctx_get(ctx);

	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
	uwq.wq.private = current;
	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
				ctx->features);
	uwq.ctx = ctx;
	uwq.waken = false;

	blocking_state = userfaultfd_get_blocking_state(vmf->flags);

	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
	/*
	 * The smp_mb() after __set_current_state prevents the reads
	 * following the spin_unlock to happen before the list_add in
	 * __add_wait_queue.
	 */
	set_current_state(blocking_state);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	if (!is_vm_hugetlb_page(vmf->vma))
		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
						  reason);
	else
		must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
						       vmf->address,
						       vmf->flags, reason);
	mmap_read_unlock(mm);

	if (likely(must_wait && !READ_ONCE(ctx->released))) {
		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();
	}

	__set_current_state(TASK_RUNNING);

	/*
	 * Here we race with the list_del; list_add in
	 * userfaultfd_ctx_read(), however because we don't ever run
	 * list_del_init() to refile across the two lists, the prev
	 * and next pointers will never point to self. list_add also
	 * would never let either of the two pointers point to
	 * self. So list_empty_careful won't risk seeing both pointers
	 * pointing to self at any time during the list refile. The
	 * only case where list_del_init() is called is the full
	 * removal in the wake function and there we don't re-list_add
	 * and it's fine not to block on the spinlock. The uwq on this
	 * kernel stack can be released after the list_del_init.
	 */
	if (!list_empty_careful(&uwq.wq.entry)) {
		spin_lock_irq(&ctx->fault_pending_wqh.lock);
		/*
		 * No need of list_del_init(), the uwq on the stack
		 * will be freed shortly anyway.
		 */
		list_del(&uwq.wq.entry);
		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
	userfaultfd_ctx_put(ctx);

out:
	return ret;
}
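
/*
 * For illustration: from the faulting thread's point of view the round
 * trip implemented by handle_userfault() above is, assuming a MISSING
 * fault and a cooperative monitor, roughly:
 *
 *	faulting thread				monitor thread
 *	---------------				--------------
 *	touches an unmapped page
 *	  queues uwq and blocks			poll()/read() returns the msg
 *						ioctl(uffd, UFFDIO_COPY, ...)
 *	  woken by the UFFDIO_COPY wakeup
 *	  returns VM_FAULT_RETRY, the fault
 *	  is retried and now succeeds
 */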

static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
					      struct userfaultfd_wait_queue *ewq)
{
	struct userfaultfd_ctx *release_new_ctx;

	if (WARN_ON_ONCE(current->flags & PF_EXITING))
		goto out;

	ewq->ctx = ctx;
	init_waitqueue_entry(&ewq->wq, current);
	release_new_ctx = NULL;

	spin_lock_irq(&ctx->event_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (ewq->msg.event == 0)
			break;
		if (READ_ONCE(ctx->released) ||
		    fatal_signal_pending(current)) {
			/*
			 * &ewq->wq may be queued in fork_event, but
			 * __remove_wait_queue ignores the head
			 * parameter. It would be a problem if it
			 * didn't.
			 */
			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
			if (ewq->msg.event == UFFD_EVENT_FORK) {
				struct userfaultfd_ctx *new;

				new = (struct userfaultfd_ctx *)
					(unsigned long)
					ewq->msg.arg.reserved.reserved1;
				release_new_ctx = new;
			}
			break;
		}

		spin_unlock_irq(&ctx->event_wqh.lock);

		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();

		spin_lock_irq(&ctx->event_wqh.lock);
	}
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->event_wqh.lock);

	if (release_new_ctx) {
		struct vm_area_struct *vma;
		struct mm_struct *mm = release_new_ctx->mm;

		/* the various vma->vm_userfaultfd_ctx still points to it */
		mmap_write_lock(mm);
		for (vma = mm->mmap; vma; vma = vma->vm_next)
			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				vma->vm_flags &= ~__VM_UFFD_FLAGS;
			}
		mmap_write_unlock(mm);

		userfaultfd_ctx_put(release_new_ctx);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
out:
	atomic_dec(&ctx->mmap_changing);
	VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
	userfaultfd_ctx_put(ctx);
}

static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
				       struct userfaultfd_wait_queue *ewq)
{
	ewq->msg.event = 0;
	wake_up_locked(&ctx->event_wqh);
	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
}

int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
{
	struct userfaultfd_ctx *ctx = NULL, *octx;
	struct userfaultfd_fork_ctx *fctx;

	octx = vma->vm_userfaultfd_ctx.ctx;
	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_flags &= ~__VM_UFFD_FLAGS;
		return 0;
	}

	list_for_each_entry(fctx, fcs, list)
		if (fctx->orig == octx) {
			ctx = fctx->new;
			break;
		}

	if (!ctx) {
		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
		if (!fctx)
			return -ENOMEM;

		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
		if (!ctx) {
			kfree(fctx);
			return -ENOMEM;
		}

		refcount_set(&ctx->refcount, 1);
		ctx->flags = octx->flags;
		ctx->features = octx->features;
		ctx->released = false;
		atomic_set(&ctx->mmap_changing, 0);
		ctx->mm = vma->vm_mm;
		mmgrab(ctx->mm);

		userfaultfd_ctx_get(octx);
		atomic_inc(&octx->mmap_changing);
		fctx->orig = octx;
		fctx->new = ctx;
		list_add_tail(&fctx->list, fcs);
	}

	vma->vm_userfaultfd_ctx.ctx = ctx;
	return 0;
}

static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
{
	struct userfaultfd_ctx *ctx = fctx->orig;
	struct userfaultfd_wait_queue ewq;

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_FORK;
	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;

	userfaultfd_event_wait_completion(ctx, &ewq);
}

void dup_userfaultfd_complete(struct list_head *fcs)
{
	struct userfaultfd_fork_ctx *fctx, *n;

	list_for_each_entry_safe(fctx, n, fcs, list) {
		dup_fctx(fctx);
		list_del(&fctx->list);
		kfree(fctx);
	}
}
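
/*
 * For illustration: when UFFD_FEATURE_EVENT_FORK is enabled, the monitor
 * observes the duplication above as a message carrying a brand new
 * userfaultfd for the child mm. A sketch of the monitor side (assuming a
 * struct uffd_msg "msg" just read from the parent's descriptor):
 *
 *	if (msg.event == UFFD_EVENT_FORK) {
 *		int child_uffd = (int)msg.arg.fork.ufd;
 *		// poll()/read() child_uffd like the original descriptor,
 *		// or close() it if the child is not of interest.
 *	}
 */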

void mremap_userfaultfd_prep(struct vm_area_struct *vma,
			     struct vm_userfaultfd_ctx *vm_ctx)
{
	struct userfaultfd_ctx *ctx;

	ctx = vma->vm_userfaultfd_ctx.ctx;

	if (!ctx)
		return;

	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
		vm_ctx->ctx = ctx;
		userfaultfd_ctx_get(ctx);
		atomic_inc(&ctx->mmap_changing);
	} else {
		/* Drop uffd context if remap feature not enabled */
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_flags &= ~__VM_UFFD_FLAGS;
	}
}

void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
				 unsigned long from, unsigned long to,
				 unsigned long len)
{
	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
	struct userfaultfd_wait_queue ewq;

	if (!ctx)
		return;

	if (to & ~PAGE_MASK) {
		userfaultfd_ctx_put(ctx);
		return;
	}

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_REMAP;
	ewq.msg.arg.remap.from = from;
	ewq.msg.arg.remap.to = to;
	ewq.msg.arg.remap.len = len;

	userfaultfd_event_wait_completion(ctx, &ewq);
}

bool userfaultfd_remove(struct vm_area_struct *vma,
			unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue ewq;

	ctx = vma->vm_userfaultfd_ctx.ctx;
	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
		return true;

	userfaultfd_ctx_get(ctx);
	atomic_inc(&ctx->mmap_changing);
	mmap_read_unlock(mm);

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_REMOVE;
	ewq.msg.arg.remove.start = start;
	ewq.msg.arg.remove.end = end;

	userfaultfd_event_wait_completion(ctx, &ewq);

	return false;
}

static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
			  unsigned long start, unsigned long end)
{
	struct userfaultfd_unmap_ctx *unmap_ctx;

	list_for_each_entry(unmap_ctx, unmaps, list)
		if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
		    unmap_ctx->end == end)
			return true;

	return false;
}

int userfaultfd_unmap_prep(struct vm_area_struct *vma,
			   unsigned long start, unsigned long end,
			   struct list_head *unmaps)
{
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
		struct userfaultfd_unmap_ctx *unmap_ctx;
		struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;

		if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
		    has_unmap_ctx(ctx, unmaps, start, end))
			continue;

		unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
		if (!unmap_ctx)
			return -ENOMEM;

		userfaultfd_ctx_get(ctx);
		atomic_inc(&ctx->mmap_changing);
		unmap_ctx->ctx = ctx;
		unmap_ctx->start = start;
		unmap_ctx->end = end;
		list_add_tail(&unmap_ctx->list, unmaps);
	}

	return 0;
}

void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
{
	struct userfaultfd_unmap_ctx *ctx, *n;
	struct userfaultfd_wait_queue ewq;

	list_for_each_entry_safe(ctx, n, uf, list) {
		msg_init(&ewq.msg);

		ewq.msg.event = UFFD_EVENT_UNMAP;
		ewq.msg.arg.remove.start = ctx->start;
		ewq.msg.arg.remove.end = ctx->end;

		userfaultfd_event_wait_completion(ctx->ctx, &ewq);

		list_del(&ctx->list);
		kfree(ctx);
	}
}

static int userfaultfd_release(struct inode *inode, struct file *file)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev;
	/* len == 0 means wake all */
	struct userfaultfd_wake_range range = { .len = 0, };
	unsigned long new_flags;

	WRITE_ONCE(ctx->released, true);

	if (!mmget_not_zero(mm))
		goto wakeup;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_lock for writing.
	 */
	mmap_write_lock(mm);
	prev = NULL;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		cond_resched();
		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
		       !!(vma->vm_flags & __VM_UFFD_FLAGS));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}
		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
		prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
				 new_flags, vma->anon_vma,
				 vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 NULL_VM_UFFD_CTX, vma_anon_name(vma));
		if (prev)
			vma = prev;
		else
			prev = vma;
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
	}
	mmap_write_unlock(mm);
	mmput(mm);
wakeup:
	/*
	 * After no new page faults can wait on this fault_*wqh, flush
	 * the last page faults that may have been already waiting on
	 * the fault_*wqh.
	 */
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/* Flush pending events that may still wait on event_wqh */
	wake_up_all(&ctx->event_wqh);

	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
	userfaultfd_ctx_put(ctx);
	return 0;
}

/* fault_pending_wqh.lock must be held by the caller */
static inline struct userfaultfd_wait_queue *find_userfault_in(
		wait_queue_head_t *wqh)
{
	wait_queue_entry_t *wq;
	struct userfaultfd_wait_queue *uwq;

	lockdep_assert_held(&wqh->lock);

	uwq = NULL;
	if (!waitqueue_active(wqh))
		goto out;
	/* walk in reverse to provide FIFO behavior to read userfaults */
	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
out:
	return uwq;
}

static inline struct userfaultfd_wait_queue *find_userfault(
		struct userfaultfd_ctx *ctx)
{
	return find_userfault_in(&ctx->fault_pending_wqh);
}

static inline struct userfaultfd_wait_queue *find_userfault_evt(
		struct userfaultfd_ctx *ctx)
{
	return find_userfault_in(&ctx->event_wqh);
}

static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	__poll_t ret;

	poll_wait(file, &ctx->fd_wqh, wait);

	if (!userfaultfd_is_initialized(ctx))
		return EPOLLERR;

	/*
	 * poll() never guarantees that read won't block.
	 * userfaults can be woken before they're read().
	 */
	if (unlikely(!(file->f_flags & O_NONBLOCK)))
		return EPOLLERR;
	/*
	 * lockless access to see if there are pending faults;
	 * __pollwait last action is the add_wait_queue but
	 * the spin_unlock would allow the waitqueue_active to
	 * pass above the actual list_add inside
	 * add_wait_queue critical section. So use a full
	 * memory barrier to serialize the list_add write of
	 * add_wait_queue() with the waitqueue_active read
	 * below.
	 */
	ret = 0;
	smp_mb();
	if (waitqueue_active(&ctx->fault_pending_wqh))
		ret = EPOLLIN;
	else if (waitqueue_active(&ctx->event_wqh))
		ret = EPOLLIN;

	return ret;
}

static const struct file_operations userfaultfd_fops;

static int resolve_userfault_fork(struct userfaultfd_ctx *new,
				  struct inode *inode,
				  struct uffd_msg *msg)
{
	int fd;

	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
			O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
	if (fd < 0)
		return fd;

	msg->arg.reserved.reserved1 = 0;
	msg->arg.fork.ufd = fd;
	return 0;
}

static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
				    struct uffd_msg *msg, struct inode *inode)
{
	ssize_t ret;
	DECLARE_WAITQUEUE(wait, current);
	struct userfaultfd_wait_queue *uwq;
	/*
	 * Handling fork event requires sleeping operations, so
	 * we drop the event_wqh lock, then do these ops, then
	 * lock it back and wake up the waiter. While the lock is
	 * dropped the ewq may go away so we keep track of it
	 * carefully.
	 */
	LIST_HEAD(fork_event);
	struct userfaultfd_ctx *fork_nctx = NULL;

	/* always take the fd_wqh lock before the fault_pending_wqh lock */
	spin_lock_irq(&ctx->fd_wqh.lock);
	__add_wait_queue(&ctx->fd_wqh, &wait);
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&ctx->fault_pending_wqh.lock);
		uwq = find_userfault(ctx);
		if (uwq) {
			/*
			 * Use a seqcount to repeat the lockless check
			 * in wake_userfault() to avoid missing
			 * wakeups because during the refile both
			 * waitqueue could become empty if this is the
			 * only userfault.
			 */
			write_seqcount_begin(&ctx->refile_seq);

			/*
			 * The fault_pending_wqh.lock prevents the uwq
			 * from disappearing from under us.
			 *
			 * Refile this userfault from
			 * fault_pending_wqh to fault_wqh, it's not
			 * pending anymore after we read it.
			 *
			 * Use list_del() by hand (as
			 * userfaultfd_wake_function also uses
			 * list_del_init() by hand) to be sure nobody
			 * changes __remove_wait_queue() to use
			 * list_del_init() in turn breaking the
			 * !list_empty_careful() check in
			 * handle_userfault(). The uwq->wq.head list
			 * must never be empty at any time during the
			 * refile, or the waitqueue could disappear
			 * from under us. The "wait_queue_head_t"
			 * parameter of __remove_wait_queue() is unused
			 * anyway.
			 */
			list_del(&uwq->wq.entry);
			add_wait_queue(&ctx->fault_wqh, &uwq->wq);

			write_seqcount_end(&ctx->refile_seq);

			/* careful to always initialize msg if ret == 0 */
			*msg = uwq->msg;
			spin_unlock(&ctx->fault_pending_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->fault_pending_wqh.lock);

		spin_lock(&ctx->event_wqh.lock);
		uwq = find_userfault_evt(ctx);
		if (uwq) {
			*msg = uwq->msg;

			if (uwq->msg.event == UFFD_EVENT_FORK) {
				fork_nctx = (struct userfaultfd_ctx *)
					(unsigned long)
					uwq->msg.arg.reserved.reserved1;
				list_move(&uwq->wq.entry, &fork_event);
				/*
				 * fork_nctx can be freed as soon as
				 * we drop the lock, unless we take a
				 * reference on it.
				 */
				userfaultfd_ctx_get(fork_nctx);
				spin_unlock(&ctx->event_wqh.lock);
				ret = 0;
				break;
			}

			userfaultfd_event_complete(ctx, uwq);
			spin_unlock(&ctx->event_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->event_wqh.lock);

		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (no_wait) {
			ret = -EAGAIN;
			break;
		}
		spin_unlock_irq(&ctx->fd_wqh.lock);
		schedule();
		spin_lock_irq(&ctx->fd_wqh.lock);
	}
	__remove_wait_queue(&ctx->fd_wqh, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->fd_wqh.lock);

	if (!ret && msg->event == UFFD_EVENT_FORK) {
		ret = resolve_userfault_fork(fork_nctx, inode, msg);
		spin_lock_irq(&ctx->event_wqh.lock);
		if (!list_empty(&fork_event)) {
			/*
			 * The fork thread didn't abort, so we can
			 * drop the temporary refcount.
			 */
			userfaultfd_ctx_put(fork_nctx);

			uwq = list_first_entry(&fork_event,
					       typeof(*uwq),
					       wq.entry);
			/*
			 * If fork_event list wasn't empty and in turn
			 * the event wasn't already released by fork
			 * (the event is allocated on fork kernel
			 * stack), put the event back to its place in
			 * the event_wq. fork_event head will be freed
			 * as soon as we return so the event cannot
			 * stay queued there no matter the current
			 * "ret" value.
			 */
			list_del(&uwq->wq.entry);
			__add_wait_queue(&ctx->event_wqh, &uwq->wq);

			/*
			 * Leave the event in the waitqueue and report
			 * error to userland if we failed to resolve
			 * the userfault fork.
			 */
			if (likely(!ret))
				userfaultfd_event_complete(ctx, uwq);
		} else {
			/*
			 * Here the fork thread aborted and the
			 * refcount from the fork thread on fork_nctx
			 * has already been released. We still hold
			 * the reference we took before releasing the
			 * lock above. If resolve_userfault_fork
			 * failed we have to drop it because the
			 * fork_nctx has to be freed in such case. If
			 * it succeeded we'll hold it because the new
			 * uffd references it.
			 */
			if (ret)
				userfaultfd_ctx_put(fork_nctx);
		}
		spin_unlock_irq(&ctx->event_wqh.lock);
	}

	return ret;
}

static ssize_t userfaultfd_read(struct file *file, char __user *buf,
				size_t count, loff_t *ppos)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	ssize_t _ret, ret = 0;
	struct uffd_msg msg;
	int no_wait = file->f_flags & O_NONBLOCK;
	struct inode *inode = file_inode(file);

	if (!userfaultfd_is_initialized(ctx))
		return -EINVAL;

	for (;;) {
		if (count < sizeof(msg))
			return ret ? ret : -EINVAL;
		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
		if (_ret < 0)
			return ret ? ret : _ret;
		if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
			return ret ? ret : -EFAULT;
		ret += sizeof(msg);
		buf += sizeof(msg);
		count -= sizeof(msg);
		/*
		 * Allow reading more than one fault at a time, but only
		 * block if waiting for the very first one.
		 */
		no_wait = O_NONBLOCK;
	}
}

static void __wake_userfault(struct userfaultfd_ctx *ctx,
			     struct userfaultfd_wake_range *range)
{
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	/* wake all in the range and autoremove */
	if (waitqueue_active(&ctx->fault_pending_wqh))
		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
				     range);
	if (waitqueue_active(&ctx->fault_wqh))
		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
}

static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
					   struct userfaultfd_wake_range *range)
{
	unsigned seq;
	bool need_wakeup;

	/*
	 * To be sure waitqueue_active() is not reordered by the CPU
	 * before the pagetable update, use an explicit SMP memory
	 * barrier here. PT lock release or mmap_read_unlock(mm) still
	 * have release semantics that can allow the
	 * waitqueue_active() to be reordered before the pte update.
	 */
	smp_mb();

	/*
	 * Use waitqueue_active because it's very frequent to
	 * change the address space atomically even if there are no
	 * userfaults yet. So we take the spinlock only when we're
	 * sure we have userfaults to wake.
	 */
	do {
		seq = read_seqcount_begin(&ctx->refile_seq);
		need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
			waitqueue_active(&ctx->fault_wqh);
		cond_resched();
	} while (read_seqcount_retry(&ctx->refile_seq, seq));
	if (need_wakeup)
		__wake_userfault(ctx, range);
}

static __always_inline int validate_range(struct mm_struct *mm,
					  __u64 start, __u64 len)
{
	__u64 task_size = mm->task_size;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	if (len & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return -EINVAL;
	if (start < mmap_min_addr)
		return -EINVAL;
	if (start >= task_size)
		return -EINVAL;
	if (len > task_size - start)
		return -EINVAL;
	return 0;
}

static inline bool vma_can_userfault(struct vm_area_struct *vma,
				     unsigned long vm_flags)
{
	/* FIXME: add WP support to hugetlbfs and shmem */
	if (vm_flags & VM_UFFD_WP) {
		if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma))
			return false;
	}

	if (vm_flags & VM_UFFD_MINOR) {
		if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
			return false;
	}

	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
	       vma_is_shmem(vma);
}
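
/*
 * For illustration: validate_range() above is what forces userland to pass
 * page aligned, non-empty ranges. A minimal registration sketch (assuming
 * "uffd" is an initialized userfaultfd and "addr"/"len" are PAGE_SIZE
 * aligned) looks like:
 *
 *	struct uffdio_register reg = {
 *		.range = { .start = (unsigned long)addr, .len = len },
 *		.mode  = UFFDIO_REGISTER_MODE_MISSING,
 *	};
 *
 *	if (ioctl(uffd, UFFDIO_REGISTER, &reg) == -1)
 *		;	// e.g. EINVAL for a misaligned or empty range
 *	// reg.ioctls then reports which range ioctls are guaranteed to work.
 */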

static int userfaultfd_register(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_register uffdio_register;
	struct uffdio_register __user *user_uffdio_register;
	unsigned long vm_flags, new_flags;
	bool found;
	bool basic_ioctls;
	unsigned long start, end, vma_end;

	user_uffdio_register = (struct uffdio_register __user *) arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_register, user_uffdio_register,
			   sizeof(uffdio_register)-sizeof(__u64)))
		goto out;

	ret = -EINVAL;
	if (!uffdio_register.mode)
		goto out;
	if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
		goto out;
	vm_flags = 0;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
		vm_flags |= VM_UFFD_MISSING;
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
		goto out;
#endif
		vm_flags |= VM_UFFD_WP;
	}
	if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
		goto out;
#endif
		vm_flags |= VM_UFFD_MINOR;
	}

	ret = validate_range(mm, uffdio_register.range.start,
			     uffdio_register.range.len);
	if (ret)
		goto out;

	start = uffdio_register.range.start;
	end = start + uffdio_register.range.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	mmap_write_lock(mm);
	vma = find_vma_prev(mm, start, &prev);
	if (!vma)
		goto out_unlock;

	/* check that there's at least one vma in the range */
	ret = -EINVAL;
	if (vma->vm_start >= end)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for incompatible vmas.
	 */
	found = false;
	basic_ioctls = false;
	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & __VM_UFFD_FLAGS));

		/* check for incompatible vmas */
		ret = -EINVAL;
		if (!vma_can_userfault(cur, vm_flags))
			goto out_unlock;

		/*
		 * UFFDIO_COPY will fill file holes even without
		 * PROT_WRITE. This check enforces that if this is a
		 * MAP_SHARED, the process has write permission to the backing
		 * file. If VM_MAYWRITE is set it also enforces that on a
		 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
		 * F_WRITE_SEAL can be taken until the vma is destroyed.
		 */
		ret = -EPERM;
		if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
			goto out_unlock;

		/*
		 * If this vma contains the ending address, and huge pages,
		 * check alignment.
		 */
		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
		    end > cur->vm_start) {
			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);

			ret = -EINVAL;

			if (end & (vma_hpagesize - 1))
				goto out_unlock;
		}
		if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
			goto out_unlock;

		/*
		 * Check that this vma isn't already owned by a
		 * different userfaultfd. We can't allow more than one
		 * userfaultfd to own a single vma simultaneously or we
		 * wouldn't know which one to deliver the userfaults to.
		 */
		ret = -EBUSY;
		if (cur->vm_userfaultfd_ctx.ctx &&
		    cur->vm_userfaultfd_ctx.ctx != ctx)
			goto out_unlock;

		/*
		 * Note vmas containing huge pages
		 */
		if (is_vm_hugetlb_page(cur))
			basic_ioctls = true;

		found = true;
	}
	BUG_ON(!found);

	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	do {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vm_flags));
		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
		       vma->vm_userfaultfd_ctx.ctx != ctx);
		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
		prev = vma_merge(mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 ((struct vm_userfaultfd_ctx){ ctx }),
				 vma_anon_name(vma));
		if (prev) {
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(mm, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(mm, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx.ctx = ctx;

		if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
			hugetlb_unshare_all_pmds(vma);

	skip:
		prev = vma;
		start = vma->vm_end;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
out_unlock:
	mmap_write_unlock(mm);
	mmput(mm);
	if (!ret) {
		__u64 ioctls_out;

		ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
		    UFFD_API_RANGE_IOCTLS;

		/*
		 * Declare the WP ioctl only if the WP mode is
		 * specified and all checks passed with the range
		 */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);

		/* CONTINUE ioctl is only supported for MINOR ranges. */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
			ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);

		/*
		 * Now that we scanned all vmas we can already tell
		 * userland which ioctl methods are guaranteed to
		 * succeed on this range.
		 */
		if (put_user(ioctls_out, &user_uffdio_register->ioctls))
			ret = -EFAULT;
	}
out:
	return ret;
}

static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
				  unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_range uffdio_unregister;
	unsigned long new_flags;
	bool found;
	unsigned long start, end, vma_end;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
		goto out;

	ret = validate_range(mm, uffdio_unregister.start,
			     uffdio_unregister.len);
	if (ret)
		goto out;

	start = uffdio_unregister.start;
	end = start + uffdio_unregister.len;

	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out;

	mmap_write_lock(mm);
	vma = find_vma_prev(mm, start, &prev);
	if (!vma)
		goto out_unlock;

	/* check that there's at least one vma in the range */
	ret = -EINVAL;
	if (vma->vm_start >= end)
		goto out_unlock;

	/*
	 * If the first vma contains huge pages, make sure start address
	 * is aligned to huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/*
	 * Search for incompatible vmas.
	 */
	found = false;
	ret = -EINVAL;
	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & __VM_UFFD_FLAGS));

		/*
		 * Check for incompatible vmas, not strictly required
		 * here as incompatible vmas cannot have a
		 * userfaultfd_ctx registered on them, but this
		 * provides for more strict behavior to notice
		 * unregistration errors.
		 */
		if (!vma_can_userfault(cur, cur->vm_flags))
			goto out_unlock;

		found = true;
	}
	BUG_ON(!found);

	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	do {
		cond_resched();

		BUG_ON(!vma_can_userfault(vma, vma->vm_flags));

		/*
		 * Nothing to do: this vma is not registered into any
		 * userfaultfd, so there is nothing to unregister.
		 */
		if (!vma->vm_userfaultfd_ctx.ctx)
			goto skip;

		WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end);

		if (userfaultfd_missing(vma)) {
			/*
			 * Wake any concurrent pending userfault while
			 * we unregister, so they will not hang
			 * permanently and it avoids userland having to
			 * call UFFDIO_WAKE explicitly.
			 */
			struct userfaultfd_wake_range range;
			range.start = start;
			range.len = vma_end - start;
			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
		}

		new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
		prev = vma_merge(mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 NULL_VM_UFFD_CTX, vma_anon_name(vma));
		if (prev) {
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(mm, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(mm, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

	skip:
		prev = vma;
		start = vma->vm_end;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
out_unlock:
	mmap_write_unlock(mm);
	mmput(mm);
out:
	return ret;
}
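
/*
 * For illustration: the DONTWAKE + UFFDIO_WAKE batching mentioned in the
 * comment below can be sketched as follows (assuming "uffd", a resolved
 * fault at "fault_addr" served from a local "page", and hypothetical
 * batch_start/batch_len bookkeeping kept by the monitor):
 *
 *	struct uffdio_copy cpy = {
 *		.dst  = fault_addr & ~(page_size - 1),
 *		.src  = (unsigned long)page,
 *		.len  = page_size,
 *		.mode = UFFDIO_COPY_MODE_DONTWAKE,	// defer the wakeup
 *	};
 *	ioctl(uffd, UFFDIO_COPY, &cpy);
 *	...					// resolve more faults
 *	struct uffdio_range rng = { .start = batch_start, .len = batch_len };
 *	ioctl(uffd, UFFDIO_WAKE, &rng);		// wake the whole batch at once
 */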

/*
 * userfaultfd_wake may be used in combination with the
 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	int ret;
	struct uffdio_range uffdio_wake;
	struct userfaultfd_wake_range range;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
		goto out;

	ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
	if (ret)
		goto out;

	range.start = uffdio_wake.start;
	range.len = uffdio_wake.len;

	/*
	 * len == 0 means wake all and we don't want to wake all here,
	 * so check it again to be sure.
	 */
	VM_BUG_ON(!range.len);

	wake_userfault(ctx, &range);
	ret = 0;

out:
	return ret;
}

static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	__s64 ret;
	struct uffdio_copy uffdio_copy;
	struct uffdio_copy __user *user_uffdio_copy;
	struct userfaultfd_wake_range range;

	user_uffdio_copy = (struct uffdio_copy __user *) arg;

	ret = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
			   /* don't copy "copy" last field */
			   sizeof(uffdio_copy)-sizeof(__s64)))
		goto out;

	ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
	if (ret)
		goto out;
	/*
	 * double check for wraparound just in case. copy_from_user()
	 * will later check uffdio_copy.src + uffdio_copy.len to fit
	 * in the userland range.
	 */
	ret = -EINVAL;
	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
		goto out;
	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
		goto out;
	if (mmget_not_zero(ctx->mm)) {
		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
				   uffdio_copy.len, &ctx->mmap_changing,
				   uffdio_copy.mode);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}
	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	BUG_ON(!ret);
	/* len == 0 would wake all */
	range.len = ret;
	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
		range.start = uffdio_copy.dst;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
out:
	return ret;
}

static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	__s64 ret;
	struct uffdio_zeropage uffdio_zeropage;
	struct uffdio_zeropage __user *user_uffdio_zeropage;
	struct userfaultfd_wake_range range;

	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;

	ret = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
			   /* don't copy "zeropage" last field */
			   sizeof(uffdio_zeropage)-sizeof(__s64)))
		goto out;

	ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
			     uffdio_zeropage.range.len);
	if (ret)
		goto out;
	ret = -EINVAL;
	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
		goto out;

	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
				     uffdio_zeropage.range.len,
				     &ctx->mmap_changing);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}
	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
		range.start = uffdio_zeropage.range.start;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
out:
	return ret;
}

static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
				    unsigned long arg)
{
	int ret;
	struct uffdio_writeprotect uffdio_wp;
	struct uffdio_writeprotect __user *user_uffdio_wp;
	struct userfaultfd_wake_range range;
	bool mode_wp, mode_dontwake;

	if (atomic_read(&ctx->mmap_changing))
		return -EAGAIN;

	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;

	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
			   sizeof(struct uffdio_writeprotect)))
		return -EFAULT;

	ret = validate_range(ctx->mm, uffdio_wp.range.start,
			     uffdio_wp.range.len);
	if (ret)
		return ret;

	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
			       UFFDIO_WRITEPROTECT_MODE_WP))
		return -EINVAL;

	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;

	if (mode_wp && mode_dontwake)
		return -EINVAL;

	if (mmget_not_zero(ctx->mm)) {
		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
					  uffdio_wp.range.len, mode_wp,
					  &ctx->mmap_changing);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}

	if (ret)
		return ret;

	if (!mode_wp && !mode_dontwake) {
		range.start = uffdio_wp.range.start;
		range.len = uffdio_wp.range.len;
		wake_userfault(ctx, &range);
	}
	return ret;
}

static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
	__s64 ret;
	struct uffdio_continue uffdio_continue;
	struct uffdio_continue __user *user_uffdio_continue;
	struct userfaultfd_wake_range range;

	user_uffdio_continue = (struct uffdio_continue __user *)arg;

	ret = -EAGAIN;
	if (atomic_read(&ctx->mmap_changing))
		goto out;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_continue, user_uffdio_continue,
			   /* don't copy the output fields */
			   sizeof(uffdio_continue) - (sizeof(__s64))))
		goto out;

	ret = validate_range(ctx->mm, uffdio_continue.range.start,
			     uffdio_continue.range.len);
	if (ret)
		goto out;

	ret = -EINVAL;
	/* double check for wraparound just in case. */
	if (uffdio_continue.range.start + uffdio_continue.range.len <=
	    uffdio_continue.range.start) {
		goto out;
	}
	if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
		goto out;

	if (mmget_not_zero(ctx->mm)) {
		ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
				     uffdio_continue.range.len,
				     &ctx->mmap_changing);
		mmput(ctx->mm);
	} else {
		return -ESRCH;
	}

	if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
		return -EFAULT;
	if (ret < 0)
		goto out;

	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
		range.start = uffdio_continue.range.start;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;

out:
	return ret;
}

static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	/*
	 * For the current set of features the bits just coincide. Set
	 * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
	 */
	return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}

/*
 * userland asks for a certain API version and we return which bits
 * and ioctl commands are implemented in this kernel for such API
 * version or -EINVAL if unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
			   unsigned long arg)
{
	struct uffdio_api uffdio_api;
	void __user *buf = (void __user *)arg;
	unsigned int ctx_features;
	int ret;
	__u64 features;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
		goto out;
	features = uffdio_api.features;
	ret = -EINVAL;
	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
		goto err_out;
	ret = -EPERM;
	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
		goto err_out;
	/* report all available features and ioctls to userland */
	uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
	uffdio_api.features &=
		~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
	uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
	uffdio_api.ioctls = UFFD_API_IOCTLS;
	ret = -EFAULT;
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		goto out;

	/* only enable the requested features for this uffd context */
	ctx_features = uffd_ctx_features(features);
	ret = -EINVAL;
	if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
		goto err_out;

	ret = 0;
out:
	return ret;
err_out:
	memset(&uffdio_api, 0, sizeof(uffdio_api));
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		ret = -EFAULT;
	goto out;
}
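
/*
 * For illustration, the userland side of the handshake above is typically
 * the documented two-step UFFDIO_API exchange (a sketch; error handling is
 * left to the caller):
 *
 *	struct uffdio_api api = {
 *		.api = UFFD_API,
 *		.features = 0,		// or a subset of the advertised bits
 *	};
 *
 *	if (ioctl(uffd, UFFDIO_API, &api) == -1)
 *		;	// unknown api/features, or the API was already set
 *	// api.features/api.ioctls now describe what this kernel supports.
 */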

static long userfaultfd_ioctl(struct file *file, unsigned cmd,
			      unsigned long arg)
{
	int ret = -EINVAL;
	struct userfaultfd_ctx *ctx = file->private_data;

	if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
		return -EINVAL;

	switch(cmd) {
	case UFFDIO_API:
		ret = userfaultfd_api(ctx, arg);
		break;
	case UFFDIO_REGISTER:
		ret = userfaultfd_register(ctx, arg);
		break;
	case UFFDIO_UNREGISTER:
		ret = userfaultfd_unregister(ctx, arg);
		break;
	case UFFDIO_WAKE:
		ret = userfaultfd_wake(ctx, arg);
		break;
	case UFFDIO_COPY:
		ret = userfaultfd_copy(ctx, arg);
		break;
	case UFFDIO_ZEROPAGE:
		ret = userfaultfd_zeropage(ctx, arg);
		break;
	case UFFDIO_WRITEPROTECT:
		ret = userfaultfd_writeprotect(ctx, arg);
		break;
	case UFFDIO_CONTINUE:
		ret = userfaultfd_continue(ctx, arg);
		break;
	}
	return ret;
}

#ifdef CONFIG_PROC_FS
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data;
	wait_queue_entry_t *wq;
	unsigned long pending = 0, total = 0;

	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
		pending++;
		total++;
	}
	list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
		total++;
	}
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/*
	 * If more protocols will be added, they will all be shown
	 * separated by a space. Like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
		   pending, total, UFFD_API, ctx->features,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

static const struct file_operations userfaultfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= userfaultfd_show_fdinfo,
#endif
	.release	= userfaultfd_release,
	.poll		= userfaultfd_poll,
	.read		= userfaultfd_read,
	.unlocked_ioctl = userfaultfd_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};

static void init_once_userfaultfd_ctx(void *mem)
{
	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;

	init_waitqueue_head(&ctx->fault_pending_wqh);
	init_waitqueue_head(&ctx->fault_wqh);
	init_waitqueue_head(&ctx->event_wqh);
	init_waitqueue_head(&ctx->fd_wqh);
	seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
}

SYSCALL_DEFINE1(userfaultfd, int, flags)
{
	struct userfaultfd_ctx *ctx;
	int fd;

	if (!sysctl_unprivileged_userfaultfd &&
	    (flags & UFFD_USER_MODE_ONLY) == 0 &&
	    !capable(CAP_SYS_PTRACE)) {
		printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
			"sysctl knob to 1 if kernel faults must be handled "
			"without obtaining CAP_SYS_PTRACE capability\n");
		return -EPERM;
	}

	BUG_ON(!current->mm);

	/* Check the UFFD_* constants for consistency. */
	BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

	if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
		return -EINVAL;

	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	refcount_set(&ctx->refcount, 1);
	ctx->flags = flags;
	ctx->features = 0;
	ctx->released = false;
	atomic_set(&ctx->mmap_changing, 0);
	ctx->mm = current->mm;
	/* prevent the mm struct from being freed */
	mmgrab(ctx->mm);

	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
			O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
	if (fd < 0) {
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
	return fd;
}

static int __init userfaultfd_init(void)
{
	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
						sizeof(struct userfaultfd_ctx),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						init_once_userfaultfd_ctx);
	return 0;
}
__initcall(userfaultfd_init);
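
/*
 * For illustration, the minimal userland life cycle of the descriptor
 * created by the syscall above (a sketch along the lines of the
 * userfaultfd(2) man page; error handling and the monitor thread are
 * omitted):
 *
 *	#include <fcntl.h>
 *	#include <linux/userfaultfd.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
 *
 *	struct uffdio_api api = { .api = UFFD_API };
 *	ioctl(uffd, UFFDIO_API, &api);
 *
 *	struct uffdio_register reg = {
 *		.range = { .start = (unsigned long)area, .len = area_len },
 *		.mode  = UFFDIO_REGISTER_MODE_MISSING,
 *	};
 *	ioctl(uffd, UFFDIO_REGISTER, &reg);
 *
 *	// A monitor thread then poll()s/read()s struct uffd_msg from uffd
 *	// and resolves each UFFD_EVENT_PAGEFAULT with UFFDIO_COPY or
 *	// UFFDIO_ZEROPAGE, which also wakes the faulting thread.
 *
 * "area"/"area_len" above stand for a page aligned mapping created by the
 * caller, e.g. with mmap(MAP_PRIVATE|MAP_ANONYMOUS).
 */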