userfaultfd.c (ba85c702e4b247393ffe9e3fbc13d8aee7b02059, old) → userfaultfd.c (15b726ef048b31a24b3fefb6863083a25fe34800, new)
1/*
2 * fs/userfaultfd.c
3 *
4 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
5 * Copyright (C) 2008-2009 Red Hat, Inc.
6 * Copyright (C) 2015 Red Hat, Inc.
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2. See

--- 21 unchanged lines hidden ---

30enum userfaultfd_state {
31 UFFD_STATE_WAIT_API,
32 UFFD_STATE_RUNNING,
33};
34
35struct userfaultfd_ctx {
36 /* pseudo fd refcounting */
37 atomic_t refcount;
38 /* waitqueue head for the userfaultfd page faults */
38 /* waitqueue head for the pending (i.e. not read) userfaults */
39 wait_queue_head_t fault_pending_wqh;
40 /* waitqueue head for the userfaults */
39 wait_queue_head_t fault_wqh;
40 /* waitqueue head for the pseudo fd to wakeup poll/read */
41 wait_queue_head_t fd_wqh;
42 /* userfaultfd syscall flags */
43 unsigned int flags;
44 /* state machine */
45 enum userfaultfd_state state;
46 /* released */
47 bool released;
 48 /* mm with one or more vmas attached to this userfaultfd_ctx */
49 struct mm_struct *mm;
50};
51
52struct userfaultfd_wait_queue {
53 struct uffd_msg msg;
54 wait_queue_t wq;
41 wait_queue_head_t fault_wqh;
42 /* waitqueue head for the pseudo fd to wakeup poll/read */
43 wait_queue_head_t fd_wqh;
44 /* userfaultfd syscall flags */
45 unsigned int flags;
46 /* state machine */
47 enum userfaultfd_state state;
48 /* released */
49 bool released;
 50 /* mm with one or more vmas attached to this userfaultfd_ctx */
51 struct mm_struct *mm;
52};
53
54struct userfaultfd_wait_queue {
55 struct uffd_msg msg;
56 wait_queue_t wq;
55 /*
56 * Only relevant when queued in fault_wqh and only used by the
57 * read operation to avoid reading the same userfault twice.
58 */
59 bool pending;
60 struct userfaultfd_ctx *ctx;
61};
62
63struct userfaultfd_wake_range {
64 unsigned long start;
65 unsigned long len;
66};
67

--- 190 unchanged lines hidden ---

258 userfaultfd_ctx_get(ctx);
259
260 /* be gentle and immediately relinquish the mmap_sem */
261 up_read(&mm->mmap_sem);
262
263 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
264 uwq.wq.private = current;
265 uwq.msg = userfault_msg(address, flags, reason);
57 struct userfaultfd_ctx *ctx;
58};
59
60struct userfaultfd_wake_range {
61 unsigned long start;
62 unsigned long len;
63};
64
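
The split above is the core of this change: a userfault is first queued on fault_pending_wqh by handle_userfault(), refiled onto fault_wqh once read() has reported it, and finally unlinked by the wake function. Both queues carry the same embedded wait_queue_t, recovered with container_of() as in this minimal sketch (wq_to_uwq() is a hypothetical helper, not a function in this file):

static inline struct userfaultfd_wait_queue *wq_to_uwq(wait_queue_t *wq)
{
	/* the wait_queue_t is embedded in the tracking structure, so any
	   queue walk can get back to the outer object */
	return container_of(wq, struct userfaultfd_wait_queue, wq);
}
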

--- 190 unchanged lines hidden ---

255 userfaultfd_ctx_get(ctx);
256
257 /* be gentle and immediately relinquish the mmap_sem */
258 up_read(&mm->mmap_sem);
259
260 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
261 uwq.wq.private = current;
262 uwq.msg = userfault_msg(address, flags, reason);
266 uwq.pending = true;
267 uwq.ctx = ctx;
268
263 uwq.ctx = ctx;
264
269 spin_lock(&ctx->fault_wqh.lock);
265 spin_lock(&ctx->fault_pending_wqh.lock);
270 /*
271 * After the __add_wait_queue the uwq is visible to userland
272 * through poll/read().
273 */
266 /*
267 * After the __add_wait_queue the uwq is visible to userland
268 * through poll/read().
269 */
274 __add_wait_queue(&ctx->fault_wqh, &uwq.wq);
270 __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
271 /*
272 * The smp_mb() after __set_current_state prevents the reads
 273 * following the spin_unlock from happening before the list_add in
274 * __add_wait_queue.
275 */
275 set_current_state(TASK_KILLABLE);
276 set_current_state(TASK_KILLABLE);
276 spin_unlock(&ctx->fault_wqh.lock);
277 spin_unlock(&ctx->fault_pending_wqh.lock);
277
278 if (likely(!ACCESS_ONCE(ctx->released) &&
279 !fatal_signal_pending(current))) {
280 wake_up_poll(&ctx->fd_wqh, POLLIN);
281 schedule();
282 ret |= VM_FAULT_MAJOR;
283 }
284
285 __set_current_state(TASK_RUNNING);
278
279 if (likely(!ACCESS_ONCE(ctx->released) &&
280 !fatal_signal_pending(current))) {
281 wake_up_poll(&ctx->fd_wqh, POLLIN);
282 schedule();
283 ret |= VM_FAULT_MAJOR;
284 }
285
286 __set_current_state(TASK_RUNNING);
286 /* see finish_wait() comment for why list_empty_careful() */
287
288 /*
289 * Here we race with the list_del; list_add in
 290 * userfaultfd_ctx_read(). However, because we never run
 291 * list_del_init() to refile across the two lists, the prev
 292 * and next pointers will never point to self. list_add also
 293 * would never let either of the two pointers point to
 294 * self. So list_empty_careful() won't risk seeing both pointers
295 * pointing to self at any time during the list refile. The
296 * only case where list_del_init() is called is the full
297 * removal in the wake function and there we don't re-list_add
298 * and it's fine not to block on the spinlock. The uwq on this
299 * kernel stack can be released after the list_del_init.
300 */
287 if (!list_empty_careful(&uwq.wq.task_list)) {
301 if (!list_empty_careful(&uwq.wq.task_list)) {
288 spin_lock(&ctx->fault_wqh.lock);
289 list_del_init(&uwq.wq.task_list);
290 spin_unlock(&ctx->fault_wqh.lock);
302 spin_lock(&ctx->fault_pending_wqh.lock);
303 /*
304 * No need of list_del_init(), the uwq on the stack
305 * will be freed shortly anyway.
306 */
307 list_del(&uwq.wq.task_list);
308 spin_unlock(&ctx->fault_pending_wqh.lock);
291 }
292
293 /*
294 * ctx may go away after this if the userfault pseudo fd is
295 * already released.
296 */
297 userfaultfd_ctx_put(ctx);
298

--- 41 unchanged lines hidden ---

340 else
341 prev = vma;
342 vma->vm_flags = new_flags;
343 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
344 }
345 up_write(&mm->mmap_sem);
346
347 /*
309 }
310
311 /*
312 * ctx may go away after this if the userfault pseudo fd is
313 * already released.
314 */
315 userfaultfd_ctx_put(ctx);
316

--- 41 unchanged lines hidden ---

358 else
359 prev = vma;
360 vma->vm_flags = new_flags;
361 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
362 }
363 up_write(&mm->mmap_sem);
364
365 /*
348 * After no new page faults can wait on this fault_wqh, flush
366 * After no new page faults can wait on this fault_*wqh, flush
349 * the last page faults that may have been already waiting on
367 * the last page faults that may have been already waiting on
350 * the fault_wqh.
368 * the fault_*wqh.
351 */
369 */
352 spin_lock(&ctx->fault_wqh.lock);
370 spin_lock(&ctx->fault_pending_wqh.lock);
371 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range);
353 __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
372 __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
354 spin_unlock(&ctx->fault_wqh.lock);
373 spin_unlock(&ctx->fault_pending_wqh.lock);
355
356 wake_up_poll(&ctx->fd_wqh, POLLHUP);
357 userfaultfd_ctx_put(ctx);
358 return 0;
359}
360
374
375 wake_up_poll(&ctx->fd_wqh, POLLHUP);
376 userfaultfd_ctx_put(ctx);
377 return 0;
378}
379
361/* fault_wqh.lock must be held by the caller */
362static inline unsigned int find_userfault(struct userfaultfd_ctx *ctx,
363 struct userfaultfd_wait_queue **uwq)
380/* fault_pending_wqh.lock must be held by the caller */
381static inline struct userfaultfd_wait_queue *find_userfault(
382 struct userfaultfd_ctx *ctx)
364{
365 wait_queue_t *wq;
383{
384 wait_queue_t *wq;
366 struct userfaultfd_wait_queue *_uwq;
367 unsigned int ret = 0;
385 struct userfaultfd_wait_queue *uwq;
368
386
369 VM_BUG_ON(!spin_is_locked(&ctx->fault_wqh.lock));
387 VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock));
370
388
371 list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
372 _uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
373 if (_uwq->pending) {
374 ret = POLLIN;
375 if (!uwq)
376 /*
 377 * If there's at least one pending userfault and
378 * we don't care which one it is,
379 * break immediately and leverage the
380 * efficiency of the LIFO walk.
381 */
382 break;
383 /*
384 * If we need to find which one was pending we
385 * keep walking until we find the first not
386 * pending one, so we read() them in FIFO order.
387 */
388 *uwq = _uwq;
389 } else
390 /*
391 * break the loop at the first not pending
 392 * one; there cannot be pending userfaults
 393 * after the first not pending one, because
 394 * all new pending ones are inserted at the
 395 * head and we walk the list in LIFO order.
396 */
397 break;
398 }
399
400 return ret;
389 uwq = NULL;
390 if (!waitqueue_active(&ctx->fault_pending_wqh))
391 goto out;
392 /* walk in reverse to provide FIFO behavior to read userfaults */
393 wq = list_last_entry(&ctx->fault_pending_wqh.task_list,
394 typeof(*wq), task_list);
395 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
396out:
397 return uwq;
401}
402
403static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
404{
405 struct userfaultfd_ctx *ctx = file->private_data;
406 unsigned int ret;
407
408 poll_wait(file, &ctx->fd_wqh, wait);
409
410 switch (ctx->state) {
411 case UFFD_STATE_WAIT_API:
412 return POLLERR;
413 case UFFD_STATE_RUNNING:
414 /*
415 * poll() never guarantees that read won't block.
416 * userfaults can be waken before they're read().
417 */
418 if (unlikely(!(file->f_flags & O_NONBLOCK)))
419 return POLLERR;
398}
399
400static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
401{
402 struct userfaultfd_ctx *ctx = file->private_data;
403 unsigned int ret;
404
405 poll_wait(file, &ctx->fd_wqh, wait);
406
407 switch (ctx->state) {
408 case UFFD_STATE_WAIT_API:
409 return POLLERR;
410 case UFFD_STATE_RUNNING:
411 /*
412 * poll() never guarantees that read won't block.
413 * userfaults can be waken before they're read().
414 */
415 if (unlikely(!(file->f_flags & O_NONBLOCK)))
416 return POLLERR;
420 spin_lock(&ctx->fault_wqh.lock);
421 ret = find_userfault(ctx, NULL);
422 spin_unlock(&ctx->fault_wqh.lock);
417 /*
 418 * Lockless access to see if there are pending faults.
 419 * __pollwait()'s last action is add_wait_queue(), but its
 420 * spin_unlock would allow the waitqueue_active() read to
 421 * pass above the actual list_add inside the
 422 * add_wait_queue() critical section. So use a full
423 * memory barrier to serialize the list_add write of
424 * add_wait_queue() with the waitqueue_active read
425 * below.
426 */
427 ret = 0;
428 smp_mb();
429 if (waitqueue_active(&ctx->fault_pending_wqh))
430 ret = POLLIN;
423 return ret;
424 default:
425 BUG();
426 }
427}
428
429static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
430 struct uffd_msg *msg)
431{
432 ssize_t ret;
433 DECLARE_WAITQUEUE(wait, current);
431 return ret;
432 default:
433 BUG();
434 }
435}
436
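
Seen from userspace, the POLLERR-unless-O_NONBLOCK rule above means a monitor thread must tolerate read() failing with -EAGAIN even right after POLLIN, because a userfault can be woken before it is read. A hypothetical monitor loop (handle_missing_page() is a placeholder resolver, not part of this file):

#include <poll.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

void handle_missing_page(int uffd, __u64 address);	/* hypothetical */

static void monitor_loop(int uffd)	/* uffd was opened with O_NONBLOCK */
{
	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
	struct uffd_msg msg;

	while (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
		/* POLLIN is only a hint: the fault may already be gone */
		if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
			continue;
		if (msg.event == UFFD_EVENT_PAGEFAULT)
			handle_missing_page(uffd, msg.arg.pagefault.address);
	}
}
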
437static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
438 struct uffd_msg *msg)
439{
440 ssize_t ret;
441 DECLARE_WAITQUEUE(wait, current);
434 struct userfaultfd_wait_queue *uwq = NULL;
442 struct userfaultfd_wait_queue *uwq;
435
443
436 /* always take the fd_wqh lock before the fault_wqh lock */
444 /* always take the fd_wqh lock before the fault_pending_wqh lock */
437 spin_lock(&ctx->fd_wqh.lock);
438 __add_wait_queue(&ctx->fd_wqh, &wait);
439 for (;;) {
440 set_current_state(TASK_INTERRUPTIBLE);
445 spin_lock(&ctx->fd_wqh.lock);
446 __add_wait_queue(&ctx->fd_wqh, &wait);
447 for (;;) {
448 set_current_state(TASK_INTERRUPTIBLE);
441 spin_lock(&ctx->fault_wqh.lock);
442 if (find_userfault(ctx, &uwq)) {
449 spin_lock(&ctx->fault_pending_wqh.lock);
450 uwq = find_userfault(ctx);
451 if (uwq) {
443 /*
452 /*
 444 * The fault_wqh.lock prevents the uwq from
 445 * disappearing from under us.
 453 * The fault_pending_wqh.lock prevents the uwq
 454 * from disappearing from under us.
455 *
456 * Refile this userfault from
457 * fault_pending_wqh to fault_wqh, it's not
458 * pending anymore after we read it.
459 *
460 * Use list_del() by hand (as
461 * userfaultfd_wake_function also uses
462 * list_del_init() by hand) to be sure nobody
463 * changes __remove_wait_queue() to use
 464 * list_del_init(), which would in turn break the
465 * !list_empty_careful() check in
466 * handle_userfault(). The uwq->wq.task_list
467 * must never be empty at any time during the
468 * refile, or the waitqueue could disappear
469 * from under us. The "wait_queue_head_t"
470 * parameter of __remove_wait_queue() is unused
471 * anyway.
446 */
472 */
447 uwq->pending = false;
473 list_del(&uwq->wq.task_list);
474 __add_wait_queue(&ctx->fault_wqh, &uwq->wq);
475
448 /* careful to always initialize msg if ret == 0 */
449 *msg = uwq->msg;
476 /* careful to always initialize msg if ret == 0 */
477 *msg = uwq->msg;
450 spin_unlock(&ctx->fault_wqh.lock);
478 spin_unlock(&ctx->fault_pending_wqh.lock);
451 ret = 0;
452 break;
453 }
479 ret = 0;
480 break;
481 }
454 spin_unlock(&ctx->fault_wqh.lock);
482 spin_unlock(&ctx->fault_pending_wqh.lock);
455 if (signal_pending(current)) {
456 ret = -ERESTARTSYS;
457 break;
458 }
459 if (no_wait) {
460 ret = -EAGAIN;
461 break;
462 }

--- 42 unchanged lines hidden ---

505static void __wake_userfault(struct userfaultfd_ctx *ctx,
506 struct userfaultfd_wake_range *range)
507{
508 unsigned long start, end;
509
510 start = range->start;
511 end = range->start + range->len;
512
483 if (signal_pending(current)) {
484 ret = -ERESTARTSYS;
485 break;
486 }
487 if (no_wait) {
488 ret = -EAGAIN;
489 break;
490 }

--- 42 unchanged lines hidden ---
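
The refile performed in the hunk above boils down to a small pattern: while only fault_pending_wqh.lock is held, the entry is unlinked with a plain list_del() and immediately re-linked onto fault_wqh, so its list pointers never look empty to list_empty_careful(). A distilled, illustrative form (refile_uwq() is not a function in this file):

static void refile_uwq(struct userfaultfd_ctx *ctx,
		       struct userfaultfd_wait_queue *uwq)
{
	/* caller holds ctx->fault_pending_wqh.lock, exactly as
	   userfaultfd_ctx_read() does around its list_del() above */
	list_del(&uwq->wq.task_list);	/* plain list_del(), never _init */
	__add_wait_queue(&ctx->fault_wqh, &uwq->wq);
}
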

533static void __wake_userfault(struct userfaultfd_ctx *ctx,
534 struct userfaultfd_wake_range *range)
535{
536 unsigned long start, end;
537
538 start = range->start;
539 end = range->start + range->len;
540
513 spin_lock(&ctx->fault_wqh.lock);
541 spin_lock(&ctx->fault_pending_wqh.lock);
514 /* wake all in the range and autoremove */
542 /* wake all in the range and autoremove */
515 __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
516 spin_unlock(&ctx->fault_wqh.lock);
543 if (waitqueue_active(&ctx->fault_pending_wqh))
544 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0,
545 range);
546 if (waitqueue_active(&ctx->fault_wqh))
547 __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
548 spin_unlock(&ctx->fault_pending_wqh.lock);
517}
518
519static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
520 struct userfaultfd_wake_range *range)
521{
522 /*
523 * To be sure waitqueue_active() is not reordered by the CPU
524 * before the pagetable update, use an explicit SMP memory

--- 4 unchanged lines hidden ---

529 smp_mb();
530
531 /*
 532 * Use waitqueue_active because it's very common to
 533 * change the address space atomically even if there are no
 534 * userfaults yet. So we take the spinlock only when we're
 535 * sure we have userfaults to wake.
536 */
549}
550
551static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
552 struct userfaultfd_wake_range *range)
553{
554 /*
555 * To be sure waitqueue_active() is not reordered by the CPU
556 * before the pagetable update, use an explicit SMP memory

--- 4 unchanged lines hidden ---

561 smp_mb();
562
563 /*
 564 * Use waitqueue_active because it's very common to
 565 * change the address space atomically even if there are no
 566 * userfaults yet. So we take the spinlock only when we're
 567 * sure we have userfaults to wake.
568 */
537 if (waitqueue_active(&ctx->fault_wqh))
569 if (waitqueue_active(&ctx->fault_pending_wqh) ||
570 waitqueue_active(&ctx->fault_wqh))
538 __wake_userfault(ctx, range);
539}
540
541static __always_inline int validate_range(struct mm_struct *mm,
542 __u64 start, __u64 len)
543{
544 __u64 task_size = mm->task_size;
545

--- 409 unchanged lines hidden ---

955#ifdef CONFIG_PROC_FS
956static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
957{
958 struct userfaultfd_ctx *ctx = f->private_data;
959 wait_queue_t *wq;
960 struct userfaultfd_wait_queue *uwq;
961 unsigned long pending = 0, total = 0;
962
571 __wake_userfault(ctx, range);
572}
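
The usual trigger for wake_userfault() is the resolving side of the protocol: after userspace installs the missing page, the same address range is woken so the faulting task can retry. A hypothetical userspace resolver, assuming the UFFDIO_COPY ioctl from the same patch series (its kernel side sits in the lines hidden from this diff):

#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static int resolve_missing(int uffd, __u64 fault_addr, void *src,
			   size_t page_size)
{
	struct uffdio_copy copy = {
		.dst = fault_addr & ~((__u64)page_size - 1),
		.src = (unsigned long)src,
		.len = page_size,
		.mode = 0,	/* 0 wakes the range; DONTWAKE would defer it */
	};

	/* on success the kernel ends up waking this range, as above */
	return ioctl(uffd, UFFDIO_COPY, &copy);
}
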
573
574static __always_inline int validate_range(struct mm_struct *mm,
575 __u64 start, __u64 len)
576{
577 __u64 task_size = mm->task_size;
578

--- 409 unchanged lines hidden ---

988#ifdef CONFIG_PROC_FS
989static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
990{
991 struct userfaultfd_ctx *ctx = f->private_data;
992 wait_queue_t *wq;
993 struct userfaultfd_wait_queue *uwq;
994 unsigned long pending = 0, total = 0;
995
963 spin_lock(&ctx->fault_wqh.lock);
996 spin_lock(&ctx->fault_pending_wqh.lock);
997 list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) {
998 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
999 pending++;
1000 total++;
1001 }
964 list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
965 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
1002 list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) {
1003 uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
966 if (uwq->pending)
967 pending++;
968 total++;
969 }
1004 total++;
1005 }
970 spin_unlock(&ctx->fault_wqh.lock);
1006 spin_unlock(&ctx->fault_pending_wqh.lock);
971
972 /*
 973 * If more protocols are added, they will all be shown
974 * separated by a space. Like this:
975 * protocols: aa:... bb:...
976 */
977 seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
978 pending, total, UFFD_API, UFFD_API_FEATURES,

--- 43 unchanged lines hidden ---

1022 goto out;
1023
1024 file = ERR_PTR(-ENOMEM);
1025 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
1026 if (!ctx)
1027 goto out;
1028
1029 atomic_set(&ctx->refcount, 1);
1007
1008 /*
 1009 * If more protocols are added, they will all be shown
1010 * separated by a space. Like this:
1011 * protocols: aa:... bb:...
1012 */
1013 seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
1014 pending, total, UFFD_API, UFFD_API_FEATURES,

--- 43 unchanged lines hidden ---
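
For reference, the seq_printf() format above produces an fdinfo entry shaped roughly as follows; "aa" is UFFD_API from the uapi header, the feature field is shown as 0 assuming no optional features, and the final ioctl bitmask is left as a placeholder since its value comes from the hidden part of the function:

pending:	1
total:	3
API:	aa:0:<supported ioctl bitmask>
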

1058 goto out;
1059
1060 file = ERR_PTR(-ENOMEM);
1061 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
1062 if (!ctx)
1063 goto out;
1064
1065 atomic_set(&ctx->refcount, 1);
1066 init_waitqueue_head(&ctx->fault_pending_wqh);
1030 init_waitqueue_head(&ctx->fault_wqh);
1031 init_waitqueue_head(&ctx->fd_wqh);
1032 ctx->flags = flags;
1033 ctx->state = UFFD_STATE_WAIT_API;
1034 ctx->released = false;
1035 ctx->mm = current->mm;
 1036 /* prevent the mm struct from being freed */
1037 atomic_inc(&ctx->mm->mm_users);

--- 33 unchanged lines hidden ---
1067 init_waitqueue_head(&ctx->fault_wqh);
1068 init_waitqueue_head(&ctx->fd_wqh);
1069 ctx->flags = flags;
1070 ctx->state = UFFD_STATE_WAIT_API;
1071 ctx->released = false;
1072 ctx->mm = current->mm;
 1073 /* prevent the mm struct from being freed */
1074 atomic_inc(&ctx->mm->mm_users);

--- 33 unchanged lines hidden ---
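
Finally, the UFFD_STATE_WAIT_API initial state set above is what forces userspace to complete the UFFDIO_API handshake before the descriptor becomes usable (until then poll() returns POLLERR). A minimal, hypothetical setup sequence, assuming the uapi definitions from <linux/userfaultfd.h> and a libc that exposes __NR_userfaultfd:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/userfaultfd.h>

/* hypothetical helper: create a userfaultfd and register one region */
static int uffd_setup(void *area, size_t len)
{
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = len },
		.mode = UFFDIO_REGISTER_MODE_MISSING,
	};

	if (uffd < 0)
		return -1;
	/* UFFDIO_API moves the context from WAIT_API to RUNNING */
	if (ioctl(uffd, UFFDIO_API, &api) || ioctl(uffd, UFFDIO_REGISTER, &reg)) {
		close(uffd);
		return -1;
	}
	return uffd;
}
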