userfaultfd.c (before: ba85c702e4b247393ffe9e3fbc13d8aee7b02059) | userfaultfd.c (after: 15b726ef048b31a24b3fefb6863083a25fe34800) |
---|---|
1/* 2 * fs/userfaultfd.c 3 * 4 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> 5 * Copyright (C) 2008-2009 Red Hat, Inc. 6 * Copyright (C) 2015 Red Hat, Inc. 7 * 8 * This work is licensed under the terms of the GNU GPL, version 2. See --- 21 unchanged lines hidden (view full) --- 30enum userfaultfd_state { 31 UFFD_STATE_WAIT_API, 32 UFFD_STATE_RUNNING, 33}; 34 35struct userfaultfd_ctx { 36 /* pseudo fd refcounting */ 37 atomic_t refcount; | 1/* 2 * fs/userfaultfd.c 3 * 4 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org> 5 * Copyright (C) 2008-2009 Red Hat, Inc. 6 * Copyright (C) 2015 Red Hat, Inc. 7 * 8 * This work is licensed under the terms of the GNU GPL, version 2. See --- 21 unchanged lines hidden (view full) --- 30enum userfaultfd_state { 31 UFFD_STATE_WAIT_API, 32 UFFD_STATE_RUNNING, 33}; 34 35struct userfaultfd_ctx { 36 /* pseudo fd refcounting */ 37 atomic_t refcount; |
38 /* waitqueue head for the userfaultfd page faults */ | 38 /* waitqueue head for the pending (i.e. not read) userfaults */ 39 wait_queue_head_t fault_pending_wqh; 40 /* waitqueue head for the userfaults */ |
39 wait_queue_head_t fault_wqh; 40 /* waitqueue head for the pseudo fd to wakeup poll/read */ 41 wait_queue_head_t fd_wqh; 42 /* userfaultfd syscall flags */ 43 unsigned int flags; 44 /* state machine */ 45 enum userfaultfd_state state; 46 /* released */ 47 bool released; 48 /* mm with one ore more vmas attached to this userfaultfd_ctx */ 49 struct mm_struct *mm; 50}; 51 52struct userfaultfd_wait_queue { 53 struct uffd_msg msg; 54 wait_queue_t wq; | 41 wait_queue_head_t fault_wqh; 42 /* waitqueue head for the pseudo fd to wakeup poll/read */ 43 wait_queue_head_t fd_wqh; 44 /* userfaultfd syscall flags */ 45 unsigned int flags; 46 /* state machine */ 47 enum userfaultfd_state state; 48 /* released */ 49 bool released; 50 /* mm with one ore more vmas attached to this userfaultfd_ctx */ 51 struct mm_struct *mm; 52}; 53 54struct userfaultfd_wait_queue { 55 struct uffd_msg msg; 56 wait_queue_t wq; |
55 /* 56 * Only relevant when queued in fault_wqh and only used by the 57 * read operation to avoid reading the same userfault twice. 58 */ 59 bool pending; | |
60 struct userfaultfd_ctx *ctx; 61}; 62 63struct userfaultfd_wake_range { 64 unsigned long start; 65 unsigned long len; 66}; 67 --- 190 unchanged lines hidden (view full) --- 258 userfaultfd_ctx_get(ctx); 259 260 /* be gentle and immediately relinquish the mmap_sem */ 261 up_read(&mm->mmap_sem); 262 263 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); 264 uwq.wq.private = current; 265 uwq.msg = userfault_msg(address, flags, reason); | 57 struct userfaultfd_ctx *ctx; 58}; 59 60struct userfaultfd_wake_range { 61 unsigned long start; 62 unsigned long len; 63}; 64 --- 190 unchanged lines hidden (view full) --- 255 userfaultfd_ctx_get(ctx); 256 257 /* be gentle and immediately relinquish the mmap_sem */ 258 up_read(&mm->mmap_sem); 259 260 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); 261 uwq.wq.private = current; 262 uwq.msg = userfault_msg(address, flags, reason); |
266 uwq.pending = true; | |
267 uwq.ctx = ctx; 268 | 263 uwq.ctx = ctx; 264 |
269 spin_lock(&ctx->fault_wqh.lock); | 265 spin_lock(&ctx->fault_pending_wqh.lock); |
270 /* 271 * After the __add_wait_queue the uwq is visible to userland 272 * through poll/read(). 273 */ | 266 /* 267 * After the __add_wait_queue the uwq is visible to userland 268 * through poll/read(). 269 */ |
274 __add_wait_queue(&ctx->fault_wqh, &uwq.wq); | 270 __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq); 271 /* 272 * The smp_mb() after __set_current_state prevents the reads 273 * following the spin_unlock to happen before the list_add in 274 * __add_wait_queue. 275 */ |
275 set_current_state(TASK_KILLABLE); | 276 set_current_state(TASK_KILLABLE); |
276 spin_unlock(&ctx->fault_wqh.lock); | 277 spin_unlock(&ctx->fault_pending_wqh.lock); |
277 278 if (likely(!ACCESS_ONCE(ctx->released) && 279 !fatal_signal_pending(current))) { 280 wake_up_poll(&ctx->fd_wqh, POLLIN); 281 schedule(); 282 ret |= VM_FAULT_MAJOR; 283 } 284 285 __set_current_state(TASK_RUNNING); | 278 279 if (likely(!ACCESS_ONCE(ctx->released) && 280 !fatal_signal_pending(current))) { 281 wake_up_poll(&ctx->fd_wqh, POLLIN); 282 schedule(); 283 ret |= VM_FAULT_MAJOR; 284 } 285 286 __set_current_state(TASK_RUNNING); |
286 /* see finish_wait() comment for why list_empty_careful() */ | 287 288 /* 289 * Here we race with the list_del; list_add in 290 * userfaultfd_ctx_read(), however because we don't ever run 291 * list_del_init() to refile across the two lists, the prev 292 * and next pointers will never point to self. list_add also 293 * would never let any of the two pointers to point to 294 * self. So list_empty_careful won't risk to see both pointers 295 * pointing to self at any time during the list refile. The 296 * only case where list_del_init() is called is the full 297 * removal in the wake function and there we don't re-list_add 298 * and it's fine not to block on the spinlock. The uwq on this 299 * kernel stack can be released after the list_del_init. 300 */ |
287 if (!list_empty_careful(&uwq.wq.task_list)) { | 301 if (!list_empty_careful(&uwq.wq.task_list)) { |
288 spin_lock(&ctx->fault_wqh.lock); 289 list_del_init(&uwq.wq.task_list); 290 spin_unlock(&ctx->fault_wqh.lock); | 302 spin_lock(&ctx->fault_pending_wqh.lock); 303 /* 304 * No need of list_del_init(), the uwq on the stack 305 * will be freed shortly anyway. 306 */ 307 list_del(&uwq.wq.task_list); 308 spin_unlock(&ctx->fault_pending_wqh.lock); |
291 } 292 293 /* 294 * ctx may go away after this if the userfault pseudo fd is 295 * already released. 296 */ 297 userfaultfd_ctx_put(ctx); 298 --- 41 unchanged lines hidden (view full) --- 340 else 341 prev = vma; 342 vma->vm_flags = new_flags; 343 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; 344 } 345 up_write(&mm->mmap_sem); 346 347 /* | 309 } 310 311 /* 312 * ctx may go away after this if the userfault pseudo fd is 313 * already released. 314 */ 315 userfaultfd_ctx_put(ctx); 316 --- 41 unchanged lines hidden (view full) --- 358 else 359 prev = vma; 360 vma->vm_flags = new_flags; 361 vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; 362 } 363 up_write(&mm->mmap_sem); 364 365 /* |
348 * After no new page faults can wait on this fault_wqh, flush | 366 * After no new page faults can wait on this fault_*wqh, flush |
349 * the last page faults that may have been already waiting on | 367 * the last page faults that may have been already waiting on |
350 * the fault_wqh. | 368 * the fault_*wqh. |
351 */ | 369 */ |
352 spin_lock(&ctx->fault_wqh.lock); | 370 spin_lock(&ctx->fault_pending_wqh.lock); 371 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range); |
353 __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range); | 372 __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range); |
354 spin_unlock(&ctx->fault_wqh.lock); | 373 spin_unlock(&ctx->fault_pending_wqh.lock); |
355 356 wake_up_poll(&ctx->fd_wqh, POLLHUP); 357 userfaultfd_ctx_put(ctx); 358 return 0; 359} 360 | 374 375 wake_up_poll(&ctx->fd_wqh, POLLHUP); 376 userfaultfd_ctx_put(ctx); 377 return 0; 378} 379 |
361/* fault_wqh.lock must be hold by the caller */ 362static inline unsigned int find_userfault(struct userfaultfd_ctx *ctx, 363 struct userfaultfd_wait_queue **uwq) | 380/* fault_pending_wqh.lock must be hold by the caller */ 381static inline struct userfaultfd_wait_queue *find_userfault( 382 struct userfaultfd_ctx *ctx) |
364{ 365 wait_queue_t *wq; | 383{ 384 wait_queue_t *wq; |
366 struct userfaultfd_wait_queue *_uwq; 367 unsigned int ret = 0; | 385 struct userfaultfd_wait_queue *uwq; |
368 | 386 |
369 VM_BUG_ON(!spin_is_locked(&ctx->fault_wqh.lock)); | 387 VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock)); |
370 | 388 |
371 list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { 372 _uwq = container_of(wq, struct userfaultfd_wait_queue, wq); 373 if (_uwq->pending) { 374 ret = POLLIN; 375 if (!uwq) 376 /* 377 * If there's at least a pending and 378 * we don't care which one it is, 379 * break immediately and leverage the 380 * efficiency of the LIFO walk. 381 */ 382 break; 383 /* 384 * If we need to find which one was pending we 385 * keep walking until we find the first not 386 * pending one, so we read() them in FIFO order. 387 */ 388 *uwq = _uwq; 389 } else 390 /* 391 * break the loop at the first not pending 392 * one, there cannot be pending userfaults 393 * after the first not pending one, because 394 * all new pending ones are inserted at the 395 * head and we walk it in LIFO. 396 */ 397 break; 398 } 399 400 return ret; | 389 uwq = NULL; 390 if (!waitqueue_active(&ctx->fault_pending_wqh)) 391 goto out; 392 /* walk in reverse to provide FIFO behavior to read userfaults */ 393 wq = list_last_entry(&ctx->fault_pending_wqh.task_list, 394 typeof(*wq), task_list); 395 uwq = container_of(wq, struct userfaultfd_wait_queue, wq); 396out: 397 return uwq; |
401} 402 403static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) 404{ 405 struct userfaultfd_ctx *ctx = file->private_data; 406 unsigned int ret; 407 408 poll_wait(file, &ctx->fd_wqh, wait); 409 410 switch (ctx->state) { 411 case UFFD_STATE_WAIT_API: 412 return POLLERR; 413 case UFFD_STATE_RUNNING: 414 /* 415 * poll() never guarantees that read won't block. 416 * userfaults can be waken before they're read(). 417 */ 418 if (unlikely(!(file->f_flags & O_NONBLOCK))) 419 return POLLERR; | 398} 399 400static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) 401{ 402 struct userfaultfd_ctx *ctx = file->private_data; 403 unsigned int ret; 404 405 poll_wait(file, &ctx->fd_wqh, wait); 406 407 switch (ctx->state) { 408 case UFFD_STATE_WAIT_API: 409 return POLLERR; 410 case UFFD_STATE_RUNNING: 411 /* 412 * poll() never guarantees that read won't block. 413 * userfaults can be waken before they're read(). 414 */ 415 if (unlikely(!(file->f_flags & O_NONBLOCK))) 416 return POLLERR; |
420 spin_lock(&ctx->fault_wqh.lock); 421 ret = find_userfault(ctx, NULL); 422 spin_unlock(&ctx->fault_wqh.lock); | 417 /* 418 * lockless access to see if there are pending faults 419 * __pollwait last action is the add_wait_queue but 420 * the spin_unlock would allow the waitqueue_active to 421 * pass above the actual list_add inside 422 * add_wait_queue critical section. So use a full 423 * memory barrier to serialize the list_add write of 424 * add_wait_queue() with the waitqueue_active read 425 * below. 426 */ 427 ret = 0; 428 smp_mb(); 429 if (waitqueue_active(&ctx->fault_pending_wqh)) 430 ret = POLLIN; |
423 return ret; 424 default: 425 BUG(); 426 } 427} 428 429static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, 430 struct uffd_msg *msg) 431{ 432 ssize_t ret; 433 DECLARE_WAITQUEUE(wait, current); | 431 return ret; 432 default: 433 BUG(); 434 } 435} 436 437static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, 438 struct uffd_msg *msg) 439{ 440 ssize_t ret; 441 DECLARE_WAITQUEUE(wait, current); |
434 struct userfaultfd_wait_queue *uwq = NULL; | 442 struct userfaultfd_wait_queue *uwq; |
435 | 443 |
436 /* always take the fd_wqh lock before the fault_wqh lock */ | 444 /* always take the fd_wqh lock before the fault_pending_wqh lock */ |
437 spin_lock(&ctx->fd_wqh.lock); 438 __add_wait_queue(&ctx->fd_wqh, &wait); 439 for (;;) { 440 set_current_state(TASK_INTERRUPTIBLE); | 445 spin_lock(&ctx->fd_wqh.lock); 446 __add_wait_queue(&ctx->fd_wqh, &wait); 447 for (;;) { 448 set_current_state(TASK_INTERRUPTIBLE); |
441 spin_lock(&ctx->fault_wqh.lock); 442 if (find_userfault(ctx, &uwq)) { | 449 spin_lock(&ctx->fault_pending_wqh.lock); 450 uwq = find_userfault(ctx); 451 if (uwq) { |
443 /* | 452 /* |
444 * The fault_wqh.lock prevents the uwq to 445 * disappear from under us. | 453 * The fault_pending_wqh.lock prevents the uwq 454 * to disappear from under us. 455 * 456 * Refile this userfault from 457 * fault_pending_wqh to fault_wqh, it's not 458 * pending anymore after we read it. 459 * 460 * Use list_del() by hand (as 461 * userfaultfd_wake_function also uses 462 * list_del_init() by hand) to be sure nobody 463 * changes __remove_wait_queue() to use 464 * list_del_init() in turn breaking the 465 * !list_empty_careful() check in 466 * handle_userfault(). The uwq->wq.task_list 467 * must never be empty at any time during the 468 * refile, or the waitqueue could disappear 469 * from under us. The "wait_queue_head_t" 470 * parameter of __remove_wait_queue() is unused 471 * anyway. |
446 */ | 472 */ |
447 uwq->pending = false; | 473 list_del(&uwq->wq.task_list); 474 __add_wait_queue(&ctx->fault_wqh, &uwq->wq); 475 |
448 /* careful to always initialize msg if ret == 0 */ 449 *msg = uwq->msg; | 476 /* careful to always initialize msg if ret == 0 */ 477 *msg = uwq->msg; |
450 spin_unlock(&ctx->fault_wqh.lock); | 478 spin_unlock(&ctx->fault_pending_wqh.lock); |
451 ret = 0; 452 break; 453 } | 479 ret = 0; 480 break; 481 } |
454 spin_unlock(&ctx->fault_wqh.lock); | 482 spin_unlock(&ctx->fault_pending_wqh.lock); |
455 if (signal_pending(current)) { 456 ret = -ERESTARTSYS; 457 break; 458 } 459 if (no_wait) { 460 ret = -EAGAIN; 461 break; 462 } --- 42 unchanged lines hidden (view full) --- 505static void __wake_userfault(struct userfaultfd_ctx *ctx, 506 struct userfaultfd_wake_range *range) 507{ 508 unsigned long start, end; 509 510 start = range->start; 511 end = range->start + range->len; 512 | 483 if (signal_pending(current)) { 484 ret = -ERESTARTSYS; 485 break; 486 } 487 if (no_wait) { 488 ret = -EAGAIN; 489 break; 490 } --- 42 unchanged lines hidden (view full) --- 533static void __wake_userfault(struct userfaultfd_ctx *ctx, 534 struct userfaultfd_wake_range *range) 535{ 536 unsigned long start, end; 537 538 start = range->start; 539 end = range->start + range->len; 540 |
513 spin_lock(&ctx->fault_wqh.lock); | 541 spin_lock(&ctx->fault_pending_wqh.lock); |
514 /* wake all in the range and autoremove */ | 542 /* wake all in the range and autoremove */ |
515 __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range); 516 spin_unlock(&ctx->fault_wqh.lock); | 543 if (waitqueue_active(&ctx->fault_pending_wqh)) 544 __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, 545 range); 546 if (waitqueue_active(&ctx->fault_wqh)) 547 __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range); 548 spin_unlock(&ctx->fault_pending_wqh.lock); |
517} 518 519static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, 520 struct userfaultfd_wake_range *range) 521{ 522 /* 523 * To be sure waitqueue_active() is not reordered by the CPU 524 * before the pagetable update, use an explicit SMP memory --- 4 unchanged lines hidden (view full) --- 529 smp_mb(); 530 531 /* 532 * Use waitqueue_active because it's very frequent to 533 * change the address space atomically even if there are no 534 * userfaults yet. So we take the spinlock only when we're 535 * sure we've userfaults to wake. 536 */ | 549} 550 551static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, 552 struct userfaultfd_wake_range *range) 553{ 554 /* 555 * To be sure waitqueue_active() is not reordered by the CPU 556 * before the pagetable update, use an explicit SMP memory --- 4 unchanged lines hidden (view full) --- 561 smp_mb(); 562 563 /* 564 * Use waitqueue_active because it's very frequent to 565 * change the address space atomically even if there are no 566 * userfaults yet. So we take the spinlock only when we're 567 * sure we've userfaults to wake. 568 */ |
537 if (waitqueue_active(&ctx->fault_wqh)) | 569 if (waitqueue_active(&ctx->fault_pending_wqh) || 570 waitqueue_active(&ctx->fault_wqh)) |
538 __wake_userfault(ctx, range); 539} 540 541static __always_inline int validate_range(struct mm_struct *mm, 542 __u64 start, __u64 len) 543{ 544 __u64 task_size = mm->task_size; 545 --- 409 unchanged lines hidden (view full) --- 955#ifdef CONFIG_PROC_FS 956static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) 957{ 958 struct userfaultfd_ctx *ctx = f->private_data; 959 wait_queue_t *wq; 960 struct userfaultfd_wait_queue *uwq; 961 unsigned long pending = 0, total = 0; 962 | 571 __wake_userfault(ctx, range); 572} 573 574static __always_inline int validate_range(struct mm_struct *mm, 575 __u64 start, __u64 len) 576{ 577 __u64 task_size = mm->task_size; 578 --- 409 unchanged lines hidden (view full) --- 988#ifdef CONFIG_PROC_FS 989static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) 990{ 991 struct userfaultfd_ctx *ctx = f->private_data; 992 wait_queue_t *wq; 993 struct userfaultfd_wait_queue *uwq; 994 unsigned long pending = 0, total = 0; 995 |
963 spin_lock(&ctx->fault_wqh.lock); | 996 spin_lock(&ctx->fault_pending_wqh.lock); 997 list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) { 998 uwq = container_of(wq, struct userfaultfd_wait_queue, wq); 999 pending++; 1000 total++; 1001 } |
964 list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { 965 uwq = container_of(wq, struct userfaultfd_wait_queue, wq); | 1002 list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { 1003 uwq = container_of(wq, struct userfaultfd_wait_queue, wq); |
966 if (uwq->pending) 967 pending++; | |
968 total++; 969 } | 1004 total++; 1005 } |
970 spin_unlock(&ctx->fault_wqh.lock); | 1006 spin_unlock(&ctx->fault_pending_wqh.lock); |
971 972 /* 973 * If more protocols will be added, there will be all shown 974 * separated by a space. Like this: 975 * protocols: aa:... bb:... 976 */ 977 seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", 978 pending, total, UFFD_API, UFFD_API_FEATURES, --- 43 unchanged lines hidden (view full) --- 1022 goto out; 1023 1024 file = ERR_PTR(-ENOMEM); 1025 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 1026 if (!ctx) 1027 goto out; 1028 1029 atomic_set(&ctx->refcount, 1); | 1007 1008 /* 1009 * If more protocols will be added, there will be all shown 1010 * separated by a space. Like this: 1011 * protocols: aa:... bb:... 1012 */ 1013 seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", 1014 pending, total, UFFD_API, UFFD_API_FEATURES, --- 43 unchanged lines hidden (view full) --- 1058 goto out; 1059 1060 file = ERR_PTR(-ENOMEM); 1061 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); 1062 if (!ctx) 1063 goto out; 1064 1065 atomic_set(&ctx->refcount, 1); |
1030 init_waitqueue_head(&ctx->fault_wqh); 1031 init_waitqueue_head(&ctx->fd_wqh); 1032 ctx->flags = flags; 1033 ctx->state = UFFD_STATE_WAIT_API; 1034 ctx->released = false; 1035 ctx->mm = current->mm; 1036 /* prevent the mm struct to be freed */ 1037 atomic_inc(&ctx->mm->mm_users); --- 33 unchanged lines hidden --- | 1066 init_waitqueue_head(&ctx->fault_pending_wqh); 1067 init_waitqueue_head(&ctx->fault_wqh); 1068 init_waitqueue_head(&ctx->fd_wqh); 1069 ctx->flags = flags; 1070 ctx->state = UFFD_STATE_WAIT_API; 1071 ctx->released = false; 1072 ctx->mm = current->mm; 1073 /* prevent the mm struct to be freed */ 1074 atomic_inc(&ctx->mm->mm_users); --- 33 unchanged lines hidden --- |
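
The core of this diff is the move from a single `fault_wqh` whose entries carry a `pending` flag (left column) to two waitqueues: `fault_pending_wqh` for userfaults that have not been read yet, and `fault_wqh` for those already handed to userspace (right column). `read()` no longer walks the queue looking for a pending entry; it takes the tail of `fault_pending_wqh` and refiles it onto `fault_wqh`, so both `poll()` and `read()` become O(1) while userfaults are still reported in FIFO order. The standalone C sketch below only illustrates that refile idea; it is not kernel code, and all names in it are invented for the example.

```c
/*
 * Simplified userspace sketch of the two-queue refile pattern used in
 * this diff (illustrative names only, no kernel list API): new faults
 * are added at the head of a "pending" list, read_one() pops the tail
 * (the oldest entry, giving FIFO order) and moves it to a "handled"
 * list in O(1).
 */
#include <stdio.h>

struct fault {
	unsigned long address;
	struct fault *prev, *next;	/* intrusive doubly linked list */
};

struct fault_list {
	struct fault head;		/* circular list with a dummy head */
};

static void list_init(struct fault_list *l)
{
	l->head.prev = l->head.next = &l->head;
}

static int list_empty(const struct fault_list *l)
{
	return l->head.next == &l->head;
}

static void list_add_head(struct fault_list *l, struct fault *f)
{
	/* insert right after the dummy head, like __add_wait_queue() */
	f->next = l->head.next;
	f->prev = &l->head;
	l->head.next->prev = f;
	l->head.next = f;
}

static void list_del(struct fault *f)
{
	f->prev->next = f->next;
	f->next->prev = f->prev;
}

/* read() analogue: O(1) pick of the oldest pending fault plus refile */
static struct fault *read_one(struct fault_list *pending,
			      struct fault_list *handled)
{
	struct fault *f;

	if (list_empty(pending))
		return NULL;
	f = pending->head.prev;		/* tail == oldest entry */
	list_del(f);
	list_add_head(handled, f);
	return f;
}

int main(void)
{
	struct fault_list pending, handled;
	struct fault a = { .address = 0x1000 }, b = { .address = 0x2000 };
	struct fault *f;

	list_init(&pending);
	list_init(&handled);
	list_add_head(&pending, &a);
	list_add_head(&pending, &b);

	while ((f = read_one(&pending, &handled)))
		printf("read userfault at %#lx\n", f->address);	/* a, then b */
	return 0;
}
```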
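Several of the added comments (the smp_mb() note after __add_wait_queue() in handle_userfault(), the lockless waitqueue_active() check in userfaultfd_poll(), and the barrier kept in wake_userfault()) describe one and the same lost-wakeup pattern: the sleeper queues itself and then re-checks the condition, while the waker updates the condition and then checks whether anyone is queued, with a full memory barrier on each side so that at least one of the two re-checks observes the other side's store. The following is a hedged userspace rendering of that ordering argument with C11 fences and pthreads; the variable names are invented and it models only the ordering, not the kernel wait/wake machinery.

```c
/*
 * Userspace model of the smp_mb() pairing in this diff: the waiter
 * publishes that it is queued and re-checks the condition, the waker
 * publishes the condition and checks for a queued waiter. With a full
 * fence on each side, "waiter saw no condition AND waker saw no
 * waiter" is impossible, so a wakeup cannot be lost. Illustrative
 * names only, not kernel symbols.
 */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool queued;	/* models waitqueue_active() being true */
static atomic_bool condition;	/* models "the page tables were updated" */
static atomic_bool woken;	/* models the explicit wake-up */

static void *waiter(void *arg)
{
	(void)arg;
	/* like __add_wait_queue() + set_current_state() */
	atomic_store_explicit(&queued, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() */
	if (atomic_load_explicit(&condition, memory_order_relaxed))
		return NULL;		/* condition already satisfied */
	/* like schedule(): wait until the waker notices us */
	while (!atomic_load_explicit(&woken, memory_order_acquire))
		sched_yield();
	return NULL;
}

static void *waker(void *arg)
{
	(void)arg;
	/* like installing the page table entry before waking */
	atomic_store_explicit(&condition, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() */
	if (atomic_load_explicit(&queued, memory_order_relaxed))
		atomic_store_explicit(&woken, true, memory_order_release);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, waiter, NULL);
	pthread_create(&b, NULL, waker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	puts("waiter finished without a lost wakeup");
	return 0;
}
```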
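For context on what drives these poll()/read() paths from the other side of the fd: a userspace monitor thread polls the userfaultfd, reads a struct uffd_msg and resolves the missing page, typically with UFFDIO_COPY. The sketch below follows the userfaultfd API as exposed by linux/userfaultfd.h and its man page; it is an assumption-laden example rather than code from this file, and most error handling is omitted.

```c
/*
 * Minimal userfaultfd monitor sketch: one thread touches a registered
 * region, the main thread poll()s the uffd, read()s the uffd_msg and
 * resolves the missing page with UFFDIO_COPY. Error handling is
 * mostly omitted; this only shows the control flow that feeds the
 * kernel-side fault_pending_wqh/fault_wqh machinery.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <poll.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static char *area;
static long page_size;

static void *toucher(void *arg)
{
	(void)arg;
	/* this first access raises a userfault; it completes only after
	 * the monitor loop below performs UFFDIO_COPY */
	printf("area[0] = 0x%02x\n", (unsigned char)area[0]);
	return NULL;
}

int main(void)
{
	page_size = sysconf(_SC_PAGESIZE);

	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) { perror("userfaultfd"); return 1; }

	struct uffdio_api api = { .api = UFFD_API };
	if (ioctl(uffd, UFFDIO_API, &api)) { perror("UFFDIO_API"); return 1; }

	area = mmap(NULL, 16 * page_size, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = 16 * page_size },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg)) { perror("UFFDIO_REGISTER"); return 1; }

	/* page contents to inject whenever a missing page faults */
	char *src = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	memset(src, 0x5a, page_size);

	pthread_t t;
	pthread_create(&t, NULL, toucher, NULL);

	for (;;) {
		struct pollfd pfd = { .fd = uffd, .events = POLLIN };
		if (poll(&pfd, 1, -1) < 0)
			break;

		struct uffd_msg msg;
		if (read(uffd, &msg, sizeof(msg)) != (ssize_t)sizeof(msg))
			continue;	/* nothing to read (O_NONBLOCK), retry */
		if (msg.event != UFFD_EVENT_PAGEFAULT)
			continue;

		struct uffdio_copy copy = {
			.dst = msg.arg.pagefault.address &
			       ~((unsigned long long)page_size - 1),
			.src = (unsigned long)src,
			.len = page_size,
		};
		if (ioctl(uffd, UFFDIO_COPY, &copy) == 0)
			break;		/* one fault resolved, done here */
	}

	pthread_join(t, NULL);
	return 0;
}
```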