// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/slab.h>
#include <linux/sched/task.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * PI code:
 */

/*
 * Make sure current->pi_state_cache holds a pre-allocated pi_state so
 * that the atomic attach paths (which run under hb->lock) never need
 * to allocate. Returns 0 on success, -ENOMEM on allocation failure.
 */
int refill_pi_state_cache(void)
{
	struct futex_pi_state *pi_state;

	if (likely(current->pi_state_cache))
		return 0;

	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);

	if (!pi_state)
		return -ENOMEM;

	INIT_LIST_HEAD(&pi_state->list);
	/* pi_mutex gets initialized later */
	pi_state->owner = NULL;
	refcount_set(&pi_state->refcount, 1);
	pi_state->key = FUTEX_KEY_INIT;

	current->pi_state_cache = pi_state;

	return 0;
}

/*
 * Hand out the cached pi_state. refill_pi_state_cache() must have been
 * called beforehand, hence the WARN_ON() on an empty cache.
 */
static struct futex_pi_state *alloc_pi_state(void)
{
	struct futex_pi_state *pi_state = current->pi_state_cache;

	WARN_ON(!pi_state);
	current->pi_state_cache = NULL;

	return pi_state;
}

/*
 * Move @pi_state from the current owner's pi_state_list to
 * @new_owner's list and update pi_state->owner. Either the old or the
 * new owner may be NULL. Caller must hold pi_mutex.wait_lock; the
 * respective task->pi_lock serializes the list manipulation.
 */
static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}

/*
 * Take a reference on @pi_state. The refcount must already be non-zero,
 * i.e. the caller must be protected against a concurrent final put.
 */
void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}

/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}

/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *	thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid.
 *	The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain arbitrary amount
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */

/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/*
 * Handle the race against an owner task which is exiting or already
 * dead. Returns -EBUSY while the exit is still in progress, -EAGAIN
 * when the user space value changed under us, -EFAULT on a fault and
 * -ESRCH when the state is terminally inconsistent.
 */
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *                                futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *   ...			       attach();
	 *   tsk->futex_state =		     } else {
	 *	FUTEX_STATE_DEAD;	       if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *				         return -EAGAIN;
	 *				       return -ESRCH; <--- FAIL
	 *				     }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}

/*
 * Allocate a fresh pi_state from the cache, make @p the owner of its
 * embedded rtmutex and publish it via *@ps. Caller holds hb->lock and
 * p->pi_lock.
 */
static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}

/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out, whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out.
		 * When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}

/*
 * cmpxchg the user space value from @uval to @newval. Returns 0 on
 * success, -EAGAIN when the value changed concurrently and a negative
 * fault code when the user space access failed.
 */
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	int err;
	u32 curval;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	/* If user space value changed, let the caller retry */
	return curval != uval ? -EAGAIN : 0;
}

/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks.
	 */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
		return -EDEADLK;

	if ((unlikely(should_fail_futex(true))))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
	 * waiters or the owner died bit is set or called from
	 * requeue_cmp_pi or for whatever reason something took the
	 * syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourself to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}

/*
 * Hand the futex and the rtmutex over to the top waiter.
 *
 * Caller must hold a reference on @pi_state and must be called with
 * pi_state->pi_mutex.wait_lock held; this function drops it on all
 * paths.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
	struct rt_mutex_waiter *top_waiter;
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
	if (WARN_ON_ONCE(!top_waiter)) {
		/*
		 * As per the comment in futex_unlock_pi() this should not happen.
		 *
		 * When this happens, give up our locks and try again, giving
		 * the futex_lock_pi() instance time to complete, either by
		 * waiting on the rtmutex or removing itself from the futex
		 * queue.
		 */
		ret = -EAGAIN;
		goto out_unlock;
	}

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We cleanup the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If an unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}

/*
 * Fix up pi_state->owner and the user space TID after a lock steal or
 * a failed acquisition. Called with q->lock_ptr and
 * pi_mutex.wait_lock held; may temporarily drop and reacquire both to
 * handle faults or to reschedule (see handle_err below).
 */
static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;
	int err = 0;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to PI state.
	 */
retry:
	if (!argowner) {
		if (oldowner != current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 0;
		}

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */
			return 1;
		}

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {
			err = -EAGAIN;
			goto handle_err;
		}
	} else {
		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 1;
		}
		newowner = argowner;
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	switch (err) {
	case -EFAULT:
		err = fault_in_user_writeable(uaddr);
		break;

	case -EAGAIN:
		cond_resched();
		err = 0;
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault in succeeded */
	if (!err)
		goto retry;

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wrecked state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

	return err;
}

/*
 * Locked wrapper around __fixup_pi_state_owner(): acquires
 * pi_mutex.wait_lock for the duration of the fixup. Caller must hold
 * q->lock_ptr.
 */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	int ret;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	ret = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}

/**
 * fixup_pi_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Return:
 *  -  1 - success, lock taken;
 *  -  0 - success, lock not taken;
 *  - <0 - on error (-EFAULT)
 */
int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 *
		 * Speculative pi_state->owner read (we don't hold wait_lock);
		 * since we own the lock pi_state->owner == current is the
		 * stable state, anything else needs more attention.
		 */
		if (q->pi_state->owner != current)
			return fixup_pi_state_owner(uaddr, q, current);
		return 1;
	}

	/*
	 * If we didn't get the lock; check if anybody stole it from us. In
	 * that case, we need to fix up the uval to point to them instead of
	 * us, otherwise bad things happen. [10]
	 *
	 * Another speculative read; pi_state->owner == current is unstable
	 * but needs our attention.
	 */
	if (q->pi_state->owner == current)
		return fixup_pi_state_owner(uaddr, q, NULL);

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex. Warn and establish consistent state.
	 */
	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
		return fixup_pi_state_owner(uaddr, q, current);

	return 0;
}

/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.).
 *
 * Also serves as futex trylock_pi()'ing, and due semantics.
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;
	int res, ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

	if (refill_pi_state_cache())
		return -ENOMEM;

	to = futex_setup_timer(time, &timeout, flags, 0);

retry:
	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	hb = futex_q_lock(&q);

	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
				   &exiting, 0);
	if (unlikely(ret)) {
		/*
		 * Atomic work succeeded and we got the lock,
		 * or failed. Either way, we do _not_ block.
		 */
		switch (ret) {
		case 1:
			/* We got the lock. */
			ret = 0;
			goto out_unlock_put_key;
		case -EFAULT:
			goto uaddr_faulted;
		case -EBUSY:
		case -EAGAIN:
			/*
			 * Two reasons for this:
			 * - EBUSY: Task is exiting and we just wait for the
			 *   exit to complete.
			 * - EAGAIN: The user space value changed.
			 */
			futex_q_unlock(hb);
			/*
			 * Handle the case where the owner is in the middle of
			 * exiting. Wait for the exit to complete otherwise
			 * this task might loop forever, aka. live lock.
			 */
			wait_for_owner_exiting(ret, exiting);
			cond_resched();
			goto retry;
		default:
			goto out_unlock_put_key;
		}
	}

	WARN_ON(!q.pi_state);

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__futex_queue(&q, hb);

	if (trylock) {
		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
		goto no_block;
	}

	rt_mutex_init_waiter(&rt_waiter);

	/*
	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
	 * hold it while doing rt_mutex_start_proxy(), because then it will
	 * include hb->lock in the blocking chain, even though we'll not in
	 * fact hold it while blocking. This will lead it to report -EDEADLK
	 * and BUG when futex_unlock_pi() interleaves with this.
	 *
	 * Therefore acquire wait_lock while holding hb->lock, but drop the
	 * latter before calling __rt_mutex_start_proxy_lock(). This
	 * interleaves with futex_unlock_pi() -- which does a similar lock
	 * handoff -- such that the latter can observe the futex_q::pi_state
	 * before __rt_mutex_start_proxy_lock() is done.
	 */
	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
	spin_unlock(q.lock_ptr);
	/*
	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
	 * it sees the futex_q::pi_state.
	 */
	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

	if (ret) {
		if (ret == 1)
			ret = 0;
		goto cleanup;
	}

	if (unlikely(to))
		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
	spin_lock(q.lock_ptr);
	/*
	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
	 * first acquire the hb->lock before removing the lock from the
	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
	 * lists consistent.
	 *
	 * In particular; it is important that futex_unlock_pi() can not
	 * observe this inconsistency.
	 */
	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
		ret = 0;

no_block:
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_pi_owner(uaddr, &q, !ret);
	/*
	 * If fixup_pi_owner() returned an error, propagate that. If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
	if (res)
		ret = (res < 0) ? res : 0;

	futex_unqueue_pi(&q);
	spin_unlock(q.lock_ptr);
	goto out;

out_unlock_put_key:
	futex_q_unlock(hb);

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
	futex_q_unlock(hb);

	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out;

	if (!(flags & FLAGS_SHARED))
		goto retry_private;

	goto retry;
}

/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 */
int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	u32 curval, uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb;
	struct futex_q *top_waiter;
	int ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
	if (ret)
		return ret;

	hb = futex_hash(&key);
	spin_lock(&hb->lock);

	/*
	 * Check waiters first. We do not trust user space values at
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
	top_waiter = futex_top_waiter(hb, &key);
	if (top_waiter) {
		struct futex_pi_state *pi_state = top_waiter->pi_state;

		ret = -EINVAL;
		if (!pi_state)
			goto out_unlock;

		/*
		 * If current does not own the pi_state then the futex is
		 * inconsistent and user space fiddled with the futex value.
		 */
		if (pi_state->owner != current)
			goto out_unlock;

		get_pi_state(pi_state);
		/*
		 * By taking wait_lock while still holding hb->lock, we ensure
		 * there is no point where we hold neither; and therefore
		 * wake_futex_pi() must observe a state consistent with what we
		 * observed.
		 *
		 * In particular; this forces __rt_mutex_start_proxy() to
		 * complete such that we're guaranteed to observe the
		 * rt_waiter. Also see the WARN in wake_futex_pi().
		 */
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		/* drops pi_state->pi_mutex.wait_lock */
		ret = wake_futex_pi(uaddr, uval, pi_state);

		put_pi_state(pi_state);

		/*
		 * Success, we're done! No tricky corner cases.
		 */
		if (!ret)
			return ret;
		/*
		 * The atomic access to the futex value generated a
		 * pagefault, so retry the user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		/*
		 * An unconditional UNLOCK_PI op raced against a waiter
		 * setting the FUTEX_WAITERS bit. Try again.
		 */
		if (ret == -EAGAIN)
			goto pi_retry;
		/*
		 * wake_futex_pi has detected invalid state. Tell user
		 * space.
		 */
		return ret;
	}

	/*
	 * We have no kernel internal state, i.e. no waiters in the
	 * kernel. Waiters which are about to queue themselves are stuck
	 * on hb->lock. So we can safely ignore them. We do neither
	 * preserve the WAITERS bit nor the OWNER_DIED one. We are the
	 * owner.
	 */
	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
		spin_unlock(&hb->lock);
		switch (ret) {
		case -EFAULT:
			goto pi_faulted;

		case -EAGAIN:
			goto pi_retry;

		default:
			WARN_ON_ONCE(1);
			return ret;
		}
	}

	/*
	 * If uval has changed, let user space handle it.
	 */
	ret = (curval == uval) ? 0 : -EAGAIN;

out_unlock:
	spin_unlock(&hb->lock);
	return ret;

pi_retry:
	cond_resched();
	goto retry;

pi_faulted:

	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}