185dc28faSPeter Zijlstra // SPDX-License-Identifier: GPL-2.0-or-later
285dc28faSPeter Zijlstra
385dc28faSPeter Zijlstra #include <linux/slab.h>
485dc28faSPeter Zijlstra #include <linux/sched/task.h>
585dc28faSPeter Zijlstra
685dc28faSPeter Zijlstra #include "futex.h"
785dc28faSPeter Zijlstra #include "../locking/rtmutex_common.h"
885dc28faSPeter Zijlstra
985dc28faSPeter Zijlstra /*
1085dc28faSPeter Zijlstra * PI code:
1185dc28faSPeter Zijlstra */
refill_pi_state_cache(void)1285dc28faSPeter Zijlstra int refill_pi_state_cache(void)
1385dc28faSPeter Zijlstra {
1485dc28faSPeter Zijlstra struct futex_pi_state *pi_state;
1585dc28faSPeter Zijlstra
1685dc28faSPeter Zijlstra if (likely(current->pi_state_cache))
1785dc28faSPeter Zijlstra return 0;
1885dc28faSPeter Zijlstra
1985dc28faSPeter Zijlstra pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
2085dc28faSPeter Zijlstra
2185dc28faSPeter Zijlstra if (!pi_state)
2285dc28faSPeter Zijlstra return -ENOMEM;
2385dc28faSPeter Zijlstra
2485dc28faSPeter Zijlstra INIT_LIST_HEAD(&pi_state->list);
2585dc28faSPeter Zijlstra /* pi_mutex gets initialized later */
2685dc28faSPeter Zijlstra pi_state->owner = NULL;
2785dc28faSPeter Zijlstra refcount_set(&pi_state->refcount, 1);
2885dc28faSPeter Zijlstra pi_state->key = FUTEX_KEY_INIT;
2985dc28faSPeter Zijlstra
3085dc28faSPeter Zijlstra current->pi_state_cache = pi_state;
3185dc28faSPeter Zijlstra
3285dc28faSPeter Zijlstra return 0;
3385dc28faSPeter Zijlstra }
3485dc28faSPeter Zijlstra
alloc_pi_state(void)3585dc28faSPeter Zijlstra static struct futex_pi_state *alloc_pi_state(void)
3685dc28faSPeter Zijlstra {
3785dc28faSPeter Zijlstra struct futex_pi_state *pi_state = current->pi_state_cache;
3885dc28faSPeter Zijlstra
3985dc28faSPeter Zijlstra WARN_ON(!pi_state);
4085dc28faSPeter Zijlstra current->pi_state_cache = NULL;
4185dc28faSPeter Zijlstra
4285dc28faSPeter Zijlstra return pi_state;
4385dc28faSPeter Zijlstra }
4485dc28faSPeter Zijlstra
/*
 * Move @pi_state from its current owner (if any) to @new_owner.
 *
 * Unlinks pi_state->list from the old owner's pi_state_list and links
 * it onto the new owner's list, taking each task's pi_lock around its
 * own list manipulation. Must be called with
 * pi_state->pi_mutex.wait_lock held, which serializes updates of
 * pi_state->owner.
 *
 * NOTE(review): when @new_owner is NULL, only the old owner is
 * detached; pi_state->owner itself is left pointing at the old task
 * and the caller is expected to clear or reassign it.
 */
static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}
6785dc28faSPeter Zijlstra
/*
 * Take an additional reference on @pi_state. The caller must already
 * hold a reference: the count is expected to be non-zero, and hitting
 * zero here is a refcounting bug (hence the WARN).
 */
void get_pi_state(struct futex_pi_state *pi_state)
{
	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
}
7285dc28faSPeter Zijlstra
/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 *
 * On the final put, the state is detached from its owner's
 * pi_state_list and the proxy-locked rtmutex is unlocked, all under
 * pi_mutex.wait_lock. The object is then either freed, or reset and
 * stashed in current->pi_state_cache when that cache slot is empty.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	if (current->pi_state_cache) {
		/* Cache already occupied: just release the memory. */
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}
11185dc28faSPeter Zijlstra
11285dc28faSPeter Zijlstra /*
11385dc28faSPeter Zijlstra * We need to check the following states:
11485dc28faSPeter Zijlstra *
11585dc28faSPeter Zijlstra * Waiter | pi_state | pi->owner | uTID | uODIED | ?
11685dc28faSPeter Zijlstra *
11785dc28faSPeter Zijlstra * [1] NULL | --- | --- | 0 | 0/1 | Valid
11885dc28faSPeter Zijlstra * [2] NULL | --- | --- | >0 | 0/1 | Valid
11985dc28faSPeter Zijlstra *
12085dc28faSPeter Zijlstra * [3] Found | NULL | -- | Any | 0/1 | Invalid
12185dc28faSPeter Zijlstra *
12285dc28faSPeter Zijlstra * [4] Found | Found | NULL | 0 | 1 | Valid
12385dc28faSPeter Zijlstra * [5] Found | Found | NULL | >0 | 1 | Invalid
12485dc28faSPeter Zijlstra *
12585dc28faSPeter Zijlstra * [6] Found | Found | task | 0 | 1 | Valid
12685dc28faSPeter Zijlstra *
12785dc28faSPeter Zijlstra * [7] Found | Found | NULL | Any | 0 | Invalid
12885dc28faSPeter Zijlstra *
12985dc28faSPeter Zijlstra * [8] Found | Found | task | ==taskTID | 0/1 | Valid
13085dc28faSPeter Zijlstra * [9] Found | Found | task | 0 | 0 | Invalid
13185dc28faSPeter Zijlstra * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
13285dc28faSPeter Zijlstra *
13385dc28faSPeter Zijlstra * [1] Indicates that the kernel can acquire the futex atomically. We
13485dc28faSPeter Zijlstra * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
13585dc28faSPeter Zijlstra *
13685dc28faSPeter Zijlstra * [2] Valid, if TID does not belong to a kernel thread. If no matching
13785dc28faSPeter Zijlstra * thread is found then it indicates that the owner TID has died.
13885dc28faSPeter Zijlstra *
13985dc28faSPeter Zijlstra * [3] Invalid. The waiter is queued on a non PI futex
14085dc28faSPeter Zijlstra *
14185dc28faSPeter Zijlstra * [4] Valid state after exit_robust_list(), which sets the user space
14285dc28faSPeter Zijlstra * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
14385dc28faSPeter Zijlstra *
14485dc28faSPeter Zijlstra * [5] The user space value got manipulated between exit_robust_list()
14585dc28faSPeter Zijlstra * and exit_pi_state_list()
14685dc28faSPeter Zijlstra *
14785dc28faSPeter Zijlstra * [6] Valid state after exit_pi_state_list() which sets the new owner in
14885dc28faSPeter Zijlstra * the pi_state but cannot access the user space value.
14985dc28faSPeter Zijlstra *
15085dc28faSPeter Zijlstra * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
15185dc28faSPeter Zijlstra *
15285dc28faSPeter Zijlstra * [8] Owner and user space value match
15385dc28faSPeter Zijlstra *
15485dc28faSPeter Zijlstra * [9] There is no transient state which sets the user space TID to 0
15585dc28faSPeter Zijlstra * except exit_robust_list(), but this is indicated by the
15685dc28faSPeter Zijlstra * FUTEX_OWNER_DIED bit. See [4]
15785dc28faSPeter Zijlstra *
15885dc28faSPeter Zijlstra * [10] There is no transient state which leaves owner and user space
15985dc28faSPeter Zijlstra * TID out of sync. Except one error case where the kernel is denied
16085dc28faSPeter Zijlstra * write access to the user address, see fixup_pi_state_owner().
16185dc28faSPeter Zijlstra *
16285dc28faSPeter Zijlstra *
16385dc28faSPeter Zijlstra * Serialization and lifetime rules:
16485dc28faSPeter Zijlstra *
16585dc28faSPeter Zijlstra * hb->lock:
16685dc28faSPeter Zijlstra *
16785dc28faSPeter Zijlstra * hb -> futex_q, relation
16885dc28faSPeter Zijlstra * futex_q -> pi_state, relation
16985dc28faSPeter Zijlstra *
17085dc28faSPeter Zijlstra * (cannot be raw because hb can contain arbitrary amount
17185dc28faSPeter Zijlstra * of futex_q's)
17285dc28faSPeter Zijlstra *
17385dc28faSPeter Zijlstra * pi_mutex->wait_lock:
17485dc28faSPeter Zijlstra *
17585dc28faSPeter Zijlstra * {uval, pi_state}
17685dc28faSPeter Zijlstra *
17785dc28faSPeter Zijlstra * (and pi_mutex 'obviously')
17885dc28faSPeter Zijlstra *
17985dc28faSPeter Zijlstra * p->pi_lock:
18085dc28faSPeter Zijlstra *
18185dc28faSPeter Zijlstra * p->pi_state_list -> pi_state->list, relation
18285dc28faSPeter Zijlstra * pi_mutex->owner -> pi_state->owner, relation
18385dc28faSPeter Zijlstra *
18485dc28faSPeter Zijlstra * pi_state->refcount:
18585dc28faSPeter Zijlstra *
18685dc28faSPeter Zijlstra * pi_state lifetime
18785dc28faSPeter Zijlstra *
18885dc28faSPeter Zijlstra *
18985dc28faSPeter Zijlstra * Lock order:
19085dc28faSPeter Zijlstra *
19185dc28faSPeter Zijlstra * hb->lock
19285dc28faSPeter Zijlstra * pi_mutex->wait_lock
19385dc28faSPeter Zijlstra * p->pi_lock
19485dc28faSPeter Zijlstra *
19585dc28faSPeter Zijlstra */
19685dc28faSPeter Zijlstra
/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 *
 * Returns 0 on success, with a reference taken on @pi_state and *ps
 * set to it. Otherwise a negative error code:
 *   -EINVAL  inconsistent pi/user-space state (cases [3], [5], [7],
 *            [9], [10] of the state table above)
 *   -EAGAIN  the user space value changed under us; retry
 *   -EFAULT  the user space value could not be read
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}
31985dc28faSPeter Zijlstra
/*
 * Sort out the failed attach to an alleged owner that was not found or
 * is on the way out.
 *
 * @uaddr: user space address of the futex
 * @uval:  user space value read before the owner lookup
 * @tsk:   task found for the owner TID, or NULL when no task exists
 *
 * Returns:
 *   -EBUSY   @tsk is still exiting (futex_state != FUTEX_STATE_DEAD);
 *            the caller must wait for the exit cleanup to finish
 *   -EFAULT  the user space value could not be reread
 *   -EAGAIN  the user space value changed; caller should retry
 *   -ESRCH   the owner is really gone and the user space state is bogus
 */
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *                                futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *   tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *				      ...	attach();
	 *				      tsk->futex_state =	} else {
	 *				      FUTEX_STATE_DEAD;		 if (tsk->futex_state !=
	 *							  FUTEX_STATE_DEAD)
	 *								return -EAGAIN;
	 *							  return -ESRCH; <--- FAIL
	 *							 }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}
37585dc28faSPeter Zijlstra
/*
 * Allocate a fresh pi_state from the per-task cache, initialize its
 * rtmutex proxy-locked with @p as owner, link it onto @p's
 * pi_state_list and publish it via *ps.
 *
 * Callers hold p->pi_lock (see attach_to_pi_owner() and
 * futex_lock_pi_atomic()), which protects the pi_state_list update.
 */
static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}
/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 *
 * Returns 0 on success with *ps set to the newly created pi_state.
 * Otherwise:
 *   -EAGAIN  TID is 0 (paranoid check); caller should retry
 *   -EPERM   the TID belongs to a kernel thread
 *   -EBUSY/-EFAULT/-EAGAIN/-ESRCH  from handle_exit_race(); on -EBUSY
 *            *exiting holds the exiting task with a reference that the
 *            caller must drop after the exit completes.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	/* PI futexes cannot be owned by kernel threads. */
	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out, whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}
47385dc28faSPeter Zijlstra
/*
 * Try to replace the user space futex value @uval with @newval using an
 * atomic cmpxchg.
 *
 * Returns 0 on success, -EAGAIN when the value was concurrently changed
 * by user space, -EFAULT on an (injected or real) fault, or the error
 * from the cmpxchg primitive.
 */
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	u32 curval;
	int err;

	/* Fault-injection hook for exercising the fault paths. */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	if (curval != uval) {
		/* User space value changed, let the caller retry. */
		return -EAGAIN;
	}

	return 0;
}
48985dc28faSPeter Zijlstra
/**
 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
 * @ps:			the pi_state pointer where we store the result of the
 *			lookup
 * @task:		the task to perform the atomic lock work for.  This will
 *			be "current" except in the case of requeue pi.
 * @exiting:		Pointer to store the task pointer of the owner task
 *			which is in the middle of exiting
 * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
 *
 * Return:
 *  -  0 - ready to wait;
 *  -  1 - acquired the lock;
 *  - <0 - error
 *
 * The hb->lock must be held by the caller.
 *
 * @exiting is only set when the return value is -EBUSY. If so, this holds
 * a refcount on the exiting task on return and the caller needs to drop it
 * after waiting for the exit to complete.
 */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	/* Fault-injection hook for the fault paths. */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks: @task already owns the futex in user space.
	 */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
		return -EDEADLK;

	if ((unlikely(should_fail_futex(true))))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
	 * waiters or the owner died bit is set or called from
	 * requeue_cmp_pi or for whatever reason something took the
	 * syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourself to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}
60985dc28faSPeter Zijlstra
61085dc28faSPeter Zijlstra /*
61185dc28faSPeter Zijlstra * Caller must hold a reference on @pi_state.
 *
 * Wake the top waiter on @pi_state->pi_mutex and hand the user space
 * futex word over to it (TID of the new owner | FUTEX_WAITERS). The
 * OWNER_DIED bit is cleared because we, the current owner, are alive.
 *
 * Called with pi_state->pi_mutex.wait_lock held and interrupts disabled;
 * that lock is dropped before returning (see out_unlock).
 *
 * Returns: 0 on success, -EAGAIN when there is no top waiter yet or the
 * user space value raced ahead of us, -EFAULT on a fault at @uaddr, or
 * -EINVAL when user space fiddled with the futex value.
61285dc28faSPeter Zijlstra */
wake_futex_pi(u32 __user * uaddr,u32 uval,struct futex_pi_state * pi_state)61385dc28faSPeter Zijlstra static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
61485dc28faSPeter Zijlstra {
61585dc28faSPeter Zijlstra struct rt_mutex_waiter *top_waiter;
61685dc28faSPeter Zijlstra struct task_struct *new_owner;
61785dc28faSPeter Zijlstra bool postunlock = false;
61885dc28faSPeter Zijlstra DEFINE_RT_WAKE_Q(wqh);
61985dc28faSPeter Zijlstra u32 curval, newval;
62085dc28faSPeter Zijlstra int ret = 0;
62185dc28faSPeter Zijlstra
62285dc28faSPeter Zijlstra top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
62385dc28faSPeter Zijlstra if (WARN_ON_ONCE(!top_waiter)) {
62485dc28faSPeter Zijlstra /*
62585dc28faSPeter Zijlstra * As per the comment in futex_unlock_pi() this should not happen.
62685dc28faSPeter Zijlstra *
62785dc28faSPeter Zijlstra * When this happens, give up our locks and try again, giving
62885dc28faSPeter Zijlstra * the futex_lock_pi() instance time to complete, either by
62985dc28faSPeter Zijlstra * waiting on the rtmutex or removing itself from the futex
63085dc28faSPeter Zijlstra * queue.
63185dc28faSPeter Zijlstra */
63285dc28faSPeter Zijlstra ret = -EAGAIN;
63385dc28faSPeter Zijlstra goto out_unlock;
63485dc28faSPeter Zijlstra }
63585dc28faSPeter Zijlstra
63685dc28faSPeter Zijlstra new_owner = top_waiter->task;
63785dc28faSPeter Zijlstra
63885dc28faSPeter Zijlstra /*
63985dc28faSPeter Zijlstra * We pass it to the next owner. The WAITERS bit is always kept
64085dc28faSPeter Zijlstra * enabled while there is PI state around. We cleanup the owner
64185dc28faSPeter Zijlstra * died bit, because we are the owner.
64285dc28faSPeter Zijlstra */
64385dc28faSPeter Zijlstra newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
64485dc28faSPeter Zijlstra
64585dc28faSPeter Zijlstra if (unlikely(should_fail_futex(true))) {
64685dc28faSPeter Zijlstra ret = -EFAULT;
64785dc28faSPeter Zijlstra goto out_unlock;
64885dc28faSPeter Zijlstra }
64985dc28faSPeter Zijlstra
65085dc28faSPeter Zijlstra ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
65185dc28faSPeter Zijlstra if (!ret && (curval != uval)) {
65285dc28faSPeter Zijlstra /*
65385dc28faSPeter Zijlstra * If an unconditional UNLOCK_PI operation (user space did not
65485dc28faSPeter Zijlstra * try the TID->0 transition) raced with a waiter setting the
65585dc28faSPeter Zijlstra * FUTEX_WAITERS flag between get_user() and locking the hash
65685dc28faSPeter Zijlstra * bucket lock, retry the operation.
65785dc28faSPeter Zijlstra */
65885dc28faSPeter Zijlstra if ((FUTEX_TID_MASK & curval) == uval)
65985dc28faSPeter Zijlstra ret = -EAGAIN;
66085dc28faSPeter Zijlstra else
66185dc28faSPeter Zijlstra ret = -EINVAL;
66285dc28faSPeter Zijlstra }
66385dc28faSPeter Zijlstra
66485dc28faSPeter Zijlstra if (!ret) {
66585dc28faSPeter Zijlstra /*
66685dc28faSPeter Zijlstra * This is a point of no return; once we modified the uval
66785dc28faSPeter Zijlstra * there is no going back and subsequent operations must
66885dc28faSPeter Zijlstra * not fail.
66985dc28faSPeter Zijlstra */
67085dc28faSPeter Zijlstra pi_state_update_owner(pi_state, new_owner);
67185dc28faSPeter Zijlstra postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
67285dc28faSPeter Zijlstra }
67385dc28faSPeter Zijlstra
67485dc28faSPeter Zijlstra out_unlock:
67585dc28faSPeter Zijlstra raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
67685dc28faSPeter Zijlstra
67785dc28faSPeter Zijlstra if (postunlock)
67885dc28faSPeter Zijlstra rt_mutex_postunlock(&wqh);
67985dc28faSPeter Zijlstra
68085dc28faSPeter Zijlstra return ret;
68185dc28faSPeter Zijlstra }
68285dc28faSPeter Zijlstra
/*
 * __fixup_pi_state_owner - Fix up pi_state->owner and the user space TID
 * after an ownership change of the rtmutex.
 * @uaddr:    user address of the futex word
 * @q:        futex_q whose pi_state needs fixing
 * @argowner: current when we (might have) stolen the lock; NULL when
 *            somebody else might have stolen it from us
 *
 * Must be called with q->lock_ptr and pi_state->pi_mutex.wait_lock held;
 * both locks may be dropped and reacquired to handle faults or to
 * reschedule (see handle_err below).
 *
 * Returns: 1 when current ends up owning the rtmutex (or the fixup for
 * current was already done), 0 when there was nothing to fix up for a
 * NULL @argowner, or a negative error code when the user space value
 * could not be repaired (fault_in_user_writeable() failed).
 */
__fixup_pi_state_owner(u32 __user * uaddr,struct futex_q * q,struct task_struct * argowner)68385dc28faSPeter Zijlstra static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
68485dc28faSPeter Zijlstra struct task_struct *argowner)
68585dc28faSPeter Zijlstra {
68685dc28faSPeter Zijlstra struct futex_pi_state *pi_state = q->pi_state;
68785dc28faSPeter Zijlstra struct task_struct *oldowner, *newowner;
68885dc28faSPeter Zijlstra u32 uval, curval, newval, newtid;
68985dc28faSPeter Zijlstra int err = 0;
69085dc28faSPeter Zijlstra
69185dc28faSPeter Zijlstra oldowner = pi_state->owner;
69285dc28faSPeter Zijlstra
69385dc28faSPeter Zijlstra /*
69485dc28faSPeter Zijlstra * We are here because either:
69585dc28faSPeter Zijlstra *
69685dc28faSPeter Zijlstra * - we stole the lock and pi_state->owner needs updating to reflect
69785dc28faSPeter Zijlstra * that (@argowner == current),
69885dc28faSPeter Zijlstra *
69985dc28faSPeter Zijlstra * or:
70085dc28faSPeter Zijlstra *
70185dc28faSPeter Zijlstra * - someone stole our lock and we need to fix things to point to the
70285dc28faSPeter Zijlstra * new owner (@argowner == NULL).
70385dc28faSPeter Zijlstra *
70485dc28faSPeter Zijlstra * Either way, we have to replace the TID in the user space variable.
70585dc28faSPeter Zijlstra * This must be atomic as we have to preserve the owner died bit here.
70685dc28faSPeter Zijlstra *
70785dc28faSPeter Zijlstra * Note: We write the user space value _before_ changing the pi_state
70885dc28faSPeter Zijlstra * because we can fault here. Imagine swapped out pages or a fork
70985dc28faSPeter Zijlstra * that marked all the anonymous memory readonly for cow.
71085dc28faSPeter Zijlstra *
71185dc28faSPeter Zijlstra * Modifying pi_state _before_ the user space value would leave the
71285dc28faSPeter Zijlstra * pi_state in an inconsistent state when we fault here, because we
71385dc28faSPeter Zijlstra * need to drop the locks to handle the fault. This might be observed
71485dc28faSPeter Zijlstra * in the PID checks when attaching to PI state.
71585dc28faSPeter Zijlstra */
71685dc28faSPeter Zijlstra retry:
71785dc28faSPeter Zijlstra if (!argowner) {
71885dc28faSPeter Zijlstra if (oldowner != current) {
71985dc28faSPeter Zijlstra /*
72085dc28faSPeter Zijlstra * We raced against a concurrent self; things are
72185dc28faSPeter Zijlstra * already fixed up. Nothing to do.
72285dc28faSPeter Zijlstra */
72385dc28faSPeter Zijlstra return 0;
72485dc28faSPeter Zijlstra }
72585dc28faSPeter Zijlstra
72685dc28faSPeter Zijlstra if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
72785dc28faSPeter Zijlstra /* We got the lock. pi_state is correct. Tell caller. */
72885dc28faSPeter Zijlstra return 1;
72985dc28faSPeter Zijlstra }
73085dc28faSPeter Zijlstra
73185dc28faSPeter Zijlstra /*
73285dc28faSPeter Zijlstra * The trylock just failed, so either there is an owner or
73385dc28faSPeter Zijlstra * there is a higher priority waiter than this one.
73485dc28faSPeter Zijlstra */
73585dc28faSPeter Zijlstra newowner = rt_mutex_owner(&pi_state->pi_mutex);
73685dc28faSPeter Zijlstra /*
73785dc28faSPeter Zijlstra * If the higher priority waiter has not yet taken over the
73885dc28faSPeter Zijlstra * rtmutex then newowner is NULL. We can't return here with
73985dc28faSPeter Zijlstra * that state because it's inconsistent vs. the user space
74085dc28faSPeter Zijlstra * state. So drop the locks and try again. It's a valid
74185dc28faSPeter Zijlstra * situation and not any different from the other retry
74285dc28faSPeter Zijlstra * conditions.
74385dc28faSPeter Zijlstra */
74485dc28faSPeter Zijlstra if (unlikely(!newowner)) {
74585dc28faSPeter Zijlstra err = -EAGAIN;
74685dc28faSPeter Zijlstra goto handle_err;
74785dc28faSPeter Zijlstra }
74885dc28faSPeter Zijlstra } else {
74985dc28faSPeter Zijlstra WARN_ON_ONCE(argowner != current);
75085dc28faSPeter Zijlstra if (oldowner == current) {
75185dc28faSPeter Zijlstra /*
75285dc28faSPeter Zijlstra * We raced against a concurrent self; things are
75385dc28faSPeter Zijlstra * already fixed up. Nothing to do.
75485dc28faSPeter Zijlstra */
75585dc28faSPeter Zijlstra return 1;
75685dc28faSPeter Zijlstra }
75785dc28faSPeter Zijlstra newowner = argowner;
75885dc28faSPeter Zijlstra }
75985dc28faSPeter Zijlstra
76085dc28faSPeter Zijlstra newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
76185dc28faSPeter Zijlstra /* Owner died? */
76285dc28faSPeter Zijlstra if (!pi_state->owner)
76385dc28faSPeter Zijlstra newtid |= FUTEX_OWNER_DIED;
76485dc28faSPeter Zijlstra
76585dc28faSPeter Zijlstra err = futex_get_value_locked(&uval, uaddr);
76685dc28faSPeter Zijlstra if (err)
76785dc28faSPeter Zijlstra goto handle_err;
76885dc28faSPeter Zijlstra
76985dc28faSPeter Zijlstra for (;;) {
77085dc28faSPeter Zijlstra newval = (uval & FUTEX_OWNER_DIED) | newtid;
77185dc28faSPeter Zijlstra
77285dc28faSPeter Zijlstra err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
77385dc28faSPeter Zijlstra if (err)
77485dc28faSPeter Zijlstra goto handle_err;
77585dc28faSPeter Zijlstra
77685dc28faSPeter Zijlstra if (curval == uval)
77785dc28faSPeter Zijlstra break;
77885dc28faSPeter Zijlstra uval = curval;
77985dc28faSPeter Zijlstra }
78085dc28faSPeter Zijlstra
78185dc28faSPeter Zijlstra /*
78285dc28faSPeter Zijlstra * We fixed up user space. Now we need to fix the pi_state
78385dc28faSPeter Zijlstra * itself.
78485dc28faSPeter Zijlstra */
78585dc28faSPeter Zijlstra pi_state_update_owner(pi_state, newowner);
78685dc28faSPeter Zijlstra
78785dc28faSPeter Zijlstra return argowner == current;
78885dc28faSPeter Zijlstra
78985dc28faSPeter Zijlstra /*
79085dc28faSPeter Zijlstra * In order to reschedule or handle a page fault, we need to drop the
79185dc28faSPeter Zijlstra * locks here. In the case of a fault, this gives the other task
79285dc28faSPeter Zijlstra * (either the highest priority waiter itself or the task which stole
79385dc28faSPeter Zijlstra * the rtmutex) the chance to try the fixup of the pi_state. So once we
79485dc28faSPeter Zijlstra * are back from handling the fault we need to check the pi_state after
79585dc28faSPeter Zijlstra * reacquiring the locks and before trying to do another fixup. When
79685dc28faSPeter Zijlstra * the fixup has been done already we simply return.
79785dc28faSPeter Zijlstra *
79885dc28faSPeter Zijlstra * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
79985dc28faSPeter Zijlstra * drop hb->lock since the caller owns the hb -> futex_q relation.
80085dc28faSPeter Zijlstra * Dropping the pi_mutex->wait_lock requires the state revalidate.
80185dc28faSPeter Zijlstra */
80285dc28faSPeter Zijlstra handle_err:
80385dc28faSPeter Zijlstra raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
80485dc28faSPeter Zijlstra spin_unlock(q->lock_ptr);
80585dc28faSPeter Zijlstra
80685dc28faSPeter Zijlstra switch (err) {
80785dc28faSPeter Zijlstra case -EFAULT:
80885dc28faSPeter Zijlstra err = fault_in_user_writeable(uaddr);
80985dc28faSPeter Zijlstra break;
81085dc28faSPeter Zijlstra
81185dc28faSPeter Zijlstra case -EAGAIN:
81285dc28faSPeter Zijlstra cond_resched();
81385dc28faSPeter Zijlstra err = 0;
81485dc28faSPeter Zijlstra break;
81585dc28faSPeter Zijlstra
81685dc28faSPeter Zijlstra default:
81785dc28faSPeter Zijlstra WARN_ON_ONCE(1);
81885dc28faSPeter Zijlstra break;
81985dc28faSPeter Zijlstra }
82085dc28faSPeter Zijlstra
82185dc28faSPeter Zijlstra spin_lock(q->lock_ptr);
82285dc28faSPeter Zijlstra raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
82385dc28faSPeter Zijlstra
82485dc28faSPeter Zijlstra /*
82585dc28faSPeter Zijlstra * Check if someone else fixed it for us:
82685dc28faSPeter Zijlstra */
82785dc28faSPeter Zijlstra if (pi_state->owner != oldowner)
82885dc28faSPeter Zijlstra return argowner == current;
82985dc28faSPeter Zijlstra
83085dc28faSPeter Zijlstra /* Retry if err was -EAGAIN or the fault-in succeeded */
83185dc28faSPeter Zijlstra if (!err)
83285dc28faSPeter Zijlstra goto retry;
83385dc28faSPeter Zijlstra
83485dc28faSPeter Zijlstra /*
83585dc28faSPeter Zijlstra * fault_in_user_writeable() failed so user state is immutable. At
83685dc28faSPeter Zijlstra * best we can make the kernel state consistent but user state will
83785dc28faSPeter Zijlstra * be most likely hosed and any subsequent unlock operation will be
83885dc28faSPeter Zijlstra * rejected due to PI futex rule [10].
83985dc28faSPeter Zijlstra *
84085dc28faSPeter Zijlstra * Ensure that the rtmutex owner is also the pi_state owner despite
84185dc28faSPeter Zijlstra * the user space value claiming something different. There is no
84285dc28faSPeter Zijlstra * point in unlocking the rtmutex if current is the owner as it
84385dc28faSPeter Zijlstra * would need to wait until the next waiter has taken the rtmutex
84485dc28faSPeter Zijlstra * to guarantee consistent state. Keep it simple. Userspace asked
84585dc28faSPeter Zijlstra * for this wrecked state.
84685dc28faSPeter Zijlstra *
84785dc28faSPeter Zijlstra * The rtmutex has an owner - either current or some other
84885dc28faSPeter Zijlstra * task. See the EAGAIN loop above.
84985dc28faSPeter Zijlstra */
85085dc28faSPeter Zijlstra pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
85185dc28faSPeter Zijlstra
85285dc28faSPeter Zijlstra return err;
85385dc28faSPeter Zijlstra }
85485dc28faSPeter Zijlstra
/*
 * fixup_pi_state_owner - Locked wrapper around __fixup_pi_state_owner().
 *
 * Takes pi_state->pi_mutex.wait_lock around the fixup; the caller must
 * already hold q->lock_ptr (asserted below). Return value is passed
 * through from __fixup_pi_state_owner().
 */
fixup_pi_state_owner(u32 __user * uaddr,struct futex_q * q,struct task_struct * argowner)85585dc28faSPeter Zijlstra static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
85685dc28faSPeter Zijlstra struct task_struct *argowner)
85785dc28faSPeter Zijlstra {
85885dc28faSPeter Zijlstra struct futex_pi_state *pi_state = q->pi_state;
85985dc28faSPeter Zijlstra int ret;
86085dc28faSPeter Zijlstra
86185dc28faSPeter Zijlstra lockdep_assert_held(q->lock_ptr);
86285dc28faSPeter Zijlstra
86385dc28faSPeter Zijlstra raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
86485dc28faSPeter Zijlstra ret = __fixup_pi_state_owner(uaddr, q, argowner);
86585dc28faSPeter Zijlstra raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
86685dc28faSPeter Zijlstra return ret;
86785dc28faSPeter Zijlstra }
86885dc28faSPeter Zijlstra
86985dc28faSPeter Zijlstra /**
87085dc28faSPeter Zijlstra * fixup_pi_owner() - Post lock pi_state and corner case management
87185dc28faSPeter Zijlstra * @uaddr: user address of the futex
87285dc28faSPeter Zijlstra * @q: futex_q (contains pi_state and access to the rt_mutex)
87385dc28faSPeter Zijlstra * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
87485dc28faSPeter Zijlstra *
87585dc28faSPeter Zijlstra * After attempting to lock an rt_mutex, this function is called to cleanup
87685dc28faSPeter Zijlstra * the pi_state owner as well as handle race conditions that may allow us to
87785dc28faSPeter Zijlstra * acquire the lock. Must be called with the hb lock held.
 *
 * The actual fixup is delegated to fixup_pi_state_owner(), which may
 * drop and reacquire the hb lock to handle faults on @uaddr.
87885dc28faSPeter Zijlstra *
87985dc28faSPeter Zijlstra * Return:
88085dc28faSPeter Zijlstra * - 1 - success, lock taken;
88185dc28faSPeter Zijlstra * - 0 - success, lock not taken;
88285dc28faSPeter Zijlstra * - <0 - on error (-EFAULT)
88385dc28faSPeter Zijlstra */
fixup_pi_owner(u32 __user * uaddr,struct futex_q * q,int locked)88485dc28faSPeter Zijlstra int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
88585dc28faSPeter Zijlstra {
88685dc28faSPeter Zijlstra if (locked) {
88785dc28faSPeter Zijlstra /*
88885dc28faSPeter Zijlstra * Got the lock. We might not be the anticipated owner if we
88985dc28faSPeter Zijlstra * did a lock-steal - fix up the PI-state in that case:
89085dc28faSPeter Zijlstra *
89185dc28faSPeter Zijlstra * Speculative pi_state->owner read (we don't hold wait_lock);
89285dc28faSPeter Zijlstra * since we own the lock pi_state->owner == current is the
89385dc28faSPeter Zijlstra * stable state, anything else needs more attention.
89485dc28faSPeter Zijlstra */
89585dc28faSPeter Zijlstra if (q->pi_state->owner != current)
89685dc28faSPeter Zijlstra return fixup_pi_state_owner(uaddr, q, current);
89785dc28faSPeter Zijlstra return 1;
89885dc28faSPeter Zijlstra }
89985dc28faSPeter Zijlstra
90085dc28faSPeter Zijlstra /*
90185dc28faSPeter Zijlstra * If we didn't get the lock; check if anybody stole it from us. In
90285dc28faSPeter Zijlstra * that case, we need to fix up the uval to point to them instead of
90385dc28faSPeter Zijlstra * us, otherwise bad things happen. [10]
90485dc28faSPeter Zijlstra *
90585dc28faSPeter Zijlstra * Another speculative read; pi_state->owner == current is unstable
90685dc28faSPeter Zijlstra * but needs our attention.
90785dc28faSPeter Zijlstra */
90885dc28faSPeter Zijlstra if (q->pi_state->owner == current)
90985dc28faSPeter Zijlstra return fixup_pi_state_owner(uaddr, q, NULL);
91085dc28faSPeter Zijlstra
91185dc28faSPeter Zijlstra /*
91285dc28faSPeter Zijlstra * Paranoia check. If we did not take the lock, then we should not be
91385dc28faSPeter Zijlstra * the owner of the rt_mutex. Warn and establish consistent state.
91485dc28faSPeter Zijlstra */
91585dc28faSPeter Zijlstra if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
91685dc28faSPeter Zijlstra return fixup_pi_state_owner(uaddr, q, current);
91785dc28faSPeter Zijlstra
91885dc28faSPeter Zijlstra return 0;
91985dc28faSPeter Zijlstra }
92085dc28faSPeter Zijlstra
92185dc28faSPeter Zijlstra /*
92285dc28faSPeter Zijlstra * Userspace tried a 0 -> TID atomic transition of the futex value
92385dc28faSPeter Zijlstra * and failed. The kernel side here does the whole locking operation:
92485dc28faSPeter Zijlstra * if there are waiters then it will block as a consequence of relying
92585dc28faSPeter Zijlstra * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
92685dc28faSPeter Zijlstra * a 0 value of the futex too.).
92785dc28faSPeter Zijlstra *
92885dc28faSPeter Zijlstra * Also serves as the futex trylock_pi() operation when @trylock is set:
 * instead of blocking on the rtmutex, a failed trylock returns
 * -EWOULDBLOCK.
 *
 * @time is an absolute timeout (HRTIMER_MODE_ABS below), NULL means wait
 * forever. An -EINTR result from the rtmutex wait is converted to
 * -ERESTARTNOINTR before returning to the syscall layer.
92985dc28faSPeter Zijlstra */
futex_lock_pi(u32 __user * uaddr,unsigned int flags,ktime_t * time,int trylock)93085dc28faSPeter Zijlstra int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
93185dc28faSPeter Zijlstra {
93285dc28faSPeter Zijlstra struct hrtimer_sleeper timeout, *to;
93385dc28faSPeter Zijlstra struct task_struct *exiting = NULL;
93485dc28faSPeter Zijlstra struct rt_mutex_waiter rt_waiter;
93585dc28faSPeter Zijlstra struct futex_hash_bucket *hb;
93685dc28faSPeter Zijlstra struct futex_q q = futex_q_init;
93785dc28faSPeter Zijlstra int res, ret;
93885dc28faSPeter Zijlstra
93985dc28faSPeter Zijlstra if (!IS_ENABLED(CONFIG_FUTEX_PI))
94085dc28faSPeter Zijlstra return -ENOSYS;
94185dc28faSPeter Zijlstra
94285dc28faSPeter Zijlstra if (refill_pi_state_cache())
94385dc28faSPeter Zijlstra return -ENOMEM;
94485dc28faSPeter Zijlstra
94585dc28faSPeter Zijlstra to = futex_setup_timer(time, &timeout, flags, 0);
94685dc28faSPeter Zijlstra
94785dc28faSPeter Zijlstra retry:
94885dc28faSPeter Zijlstra ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
94985dc28faSPeter Zijlstra if (unlikely(ret != 0))
95085dc28faSPeter Zijlstra goto out;
95185dc28faSPeter Zijlstra
95285dc28faSPeter Zijlstra retry_private:
95385dc28faSPeter Zijlstra hb = futex_q_lock(&q);
95485dc28faSPeter Zijlstra
95585dc28faSPeter Zijlstra ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
95685dc28faSPeter Zijlstra &exiting, 0);
95785dc28faSPeter Zijlstra if (unlikely(ret)) {
95885dc28faSPeter Zijlstra /*
95985dc28faSPeter Zijlstra * Atomic work succeeded and we got the lock,
96085dc28faSPeter Zijlstra * or failed. Either way, we do _not_ block.
96185dc28faSPeter Zijlstra */
96285dc28faSPeter Zijlstra switch (ret) {
96385dc28faSPeter Zijlstra case 1:
96485dc28faSPeter Zijlstra /* We got the lock. */
96585dc28faSPeter Zijlstra ret = 0;
96685dc28faSPeter Zijlstra goto out_unlock_put_key;
96785dc28faSPeter Zijlstra case -EFAULT:
96885dc28faSPeter Zijlstra goto uaddr_faulted;
96985dc28faSPeter Zijlstra case -EBUSY:
97085dc28faSPeter Zijlstra case -EAGAIN:
97185dc28faSPeter Zijlstra /*
97285dc28faSPeter Zijlstra * Two reasons for this:
97385dc28faSPeter Zijlstra * - EBUSY: Task is exiting and we just wait for the
97485dc28faSPeter Zijlstra * exit to complete.
97585dc28faSPeter Zijlstra * - EAGAIN: The user space value changed.
97685dc28faSPeter Zijlstra */
97785dc28faSPeter Zijlstra futex_q_unlock(hb);
97885dc28faSPeter Zijlstra /*
97985dc28faSPeter Zijlstra * Handle the case where the owner is in the middle of
98085dc28faSPeter Zijlstra * exiting. Wait for the exit to complete otherwise
98185dc28faSPeter Zijlstra * this task might loop forever, aka. live lock.
98285dc28faSPeter Zijlstra */
98385dc28faSPeter Zijlstra wait_for_owner_exiting(ret, exiting);
98485dc28faSPeter Zijlstra cond_resched();
98585dc28faSPeter Zijlstra goto retry;
98685dc28faSPeter Zijlstra default:
98785dc28faSPeter Zijlstra goto out_unlock_put_key;
98885dc28faSPeter Zijlstra }
98985dc28faSPeter Zijlstra }
99085dc28faSPeter Zijlstra
99185dc28faSPeter Zijlstra WARN_ON(!q.pi_state);
99285dc28faSPeter Zijlstra
99385dc28faSPeter Zijlstra /*
99485dc28faSPeter Zijlstra * Only actually queue now that the atomic ops are done:
99585dc28faSPeter Zijlstra */
99685dc28faSPeter Zijlstra __futex_queue(&q, hb);
99785dc28faSPeter Zijlstra
99885dc28faSPeter Zijlstra if (trylock) {
99985dc28faSPeter Zijlstra ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
100085dc28faSPeter Zijlstra /* Fixup the trylock return value: */
100185dc28faSPeter Zijlstra ret = ret ? 0 : -EWOULDBLOCK;
100285dc28faSPeter Zijlstra goto no_block;
100385dc28faSPeter Zijlstra }
100485dc28faSPeter Zijlstra
100585dc28faSPeter Zijlstra rt_mutex_init_waiter(&rt_waiter);
100685dc28faSPeter Zijlstra
100785dc28faSPeter Zijlstra /*
1008*68290613SSebastian Andrzej Siewior * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
100985dc28faSPeter Zijlstra * hold it while doing rt_mutex_start_proxy(), because then it will
101085dc28faSPeter Zijlstra * include hb->lock in the blocking chain, even through we'll not in
101185dc28faSPeter Zijlstra * fact hold it while blocking. This will lead it to report -EDEADLK
101285dc28faSPeter Zijlstra * and BUG when futex_unlock_pi() interleaves with this.
101385dc28faSPeter Zijlstra *
101485dc28faSPeter Zijlstra * Therefore acquire wait_lock while holding hb->lock, but drop the
101585dc28faSPeter Zijlstra * latter before calling __rt_mutex_start_proxy_lock(). This
101685dc28faSPeter Zijlstra * interleaves with futex_unlock_pi() -- which does a similar lock
101785dc28faSPeter Zijlstra * handoff -- such that the latter can observe the futex_q::pi_state
101885dc28faSPeter Zijlstra * before __rt_mutex_start_proxy_lock() is done.
101985dc28faSPeter Zijlstra */
102085dc28faSPeter Zijlstra raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
102185dc28faSPeter Zijlstra spin_unlock(q.lock_ptr);
102285dc28faSPeter Zijlstra /*
102385dc28faSPeter Zijlstra * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
102485dc28faSPeter Zijlstra * such that futex_unlock_pi() is guaranteed to observe the waiter when
102585dc28faSPeter Zijlstra * it sees the futex_q::pi_state.
102685dc28faSPeter Zijlstra */
102785dc28faSPeter Zijlstra ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
102885dc28faSPeter Zijlstra raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
102985dc28faSPeter Zijlstra
103085dc28faSPeter Zijlstra if (ret) {
103185dc28faSPeter Zijlstra if (ret == 1)
103285dc28faSPeter Zijlstra ret = 0;
103385dc28faSPeter Zijlstra goto cleanup;
103485dc28faSPeter Zijlstra }
103585dc28faSPeter Zijlstra
103685dc28faSPeter Zijlstra if (unlikely(to))
103785dc28faSPeter Zijlstra hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
103885dc28faSPeter Zijlstra
103985dc28faSPeter Zijlstra ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
104085dc28faSPeter Zijlstra
104185dc28faSPeter Zijlstra cleanup:
104285dc28faSPeter Zijlstra spin_lock(q.lock_ptr);
104385dc28faSPeter Zijlstra /*
104485dc28faSPeter Zijlstra * If we failed to acquire the lock (deadlock/signal/timeout), we must
104585dc28faSPeter Zijlstra * first acquire the hb->lock before removing the lock from the
104685dc28faSPeter Zijlstra * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
104785dc28faSPeter Zijlstra * lists consistent.
104885dc28faSPeter Zijlstra *
104985dc28faSPeter Zijlstra * In particular; it is important that futex_unlock_pi() can not
105085dc28faSPeter Zijlstra * observe this inconsistency.
105185dc28faSPeter Zijlstra */
105285dc28faSPeter Zijlstra if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
105385dc28faSPeter Zijlstra ret = 0;
105485dc28faSPeter Zijlstra
105585dc28faSPeter Zijlstra no_block:
105685dc28faSPeter Zijlstra /*
105785dc28faSPeter Zijlstra * Fixup the pi_state owner and possibly acquire the lock if we
105885dc28faSPeter Zijlstra * haven't already.
105985dc28faSPeter Zijlstra */
106085dc28faSPeter Zijlstra res = fixup_pi_owner(uaddr, &q, !ret);
106185dc28faSPeter Zijlstra /*
106285dc28faSPeter Zijlstra * If fixup_pi_owner() returned an error, propagate that. If it acquired
106385dc28faSPeter Zijlstra * the lock, clear our -ETIMEDOUT or -EINTR.
106485dc28faSPeter Zijlstra */
106585dc28faSPeter Zijlstra if (res)
106685dc28faSPeter Zijlstra ret = (res < 0) ? res : 0;
106785dc28faSPeter Zijlstra
106885dc28faSPeter Zijlstra futex_unqueue_pi(&q);
106985dc28faSPeter Zijlstra spin_unlock(q.lock_ptr);
107085dc28faSPeter Zijlstra goto out;
107185dc28faSPeter Zijlstra
107285dc28faSPeter Zijlstra out_unlock_put_key:
107385dc28faSPeter Zijlstra futex_q_unlock(hb);
107485dc28faSPeter Zijlstra
107585dc28faSPeter Zijlstra out:
107685dc28faSPeter Zijlstra if (to) {
107785dc28faSPeter Zijlstra hrtimer_cancel(&to->timer);
107885dc28faSPeter Zijlstra destroy_hrtimer_on_stack(&to->timer);
107985dc28faSPeter Zijlstra }
108085dc28faSPeter Zijlstra return ret != -EINTR ? ret : -ERESTARTNOINTR;
108185dc28faSPeter Zijlstra
108285dc28faSPeter Zijlstra uaddr_faulted:
108385dc28faSPeter Zijlstra futex_q_unlock(hb);
108485dc28faSPeter Zijlstra
108585dc28faSPeter Zijlstra ret = fault_in_user_writeable(uaddr);
108685dc28faSPeter Zijlstra if (ret)
108785dc28faSPeter Zijlstra goto out;
108885dc28faSPeter Zijlstra
108985dc28faSPeter Zijlstra if (!(flags & FLAGS_SHARED))
109085dc28faSPeter Zijlstra goto retry_private;
109185dc28faSPeter Zijlstra
109285dc28faSPeter Zijlstra goto retry;
109385dc28faSPeter Zijlstra }
109485dc28faSPeter Zijlstra
109585dc28faSPeter Zijlstra /*
109685dc28faSPeter Zijlstra * Userspace attempted a TID -> 0 atomic transition, and failed.
109785dc28faSPeter Zijlstra * This is the in-kernel slowpath: we look up the PI state (if any),
109885dc28faSPeter Zijlstra * and do the rt-mutex unlock.
 *
 * Returns 0 on success, -EPERM when current does not own the futex,
 * -EINVAL when the kernel/user state is inconsistent, or -EAGAIN when
 * user space should retry the TID -> 0 transition. Faults on @uaddr
 * are handled by faulting the page in and retrying (pi_faulted).
109985dc28faSPeter Zijlstra */
futex_unlock_pi(u32 __user * uaddr,unsigned int flags)110085dc28faSPeter Zijlstra int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
110185dc28faSPeter Zijlstra {
110285dc28faSPeter Zijlstra u32 curval, uval, vpid = task_pid_vnr(current);
110385dc28faSPeter Zijlstra union futex_key key = FUTEX_KEY_INIT;
110485dc28faSPeter Zijlstra struct futex_hash_bucket *hb;
110585dc28faSPeter Zijlstra struct futex_q *top_waiter;
110685dc28faSPeter Zijlstra int ret;
110785dc28faSPeter Zijlstra
110885dc28faSPeter Zijlstra if (!IS_ENABLED(CONFIG_FUTEX_PI))
110985dc28faSPeter Zijlstra return -ENOSYS;
111085dc28faSPeter Zijlstra
111185dc28faSPeter Zijlstra retry:
111285dc28faSPeter Zijlstra if (get_user(uval, uaddr))
111385dc28faSPeter Zijlstra return -EFAULT;
111485dc28faSPeter Zijlstra /*
111585dc28faSPeter Zijlstra * We release only a lock we actually own:
111685dc28faSPeter Zijlstra */
111785dc28faSPeter Zijlstra if ((uval & FUTEX_TID_MASK) != vpid)
111885dc28faSPeter Zijlstra return -EPERM;
111985dc28faSPeter Zijlstra
112085dc28faSPeter Zijlstra ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
112185dc28faSPeter Zijlstra if (ret)
112285dc28faSPeter Zijlstra return ret;
112385dc28faSPeter Zijlstra
112485dc28faSPeter Zijlstra hb = futex_hash(&key);
112585dc28faSPeter Zijlstra spin_lock(&hb->lock);
112685dc28faSPeter Zijlstra
112785dc28faSPeter Zijlstra /*
112885dc28faSPeter Zijlstra * Check waiters first. We do not trust user space values at
112985dc28faSPeter Zijlstra * all and we at least want to know if user space fiddled
113085dc28faSPeter Zijlstra * with the futex value instead of blindly unlocking.
113185dc28faSPeter Zijlstra */
113285dc28faSPeter Zijlstra top_waiter = futex_top_waiter(hb, &key);
113385dc28faSPeter Zijlstra if (top_waiter) {
113485dc28faSPeter Zijlstra struct futex_pi_state *pi_state = top_waiter->pi_state;
113585dc28faSPeter Zijlstra
113685dc28faSPeter Zijlstra ret = -EINVAL;
113785dc28faSPeter Zijlstra if (!pi_state)
113885dc28faSPeter Zijlstra goto out_unlock;
113985dc28faSPeter Zijlstra
114085dc28faSPeter Zijlstra /*
114185dc28faSPeter Zijlstra * If current does not own the pi_state then the futex is
114285dc28faSPeter Zijlstra * inconsistent and user space fiddled with the futex value.
114385dc28faSPeter Zijlstra */
114485dc28faSPeter Zijlstra if (pi_state->owner != current)
114585dc28faSPeter Zijlstra goto out_unlock;
114685dc28faSPeter Zijlstra
114785dc28faSPeter Zijlstra get_pi_state(pi_state);
114885dc28faSPeter Zijlstra /*
114985dc28faSPeter Zijlstra * By taking wait_lock while still holding hb->lock, we ensure
115085dc28faSPeter Zijlstra * there is no point where we hold neither; and therefore
115185dc28faSPeter Zijlstra * wake_futex_pi() must observe a state consistent with what we
115285dc28faSPeter Zijlstra * observed.
115385dc28faSPeter Zijlstra *
115485dc28faSPeter Zijlstra * In particular; this forces __rt_mutex_start_proxy() to
115585dc28faSPeter Zijlstra * complete such that we're guaranteed to observe the
115685dc28faSPeter Zijlstra * rt_waiter. Also see the WARN in wake_futex_pi().
115785dc28faSPeter Zijlstra */
115885dc28faSPeter Zijlstra raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
115985dc28faSPeter Zijlstra spin_unlock(&hb->lock);
116085dc28faSPeter Zijlstra
116185dc28faSPeter Zijlstra /* drops pi_state->pi_mutex.wait_lock */
116285dc28faSPeter Zijlstra ret = wake_futex_pi(uaddr, uval, pi_state);
116385dc28faSPeter Zijlstra
116485dc28faSPeter Zijlstra put_pi_state(pi_state);
116585dc28faSPeter Zijlstra
116685dc28faSPeter Zijlstra /*
116785dc28faSPeter Zijlstra * Success, we're done! No tricky corner cases.
116885dc28faSPeter Zijlstra */
116985dc28faSPeter Zijlstra if (!ret)
117085dc28faSPeter Zijlstra return ret;
117185dc28faSPeter Zijlstra /*
117285dc28faSPeter Zijlstra * The atomic access to the futex value generated a
117385dc28faSPeter Zijlstra * pagefault, so retry the user-access and the wakeup:
117485dc28faSPeter Zijlstra */
117585dc28faSPeter Zijlstra if (ret == -EFAULT)
117685dc28faSPeter Zijlstra goto pi_faulted;
117785dc28faSPeter Zijlstra /*
117885dc28faSPeter Zijlstra * An unconditional UNLOCK_PI op raced against a waiter
117985dc28faSPeter Zijlstra * setting the FUTEX_WAITERS bit. Try again.
118085dc28faSPeter Zijlstra */
118185dc28faSPeter Zijlstra if (ret == -EAGAIN)
118285dc28faSPeter Zijlstra goto pi_retry;
118385dc28faSPeter Zijlstra /*
118485dc28faSPeter Zijlstra * wake_futex_pi has detected invalid state. Tell user
118585dc28faSPeter Zijlstra * space.
118685dc28faSPeter Zijlstra */
118785dc28faSPeter Zijlstra return ret;
118885dc28faSPeter Zijlstra }
118985dc28faSPeter Zijlstra
119085dc28faSPeter Zijlstra /*
119185dc28faSPeter Zijlstra * We have no kernel internal state, i.e. no waiters in the
119285dc28faSPeter Zijlstra * kernel. Waiters which are about to queue themselves are stuck
119385dc28faSPeter Zijlstra * on hb->lock. So we can safely ignore them. We do neither
119485dc28faSPeter Zijlstra * preserve the WAITERS bit nor the OWNER_DIED one. We are the
119585dc28faSPeter Zijlstra * owner.
119685dc28faSPeter Zijlstra */
119785dc28faSPeter Zijlstra if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
119885dc28faSPeter Zijlstra spin_unlock(&hb->lock);
119985dc28faSPeter Zijlstra switch (ret) {
120085dc28faSPeter Zijlstra case -EFAULT:
120185dc28faSPeter Zijlstra goto pi_faulted;
120285dc28faSPeter Zijlstra
120385dc28faSPeter Zijlstra case -EAGAIN:
120485dc28faSPeter Zijlstra goto pi_retry;
120585dc28faSPeter Zijlstra
120685dc28faSPeter Zijlstra default:
120785dc28faSPeter Zijlstra WARN_ON_ONCE(1);
120885dc28faSPeter Zijlstra return ret;
120985dc28faSPeter Zijlstra }
121085dc28faSPeter Zijlstra }
121185dc28faSPeter Zijlstra
121285dc28faSPeter Zijlstra /*
121385dc28faSPeter Zijlstra * If uval has changed, let user space handle it.
121485dc28faSPeter Zijlstra */
121585dc28faSPeter Zijlstra ret = (curval == uval) ? 0 : -EAGAIN;
121685dc28faSPeter Zijlstra
121785dc28faSPeter Zijlstra out_unlock:
121885dc28faSPeter Zijlstra spin_unlock(&hb->lock);
121985dc28faSPeter Zijlstra return ret;
122085dc28faSPeter Zijlstra
122185dc28faSPeter Zijlstra pi_retry:
122285dc28faSPeter Zijlstra cond_resched();
122385dc28faSPeter Zijlstra goto retry;
122485dc28faSPeter Zijlstra
122585dc28faSPeter Zijlstra pi_faulted:
122685dc28faSPeter Zijlstra
122785dc28faSPeter Zijlstra ret = fault_in_user_writeable(uaddr);
122885dc28faSPeter Zijlstra if (!ret)
122985dc28faSPeter Zijlstra goto retry;
123085dc28faSPeter Zijlstra
123185dc28faSPeter Zijlstra return ret;
123285dc28faSPeter Zijlstra }
123385dc28faSPeter Zijlstra
1234