xref: /openbmc/linux/kernel/futex/pi.c (revision 4f2c0a4acffbec01079c28f839422e64ddeff004)
185dc28faSPeter Zijlstra // SPDX-License-Identifier: GPL-2.0-or-later
285dc28faSPeter Zijlstra 
385dc28faSPeter Zijlstra #include <linux/slab.h>
485dc28faSPeter Zijlstra #include <linux/sched/task.h>
585dc28faSPeter Zijlstra 
685dc28faSPeter Zijlstra #include "futex.h"
785dc28faSPeter Zijlstra #include "../locking/rtmutex_common.h"
885dc28faSPeter Zijlstra 
985dc28faSPeter Zijlstra /*
1085dc28faSPeter Zijlstra  * PI code:
1185dc28faSPeter Zijlstra  */
refill_pi_state_cache(void)1285dc28faSPeter Zijlstra int refill_pi_state_cache(void)
1385dc28faSPeter Zijlstra {
1485dc28faSPeter Zijlstra 	struct futex_pi_state *pi_state;
1585dc28faSPeter Zijlstra 
1685dc28faSPeter Zijlstra 	if (likely(current->pi_state_cache))
1785dc28faSPeter Zijlstra 		return 0;
1885dc28faSPeter Zijlstra 
1985dc28faSPeter Zijlstra 	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
2085dc28faSPeter Zijlstra 
2185dc28faSPeter Zijlstra 	if (!pi_state)
2285dc28faSPeter Zijlstra 		return -ENOMEM;
2385dc28faSPeter Zijlstra 
2485dc28faSPeter Zijlstra 	INIT_LIST_HEAD(&pi_state->list);
2585dc28faSPeter Zijlstra 	/* pi_mutex gets initialized later */
2685dc28faSPeter Zijlstra 	pi_state->owner = NULL;
2785dc28faSPeter Zijlstra 	refcount_set(&pi_state->refcount, 1);
2885dc28faSPeter Zijlstra 	pi_state->key = FUTEX_KEY_INIT;
2985dc28faSPeter Zijlstra 
3085dc28faSPeter Zijlstra 	current->pi_state_cache = pi_state;
3185dc28faSPeter Zijlstra 
3285dc28faSPeter Zijlstra 	return 0;
3385dc28faSPeter Zijlstra }
3485dc28faSPeter Zijlstra 
alloc_pi_state(void)3585dc28faSPeter Zijlstra static struct futex_pi_state *alloc_pi_state(void)
3685dc28faSPeter Zijlstra {
3785dc28faSPeter Zijlstra 	struct futex_pi_state *pi_state = current->pi_state_cache;
3885dc28faSPeter Zijlstra 
3985dc28faSPeter Zijlstra 	WARN_ON(!pi_state);
4085dc28faSPeter Zijlstra 	current->pi_state_cache = NULL;
4185dc28faSPeter Zijlstra 
4285dc28faSPeter Zijlstra 	return pi_state;
4385dc28faSPeter Zijlstra }
4485dc28faSPeter Zijlstra 
/*
 * Move @pi_state from the pi_state_list of the old owner (if any) to
 * that of @new_owner (if any). pi_state->owner is updated only when a
 * new owner is installed; when @new_owner is NULL the caller is
 * responsible for clearing it (see put_pi_state()).
 *
 * Must be called with pi_state->pi_mutex.wait_lock held; the tasks'
 * pi_lock is taken here to serialize the list manipulation, matching
 * the documented lock order wait_lock -> pi_lock.
 */
static void pi_state_update_owner(struct futex_pi_state *pi_state,
				  struct task_struct *new_owner)
{
	struct task_struct *old_owner = pi_state->owner;

	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);

	if (old_owner) {
		raw_spin_lock(&old_owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		raw_spin_unlock(&old_owner->pi_lock);
	}

	if (new_owner) {
		raw_spin_lock(&new_owner->pi_lock);
		WARN_ON(!list_empty(&pi_state->list));
		list_add(&pi_state->list, &new_owner->pi_state_list);
		pi_state->owner = new_owner;
		raw_spin_unlock(&new_owner->pi_lock);
	}
}
6785dc28faSPeter Zijlstra 
get_pi_state(struct futex_pi_state * pi_state)6885dc28faSPeter Zijlstra void get_pi_state(struct futex_pi_state *pi_state)
6985dc28faSPeter Zijlstra {
7085dc28faSPeter Zijlstra 	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
7185dc28faSPeter Zijlstra }
7285dc28faSPeter Zijlstra 
/*
 * Drops a reference to the pi_state object and frees or caches it
 * when the last reference is gone.
 */
void put_pi_state(struct futex_pi_state *pi_state)
{
	if (!pi_state)
		return;

	if (!refcount_dec_and_test(&pi_state->refcount))
		return;

	/*
	 * If pi_state->owner is NULL, the owner is most probably dying
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
		unsigned long flags;

		/*
		 * Detach from the owner's pi_state_list and release the
		 * proxy-locked rtmutex, all under wait_lock per the lock
		 * ordering rules documented above.
		 */
		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
		pi_state_update_owner(pi_state, NULL);
		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
	}

	/* Recycle into current's one-slot cache when it is empty, else free */
	if (current->pi_state_cache) {
		kfree(pi_state);
	} else {
		/*
		 * pi_state->list is already empty.
		 * clear pi_state->owner.
		 * refcount is at 0 - put it back to 1.
		 */
		pi_state->owner = NULL;
		refcount_set(&pi_state->refcount, 1);
		current->pi_state_cache = pi_state;
	}
}
11185dc28faSPeter Zijlstra 
11285dc28faSPeter Zijlstra /*
11385dc28faSPeter Zijlstra  * We need to check the following states:
11485dc28faSPeter Zijlstra  *
11585dc28faSPeter Zijlstra  *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
11685dc28faSPeter Zijlstra  *
11785dc28faSPeter Zijlstra  * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
11885dc28faSPeter Zijlstra  * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
11985dc28faSPeter Zijlstra  *
12085dc28faSPeter Zijlstra  * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
12185dc28faSPeter Zijlstra  *
12285dc28faSPeter Zijlstra  * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
12385dc28faSPeter Zijlstra  * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
12485dc28faSPeter Zijlstra  *
12585dc28faSPeter Zijlstra  * [6]  Found  | Found    | task      | 0         | 1      | Valid
12685dc28faSPeter Zijlstra  *
12785dc28faSPeter Zijlstra  * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
12885dc28faSPeter Zijlstra  *
12985dc28faSPeter Zijlstra  * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
13085dc28faSPeter Zijlstra  * [9]  Found  | Found    | task      | 0         | 0      | Invalid
13185dc28faSPeter Zijlstra  * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
13285dc28faSPeter Zijlstra  *
13385dc28faSPeter Zijlstra  * [1]	Indicates that the kernel can acquire the futex atomically. We
13485dc28faSPeter Zijlstra  *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
13585dc28faSPeter Zijlstra  *
13685dc28faSPeter Zijlstra  * [2]	Valid, if TID does not belong to a kernel thread. If no matching
13785dc28faSPeter Zijlstra  *      thread is found then it indicates that the owner TID has died.
13885dc28faSPeter Zijlstra  *
13985dc28faSPeter Zijlstra  * [3]	Invalid. The waiter is queued on a non PI futex
14085dc28faSPeter Zijlstra  *
14185dc28faSPeter Zijlstra  * [4]	Valid state after exit_robust_list(), which sets the user space
14285dc28faSPeter Zijlstra  *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
14385dc28faSPeter Zijlstra  *
14485dc28faSPeter Zijlstra  * [5]	The user space value got manipulated between exit_robust_list()
14585dc28faSPeter Zijlstra  *	and exit_pi_state_list()
14685dc28faSPeter Zijlstra  *
14785dc28faSPeter Zijlstra  * [6]	Valid state after exit_pi_state_list() which sets the new owner in
14885dc28faSPeter Zijlstra  *	the pi_state but cannot access the user space value.
14985dc28faSPeter Zijlstra  *
15085dc28faSPeter Zijlstra  * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
15185dc28faSPeter Zijlstra  *
15285dc28faSPeter Zijlstra  * [8]	Owner and user space value match
15385dc28faSPeter Zijlstra  *
15485dc28faSPeter Zijlstra  * [9]	There is no transient state which sets the user space TID to 0
15585dc28faSPeter Zijlstra  *	except exit_robust_list(), but this is indicated by the
15685dc28faSPeter Zijlstra  *	FUTEX_OWNER_DIED bit. See [4]
15785dc28faSPeter Zijlstra  *
15885dc28faSPeter Zijlstra  * [10] There is no transient state which leaves owner and user space
15985dc28faSPeter Zijlstra  *	TID out of sync. Except one error case where the kernel is denied
16085dc28faSPeter Zijlstra  *	write access to the user address, see fixup_pi_state_owner().
16185dc28faSPeter Zijlstra  *
16285dc28faSPeter Zijlstra  *
16385dc28faSPeter Zijlstra  * Serialization and lifetime rules:
16485dc28faSPeter Zijlstra  *
16585dc28faSPeter Zijlstra  * hb->lock:
16685dc28faSPeter Zijlstra  *
16785dc28faSPeter Zijlstra  *	hb -> futex_q, relation
16885dc28faSPeter Zijlstra  *	futex_q -> pi_state, relation
16985dc28faSPeter Zijlstra  *
17085dc28faSPeter Zijlstra  *	(cannot be raw because hb can contain arbitrary amount
17185dc28faSPeter Zijlstra  *	 of futex_q's)
17285dc28faSPeter Zijlstra  *
17385dc28faSPeter Zijlstra  * pi_mutex->wait_lock:
17485dc28faSPeter Zijlstra  *
17585dc28faSPeter Zijlstra  *	{uval, pi_state}
17685dc28faSPeter Zijlstra  *
17785dc28faSPeter Zijlstra  *	(and pi_mutex 'obviously')
17885dc28faSPeter Zijlstra  *
17985dc28faSPeter Zijlstra  * p->pi_lock:
18085dc28faSPeter Zijlstra  *
18185dc28faSPeter Zijlstra  *	p->pi_state_list -> pi_state->list, relation
18285dc28faSPeter Zijlstra  *	pi_mutex->owner -> pi_state->owner, relation
18385dc28faSPeter Zijlstra  *
18485dc28faSPeter Zijlstra  * pi_state->refcount:
18585dc28faSPeter Zijlstra  *
18685dc28faSPeter Zijlstra  *	pi_state lifetime
18785dc28faSPeter Zijlstra  *
18885dc28faSPeter Zijlstra  *
18985dc28faSPeter Zijlstra  * Lock order:
19085dc28faSPeter Zijlstra  *
19185dc28faSPeter Zijlstra  *   hb->lock
19285dc28faSPeter Zijlstra  *     pi_mutex->wait_lock
19385dc28faSPeter Zijlstra  *       p->pi_lock
19485dc28faSPeter Zijlstra  *
19585dc28faSPeter Zijlstra  */
19685dc28faSPeter Zijlstra 
/*
 * Validate that the existing waiter has a pi_state and sanity check
 * the pi_state against the user space value. If correct, attach to
 * it.
 *
 * Called with hb->lock held. On success takes a reference on
 * @pi_state, stores it in *@ps and returns 0. Otherwise returns
 * -EINVAL (inconsistent state, cases [3]/[5]/[7]/[9]/[10] above),
 * -EAGAIN (user space value changed, retry) or -EFAULT (fault while
 * re-reading the user space value).
 */
static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
			      struct futex_pi_state *pi_state,
			      struct futex_pi_state **ps)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	u32 uval2;
	int ret;

	/*
	 * Userspace might have messed up non-PI and PI futexes [3]
	 */
	if (unlikely(!pi_state))
		return -EINVAL;

	/*
	 * We get here with hb->lock held, and having found a
	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
	 * which in turn means that futex_lock_pi() still has a reference on
	 * our pi_state.
	 *
	 * The waiter holding a reference on @pi_state also protects against
	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
	 * free pi_state before we can take a reference ourselves.
	 */
	WARN_ON(!refcount_read(&pi_state->refcount));

	/*
	 * Now that we have a pi_state, we can acquire wait_lock
	 * and do the state validation.
	 */
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Since {uval, pi_state} is serialized by wait_lock, and our current
	 * uval was read without holding it, it can have changed. Verify it
	 * still is what we expect it to be, otherwise retry the entire
	 * operation.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		goto out_efault;

	if (uval != uval2)
		goto out_eagain;

	/*
	 * Handle the owner died case:
	 */
	if (uval & FUTEX_OWNER_DIED) {
		/*
		 * exit_pi_state_list sets owner to NULL and wakes the
		 * topmost waiter. The task which acquires the
		 * pi_state->rt_mutex will fixup owner.
		 */
		if (!pi_state->owner) {
			/*
			 * No pi state owner, but the user space TID
			 * is not 0. Inconsistent state. [5]
			 */
			if (pid)
				goto out_einval;
			/*
			 * Take a ref on the state and return success. [4]
			 */
			goto out_attach;
		}

		/*
		 * If TID is 0, then either the dying owner has not
		 * yet executed exit_pi_state_list() or some waiter
		 * acquired the rtmutex in the pi state, but did not
		 * yet fixup the TID in user space.
		 *
		 * Take a ref on the state and return success. [6]
		 */
		if (!pid)
			goto out_attach;
	} else {
		/*
		 * If the owner died bit is not set, then the pi_state
		 * must have an owner. [7]
		 */
		if (!pi_state->owner)
			goto out_einval;
	}

	/*
	 * Bail out if user space manipulated the futex value. If pi
	 * state exists then the owner TID must be the same as the
	 * user space TID. [9/10]
	 */
	if (pid != task_pid_vnr(pi_state->owner))
		goto out_einval;

out_attach:
	get_pi_state(pi_state);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	*ps = pi_state;
	return 0;

out_einval:
	ret = -EINVAL;
	goto out_error;

out_eagain:
	ret = -EAGAIN;
	goto out_error;

out_efault:
	ret = -EFAULT;
	goto out_error;

out_error:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	return ret;
}
31985dc28faSPeter Zijlstra 
/*
 * Disambiguate the race between a waiter attaching to an owner task
 * and that owner exiting. @tsk may be NULL when the task lookup
 * already failed.
 *
 * Returns -EBUSY while @tsk has not yet reached FUTEX_STATE_DEAD (the
 * caller must wait for the exit to complete), -EFAULT on a fault while
 * re-reading *@uaddr, -EAGAIN when the user space value changed (the
 * exiting task modified it, retry), or -ESRCH when the owner is truly
 * gone or the value is bogus.
 */
static int handle_exit_race(u32 __user *uaddr, u32 uval,
			    struct task_struct *tsk)
{
	u32 uval2;

	/*
	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
	 * caller that the alleged owner is busy.
	 */
	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
		return -EBUSY;

	/*
	 * Reread the user space value to handle the following situation:
	 *
	 * CPU0				CPU1
	 *
	 * sys_exit()			sys_futex()
	 *  do_exit()			 futex_lock_pi()
	 *                                futex_lock_pi_atomic()
	 *   exit_signals(tsk)		    No waiters:
	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
	 *  mm_release(tsk)		    Set waiter bit
	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
	 *      Set owner died		    attach_to_pi_owner() {
	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
	 *   }				     if (!tsk->flags & PF_EXITING) {
	 *  ...				       attach();
	 *  tsk->futex_state =               } else {
	 *	FUTEX_STATE_DEAD;              if (tsk->futex_state !=
	 *					  FUTEX_STATE_DEAD)
	 *				         return -EAGAIN;
	 *				       return -ESRCH; <--- FAIL
	 *				     }
	 *
	 * Returning ESRCH unconditionally is wrong here because the
	 * user space value has been changed by the exiting task.
	 *
	 * The same logic applies to the case where the exiting task is
	 * already gone.
	 */
	if (futex_get_value_locked(&uval2, uaddr))
		return -EFAULT;

	/* If the user space value has changed, try again. */
	if (uval2 != uval)
		return -EAGAIN;

	/*
	 * The exiting task did not have a robust list, the robust list was
	 * corrupted or the user space value in *uaddr is simply bogus.
	 * Give up and tell user space.
	 */
	return -ESRCH;
}
37585dc28faSPeter Zijlstra 
/*
 * Allocate a pi_state from the per-task cache, initialize its rtmutex
 * in proxy-locked state with @p as the owner, queue it on
 * @p->pi_state_list and return it through *@ps.
 *
 * Both call sites hold p->pi_lock across this, which serializes the
 * pi_state_list manipulation; hb->lock prevents anyone else from
 * observing the not-yet-published state (see comment below).
 */
static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
				 struct futex_pi_state **ps)
{
	/*
	 * No existing pi state. First waiter. [2]
	 *
	 * This creates pi_state, we have hb->lock held, this means nothing can
	 * observe this state, wait_lock is irrelevant.
	 */
	struct futex_pi_state *pi_state = alloc_pi_state();

	/*
	 * Initialize the pi_mutex in locked state and make @p
	 * the owner of it:
	 */
	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);

	/* Store the key for possible exit cleanups: */
	pi_state->key = *key;

	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	/*
	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
	 * because there is no concurrency as the object is not published yet.
	 */
	pi_state->owner = p;

	*ps = pi_state;
}
/*
 * Lookup the task for the TID provided from user space and attach to
 * it after doing proper sanity checks.
 *
 * Returns 0 on success with *@ps set; -EAGAIN when TID is 0 or the
 * value changed; -EPERM when the TID names a kernel thread; otherwise
 * the result of handle_exit_race() for an exiting/dead owner. When
 * -EBUSY is returned, *@exiting holds a counted reference on the
 * exiting owner which the caller must drop.
 */
static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
			      struct futex_pi_state **ps,
			      struct task_struct **exiting)
{
	pid_t pid = uval & FUTEX_TID_MASK;
	struct task_struct *p;

	/*
	 * We are the first waiter - try to look up the real owner and attach
	 * the new pi_state to it, but bail out when TID = 0 [1]
	 *
	 * The !pid check is paranoid. None of the call sites should end up
	 * with pid == 0, but better safe than sorry. Let the caller retry
	 */
	if (!pid)
		return -EAGAIN;
	p = find_get_task_by_vpid(pid);
	if (!p)
		return handle_exit_race(uaddr, uval, NULL);

	/* PI futexes cannot be owned by kernel threads */
	if (unlikely(p->flags & PF_KTHREAD)) {
		put_task_struct(p);
		return -EPERM;
	}

	/*
	 * We need to look at the task state to figure out, whether the
	 * task is exiting. To protect against the change of the task state
	 * in futex_exit_release(), we do this protected by p->pi_lock:
	 */
	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
		/*
		 * The task is on the way out. When the futex state is
		 * FUTEX_STATE_DEAD, we know that the task has finished
		 * the cleanup:
		 */
		int ret = handle_exit_race(uaddr, uval, p);

		raw_spin_unlock_irq(&p->pi_lock);
		/*
		 * If the owner task is between FUTEX_STATE_EXITING and
		 * FUTEX_STATE_DEAD then store the task pointer and keep
		 * the reference on the task struct. The calling code will
		 * drop all locks, wait for the task to reach
		 * FUTEX_STATE_DEAD and then drop the refcount. This is
		 * required to prevent a live lock when the current task
		 * preempted the exiting task between the two states.
		 */
		if (ret == -EBUSY)
			*exiting = p;
		else
			put_task_struct(p);
		return ret;
	}

	__attach_to_pi_owner(p, key, ps);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);

	return 0;
}
47385dc28faSPeter Zijlstra 
/*
 * Atomically replace the user space futex value @uval with @newval via
 * a locked cmpxchg. Returns 0 on success, -EAGAIN when the value
 * changed underneath us (caller retries), or the fault/injection error
 * from the cmpxchg.
 */
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
	u32 curval;
	int err;

	/* Fault-injection hook for testing the fault paths */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (unlikely(err))
		return err;

	if (curval != uval) {
		/* User space value changed, let the caller retry */
		return -EAGAIN;
	}

	return 0;
}
48985dc28faSPeter Zijlstra 
49085dc28faSPeter Zijlstra /**
49185dc28faSPeter Zijlstra  * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
49285dc28faSPeter Zijlstra  * @uaddr:		the pi futex user address
49385dc28faSPeter Zijlstra  * @hb:			the pi futex hash bucket
49485dc28faSPeter Zijlstra  * @key:		the futex key associated with uaddr and hb
49585dc28faSPeter Zijlstra  * @ps:			the pi_state pointer where we store the result of the
49685dc28faSPeter Zijlstra  *			lookup
49785dc28faSPeter Zijlstra  * @task:		the task to perform the atomic lock work for.  This will
49885dc28faSPeter Zijlstra  *			be "current" except in the case of requeue pi.
49985dc28faSPeter Zijlstra  * @exiting:		Pointer to store the task pointer of the owner task
50085dc28faSPeter Zijlstra  *			which is in the middle of exiting
50185dc28faSPeter Zijlstra  * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
50285dc28faSPeter Zijlstra  *
50385dc28faSPeter Zijlstra  * Return:
50485dc28faSPeter Zijlstra  *  -  0 - ready to wait;
50585dc28faSPeter Zijlstra  *  -  1 - acquired the lock;
50685dc28faSPeter Zijlstra  *  - <0 - error
50785dc28faSPeter Zijlstra  *
50885dc28faSPeter Zijlstra  * The hb->lock must be held by the caller.
50985dc28faSPeter Zijlstra  *
51085dc28faSPeter Zijlstra  * @exiting is only set when the return value is -EBUSY. If so, this holds
51185dc28faSPeter Zijlstra  * a refcount on the exiting task on return and the caller needs to drop it
51285dc28faSPeter Zijlstra  * after waiting for the exit to complete.
51385dc28faSPeter Zijlstra  */
int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
			 union futex_key *key,
			 struct futex_pi_state **ps,
			 struct task_struct *task,
			 struct task_struct **exiting,
			 int set_waiters)
{
	u32 uval, newval, vpid = task_pid_vnr(task);
	struct futex_q *top_waiter;
	int ret;

	/*
	 * Read the user space value first so we can validate a few
	 * things before proceeding further.
	 */
	if (futex_get_value_locked(&uval, uaddr))
		return -EFAULT;

	/* Fault-injection hook */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	/*
	 * Detect deadlocks: @task already owns the user space futex.
	 */
	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
		return -EDEADLK;

	if ((unlikely(should_fail_futex(true))))
		return -EDEADLK;

	/*
	 * Lookup existing state first. If it exists, try to attach to
	 * its pi_state.
	 */
	top_waiter = futex_top_waiter(hb, key);
	if (top_waiter)
		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

	/*
	 * No waiter and user TID is 0. We are here because the
	 * waiters or the owner died bit is set or called from
	 * requeue_cmp_pi or for whatever reason something took the
	 * syscall.
	 */
	if (!(uval & FUTEX_TID_MASK)) {
		/*
		 * We take over the futex. No other waiters and the user space
		 * TID is 0. We preserve the owner died bit.
		 */
		newval = uval & FUTEX_OWNER_DIED;
		newval |= vpid;

		/* The futex requeue_pi code can enforce the waiters bit */
		if (set_waiters)
			newval |= FUTEX_WAITERS;

		ret = lock_pi_update_atomic(uaddr, uval, newval);
		if (ret)
			return ret;

		/*
		 * If the waiter bit was requested the caller also needs PI
		 * state attached to the new owner of the user space futex.
		 *
		 * @task is guaranteed to be alive and it cannot be exiting
		 * because it is either sleeping or waiting in
		 * futex_requeue_pi_wakeup_sync().
		 *
		 * No need to do the full attach_to_pi_owner() exercise
		 * because @task is known and valid.
		 */
		if (set_waiters) {
			raw_spin_lock_irq(&task->pi_lock);
			__attach_to_pi_owner(task, key, ps);
			raw_spin_unlock_irq(&task->pi_lock);
		}
		/* Lock acquired */
		return 1;
	}

	/*
	 * First waiter. Set the waiters bit before attaching ourself to
	 * the owner. If owner tries to unlock, it will be forced into
	 * the kernel and blocked on hb->lock.
	 */
	newval = uval | FUTEX_WAITERS;
	ret = lock_pi_update_atomic(uaddr, uval, newval);
	if (ret)
		return ret;
	/*
	 * If the update of the user space value succeeded, we try to
	 * attach to the owner. If that fails, no harm done, we only
	 * set the FUTEX_WAITERS bit in the user space variable.
	 */
	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}
60985dc28faSPeter Zijlstra 
/*
 * wake_futex_pi() - Hand the futex and the rtmutex over to the top waiter
 * @uaddr:	user space address of the futex
 * @uval:	futex value the caller observed before locking
 * @pi_state:	PI state of the futex
 *
 * Caller must hold a reference on @pi_state.
 *
 * Called with pi_state->pi_mutex.wait_lock held; the lock is dropped on
 * all paths (see out_unlock) so the deferred wakeup can be issued without
 * holding it.
 *
 * Return: 0 on success, -EAGAIN to retry the operation, -EFAULT on a
 * user space access fault, -EINVAL if user space fiddled with the value.
 */
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
	struct rt_mutex_waiter *top_waiter;
	struct task_struct *new_owner;
	bool postunlock = false;
	DEFINE_RT_WAKE_Q(wqh);
	u32 curval, newval;
	int ret = 0;

	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
	if (WARN_ON_ONCE(!top_waiter)) {
		/*
		 * As per the comment in futex_unlock_pi() this should not happen.
		 *
		 * When this happens, give up our locks and try again, giving
		 * the futex_lock_pi() instance time to complete, either by
		 * waiting on the rtmutex or removing itself from the futex
		 * queue.
		 */
		ret = -EAGAIN;
		goto out_unlock;
	}

	new_owner = top_waiter->task;

	/*
	 * We pass it to the next owner. The WAITERS bit is always kept
	 * enabled while there is PI state around. We cleanup the owner
	 * died bit, because we are the owner.
	 */
	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

	/* Fault-injection hook to exercise the -EFAULT path. */
	if (unlikely(should_fail_futex(true))) {
		ret = -EFAULT;
		goto out_unlock;
	}

	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
	if (!ret && (curval != uval)) {
		/*
		 * If a unconditional UNLOCK_PI operation (user space did not
		 * try the TID->0 transition) raced with a waiter setting the
		 * FUTEX_WAITERS flag between get_user() and locking the hash
		 * bucket lock, retry the operation.
		 */
		if ((FUTEX_TID_MASK & curval) == uval)
			ret = -EAGAIN;
		else
			ret = -EINVAL;
	}

	if (!ret) {
		/*
		 * This is a point of no return; once we modified the uval
		 * there is no going back and subsequent operations must
		 * not fail.
		 */
		pi_state_update_owner(pi_state, new_owner);
		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
	}

out_unlock:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	/* Issue the deferred wakeup outside of wait_lock. */
	if (postunlock)
		rt_mutex_postunlock(&wqh);

	return ret;
}
68285dc28faSPeter Zijlstra 
/*
 * __fixup_pi_state_owner() - Fix up pi_state::owner and the user space TID
 * @uaddr:	user space address of the futex
 * @q:		futex_q whose pi_state needs fixing
 * @argowner:	the intended new owner (current), or NULL when somebody
 *		else stole the lock from us
 *
 * Must be called with q->lock_ptr and pi_state->pi_mutex.wait_lock held.
 * Both locks may be dropped and reacquired to handle faults or to
 * reschedule (see handle_err), after which the state is revalidated.
 *
 * Return: 1 if current owns the rtmutex afterwards, 0 if not, or a
 * negative error code when the user space value could not be fixed up.
 */
static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				  struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	struct task_struct *oldowner, *newowner;
	u32 uval, curval, newval, newtid;
	int err = 0;

	oldowner = pi_state->owner;

	/*
	 * We are here because either:
	 *
	 *  - we stole the lock and pi_state->owner needs updating to reflect
	 *    that (@argowner == current),
	 *
	 * or:
	 *
	 *  - someone stole our lock and we need to fix things to point to the
	 *    new owner (@argowner == NULL).
	 *
	 * Either way, we have to replace the TID in the user space variable.
	 * This must be atomic as we have to preserve the owner died bit here.
	 *
	 * Note: We write the user space value _before_ changing the pi_state
	 * because we can fault here. Imagine swapped out pages or a fork
	 * that marked all the anonymous memory readonly for cow.
	 *
	 * Modifying pi_state _before_ the user space value would leave the
	 * pi_state in an inconsistent state when we fault here, because we
	 * need to drop the locks to handle the fault. This might be observed
	 * in the PID checks when attaching to PI state .
	 */
retry:
	if (!argowner) {
		if (oldowner != current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 0;
		}

		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
			/* We got the lock. pi_state is correct. Tell caller. */
			return 1;
		}

		/*
		 * The trylock just failed, so either there is an owner or
		 * there is a higher priority waiter than this one.
		 */
		newowner = rt_mutex_owner(&pi_state->pi_mutex);
		/*
		 * If the higher priority waiter has not yet taken over the
		 * rtmutex then newowner is NULL. We can't return here with
		 * that state because it's inconsistent vs. the user space
		 * state. So drop the locks and try again. It's a valid
		 * situation and not any different from the other retry
		 * conditions.
		 */
		if (unlikely(!newowner)) {
			err = -EAGAIN;
			goto handle_err;
		}
	} else {
		WARN_ON_ONCE(argowner != current);
		if (oldowner == current) {
			/*
			 * We raced against a concurrent self; things are
			 * already fixed up. Nothing to do.
			 */
			return 1;
		}
		newowner = argowner;
	}

	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
	/* Owner died? */
	if (!pi_state->owner)
		newtid |= FUTEX_OWNER_DIED;

	err = futex_get_value_locked(&uval, uaddr);
	if (err)
		goto handle_err;

	/*
	 * cmpxchg loop: install newtid while preserving whatever
	 * FUTEX_OWNER_DIED state user space currently carries.
	 */
	for (;;) {
		newval = (uval & FUTEX_OWNER_DIED) | newtid;

		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
		if (err)
			goto handle_err;

		if (curval == uval)
			break;
		uval = curval;
	}

	/*
	 * We fixed up user space. Now we need to fix the pi_state
	 * itself.
	 */
	pi_state_update_owner(pi_state, newowner);

	return argowner == current;

	/*
	 * In order to reschedule or handle a page fault, we need to drop the
	 * locks here. In the case of a fault, this gives the other task
	 * (either the highest priority waiter itself or the task which stole
	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
	 * are back from handling the fault we need to check the pi_state after
	 * reacquiring the locks and before trying to do another fixup. When
	 * the fixup has been done already we simply return.
	 *
	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
	 * drop hb->lock since the caller owns the hb -> futex_q relation.
	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
	 */
handle_err:
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
	spin_unlock(q->lock_ptr);

	switch (err) {
	case -EFAULT:
		err = fault_in_user_writeable(uaddr);
		break;

	case -EAGAIN:
		cond_resched();
		err = 0;
		break;

	default:
		WARN_ON_ONCE(1);
		break;
	}

	spin_lock(q->lock_ptr);
	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

	/*
	 * Check if someone else fixed it for us:
	 */
	if (pi_state->owner != oldowner)
		return argowner == current;

	/* Retry if err was -EAGAIN or the fault in succeeded */
	if (!err)
		goto retry;

	/*
	 * fault_in_user_writeable() failed so user state is immutable. At
	 * best we can make the kernel state consistent but user state will
	 * be most likely hosed and any subsequent unlock operation will be
	 * rejected due to PI futex rule [10].
	 *
	 * Ensure that the rtmutex owner is also the pi_state owner despite
	 * the user space value claiming something different. There is no
	 * point in unlocking the rtmutex if current is the owner as it
	 * would need to wait until the next waiter has taken the rtmutex
	 * to guarantee consistent state. Keep it simple. Userspace asked
	 * for this wreckaged state.
	 *
	 * The rtmutex has an owner - either current or some other
	 * task. See the EAGAIN loop above.
	 */
	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));

	return err;
}
85485dc28faSPeter Zijlstra 
/*
 * Locked wrapper around __fixup_pi_state_owner(): runs the fixup under
 * pi_mutex.wait_lock. The caller must already hold the hash bucket lock
 * protecting @q; the helper may drop and reacquire both locks internally
 * to handle faults.
 */
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
				struct task_struct *argowner)
{
	struct futex_pi_state *pi_state = q->pi_state;
	int err;

	lockdep_assert_held(q->lock_ptr);

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
	err = __fixup_pi_state_owner(uaddr, q, argowner);
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

	return err;
}
86885dc28faSPeter Zijlstra 
86985dc28faSPeter Zijlstra /**
87085dc28faSPeter Zijlstra  * fixup_pi_owner() - Post lock pi_state and corner case management
87185dc28faSPeter Zijlstra  * @uaddr:	user address of the futex
87285dc28faSPeter Zijlstra  * @q:		futex_q (contains pi_state and access to the rt_mutex)
87385dc28faSPeter Zijlstra  * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
87485dc28faSPeter Zijlstra  *
87585dc28faSPeter Zijlstra  * After attempting to lock an rt_mutex, this function is called to cleanup
87685dc28faSPeter Zijlstra  * the pi_state owner as well as handle race conditions that may allow us to
87785dc28faSPeter Zijlstra  * acquire the lock. Must be called with the hb lock held.
87885dc28faSPeter Zijlstra  *
87985dc28faSPeter Zijlstra  * Return:
88085dc28faSPeter Zijlstra  *  -  1 - success, lock taken;
88185dc28faSPeter Zijlstra  *  -  0 - success, lock not taken;
88285dc28faSPeter Zijlstra  *  - <0 - on error (-EFAULT)
88385dc28faSPeter Zijlstra  */
fixup_pi_owner(u32 __user * uaddr,struct futex_q * q,int locked)88485dc28faSPeter Zijlstra int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
88585dc28faSPeter Zijlstra {
88685dc28faSPeter Zijlstra 	if (locked) {
88785dc28faSPeter Zijlstra 		/*
88885dc28faSPeter Zijlstra 		 * Got the lock. We might not be the anticipated owner if we
88985dc28faSPeter Zijlstra 		 * did a lock-steal - fix up the PI-state in that case:
89085dc28faSPeter Zijlstra 		 *
89185dc28faSPeter Zijlstra 		 * Speculative pi_state->owner read (we don't hold wait_lock);
89285dc28faSPeter Zijlstra 		 * since we own the lock pi_state->owner == current is the
89385dc28faSPeter Zijlstra 		 * stable state, anything else needs more attention.
89485dc28faSPeter Zijlstra 		 */
89585dc28faSPeter Zijlstra 		if (q->pi_state->owner != current)
89685dc28faSPeter Zijlstra 			return fixup_pi_state_owner(uaddr, q, current);
89785dc28faSPeter Zijlstra 		return 1;
89885dc28faSPeter Zijlstra 	}
89985dc28faSPeter Zijlstra 
90085dc28faSPeter Zijlstra 	/*
90185dc28faSPeter Zijlstra 	 * If we didn't get the lock; check if anybody stole it from us. In
90285dc28faSPeter Zijlstra 	 * that case, we need to fix up the uval to point to them instead of
90385dc28faSPeter Zijlstra 	 * us, otherwise bad things happen. [10]
90485dc28faSPeter Zijlstra 	 *
90585dc28faSPeter Zijlstra 	 * Another speculative read; pi_state->owner == current is unstable
90685dc28faSPeter Zijlstra 	 * but needs our attention.
90785dc28faSPeter Zijlstra 	 */
90885dc28faSPeter Zijlstra 	if (q->pi_state->owner == current)
90985dc28faSPeter Zijlstra 		return fixup_pi_state_owner(uaddr, q, NULL);
91085dc28faSPeter Zijlstra 
91185dc28faSPeter Zijlstra 	/*
91285dc28faSPeter Zijlstra 	 * Paranoia check. If we did not take the lock, then we should not be
91385dc28faSPeter Zijlstra 	 * the owner of the rt_mutex. Warn and establish consistent state.
91485dc28faSPeter Zijlstra 	 */
91585dc28faSPeter Zijlstra 	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
91685dc28faSPeter Zijlstra 		return fixup_pi_state_owner(uaddr, q, current);
91785dc28faSPeter Zijlstra 
91885dc28faSPeter Zijlstra 	return 0;
91985dc28faSPeter Zijlstra }
92085dc28faSPeter Zijlstra 
/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block as a consequence of relying
 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
 * a 0 value of the futex too.).
 *
 * Also serves as futex trylock_pi()'ing, and due semantics.
 *
 * @uaddr:	user space address of the futex
 * @flags:	futex flags (FLAGS_SHARED, ...)
 * @time:	optional absolute timeout (HRTIMER_MODE_ABS), or NULL for
 *		no timeout
 * @trylock:	if non-zero, only try to acquire the rtmutex; never block
 *
 * Return: 0 on success, -EWOULDBLOCK for a failed trylock,
 * -ERESTARTNOINTR when interrupted by a signal, or another negative
 * error code (-ENOSYS, -ENOMEM, -EFAULT, -ETIMEDOUT, ...).
 */
int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to;
	struct task_struct *exiting = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;
	int res, ret;

	if (!IS_ENABLED(CONFIG_FUTEX_PI))
		return -ENOSYS;

	/* Make sure a pi_state can be allocated without sleeping later. */
	if (refill_pi_state_cache())
		return -ENOMEM;

	to = futex_setup_timer(time, &timeout, flags, 0);

retry:
	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	hb = futex_q_lock(&q);

	/* Last argument is set_waiters == 0: we have not queued ourself yet. */
	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
				   &exiting, 0);
	if (unlikely(ret)) {
		/*
		 * Atomic work succeeded and we got the lock,
		 * or failed. Either way, we do _not_ block.
		 */
		switch (ret) {
		case 1:
			/* We got the lock. */
			ret = 0;
			goto out_unlock_put_key;
		case -EFAULT:
			goto uaddr_faulted;
		case -EBUSY:
		case -EAGAIN:
			/*
			 * Two reasons for this:
			 * - EBUSY: Task is exiting and we just wait for the
			 *   exit to complete.
			 * - EAGAIN: The user space value changed.
			 */
			futex_q_unlock(hb);
			/*
			 * Handle the case where the owner is in the middle of
			 * exiting. Wait for the exit to complete otherwise
			 * this task might loop forever, aka. live lock.
			 */
			wait_for_owner_exiting(ret, exiting);
			cond_resched();
			goto retry;
		default:
			goto out_unlock_put_key;
		}
	}

	WARN_ON(!q.pi_state);

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	__futex_queue(&q, hb);

	if (trylock) {
		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
		goto no_block;
	}

	rt_mutex_init_waiter(&rt_waiter);

	/*
	 * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
	 * hold it while doing rt_mutex_start_proxy(), because then it will
	 * include hb->lock in the blocking chain, even through we'll not in
	 * fact hold it while blocking. This will lead it to report -EDEADLK
	 * and BUG when futex_unlock_pi() interleaves with this.
	 *
	 * Therefore acquire wait_lock while holding hb->lock, but drop the
	 * latter before calling __rt_mutex_start_proxy_lock(). This
	 * interleaves with futex_unlock_pi() -- which does a similar lock
	 * handoff -- such that the latter can observe the futex_q::pi_state
	 * before __rt_mutex_start_proxy_lock() is done.
	 */
	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
	spin_unlock(q.lock_ptr);
	/*
	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
	 * it sees the futex_q::pi_state.
	 */
	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);

	if (ret) {
		/* ret == 1: the proxy lock already acquired the rtmutex. */
		if (ret == 1)
			ret = 0;
		goto cleanup;
	}

	if (unlikely(to))
		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

	/* Block until the rtmutex is acquired, the timeout expires or a signal. */
	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);

cleanup:
	spin_lock(q.lock_ptr);
	/*
	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
	 * first acquire the hb->lock before removing the lock from the
	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
	 * lists consistent.
	 *
	 * In particular; it is important that futex_unlock_pi() can not
	 * observe this inconsistency.
	 */
	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
		ret = 0;

no_block:
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_pi_owner(uaddr, &q, !ret);
	/*
	 * If fixup_pi_owner() returned an error, propagate that.  If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
	if (res)
		ret = (res < 0) ? res : 0;

	futex_unqueue_pi(&q);
	spin_unlock(q.lock_ptr);
	goto out;

out_unlock_put_key:
	futex_q_unlock(hb);

out:
	if (to) {
		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
	}
	/* Map -EINTR to -ERESTARTNOINTR: transparently restart after a signal. */
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
	futex_q_unlock(hb);

	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out;

	/* Private futexes do not need a new key lookup; shared ones do. */
	if (!(flags & FLAGS_SHARED))
		goto retry_private;

	goto retry;
}
109485dc28faSPeter Zijlstra 
109585dc28faSPeter Zijlstra /*
109685dc28faSPeter Zijlstra  * Userspace attempted a TID -> 0 atomic transition, and failed.
109785dc28faSPeter Zijlstra  * This is the in-kernel slowpath: we look up the PI state (if any),
109885dc28faSPeter Zijlstra  * and do the rt-mutex unlock.
109985dc28faSPeter Zijlstra  */
futex_unlock_pi(u32 __user * uaddr,unsigned int flags)110085dc28faSPeter Zijlstra int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
110185dc28faSPeter Zijlstra {
110285dc28faSPeter Zijlstra 	u32 curval, uval, vpid = task_pid_vnr(current);
110385dc28faSPeter Zijlstra 	union futex_key key = FUTEX_KEY_INIT;
110485dc28faSPeter Zijlstra 	struct futex_hash_bucket *hb;
110585dc28faSPeter Zijlstra 	struct futex_q *top_waiter;
110685dc28faSPeter Zijlstra 	int ret;
110785dc28faSPeter Zijlstra 
110885dc28faSPeter Zijlstra 	if (!IS_ENABLED(CONFIG_FUTEX_PI))
110985dc28faSPeter Zijlstra 		return -ENOSYS;
111085dc28faSPeter Zijlstra 
111185dc28faSPeter Zijlstra retry:
111285dc28faSPeter Zijlstra 	if (get_user(uval, uaddr))
111385dc28faSPeter Zijlstra 		return -EFAULT;
111485dc28faSPeter Zijlstra 	/*
111585dc28faSPeter Zijlstra 	 * We release only a lock we actually own:
111685dc28faSPeter Zijlstra 	 */
111785dc28faSPeter Zijlstra 	if ((uval & FUTEX_TID_MASK) != vpid)
111885dc28faSPeter Zijlstra 		return -EPERM;
111985dc28faSPeter Zijlstra 
112085dc28faSPeter Zijlstra 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
112185dc28faSPeter Zijlstra 	if (ret)
112285dc28faSPeter Zijlstra 		return ret;
112385dc28faSPeter Zijlstra 
112485dc28faSPeter Zijlstra 	hb = futex_hash(&key);
112585dc28faSPeter Zijlstra 	spin_lock(&hb->lock);
112685dc28faSPeter Zijlstra 
112785dc28faSPeter Zijlstra 	/*
112885dc28faSPeter Zijlstra 	 * Check waiters first. We do not trust user space values at
112985dc28faSPeter Zijlstra 	 * all and we at least want to know if user space fiddled
113085dc28faSPeter Zijlstra 	 * with the futex value instead of blindly unlocking.
113185dc28faSPeter Zijlstra 	 */
113285dc28faSPeter Zijlstra 	top_waiter = futex_top_waiter(hb, &key);
113385dc28faSPeter Zijlstra 	if (top_waiter) {
113485dc28faSPeter Zijlstra 		struct futex_pi_state *pi_state = top_waiter->pi_state;
113585dc28faSPeter Zijlstra 
113685dc28faSPeter Zijlstra 		ret = -EINVAL;
113785dc28faSPeter Zijlstra 		if (!pi_state)
113885dc28faSPeter Zijlstra 			goto out_unlock;
113985dc28faSPeter Zijlstra 
114085dc28faSPeter Zijlstra 		/*
114185dc28faSPeter Zijlstra 		 * If current does not own the pi_state then the futex is
114285dc28faSPeter Zijlstra 		 * inconsistent and user space fiddled with the futex value.
114385dc28faSPeter Zijlstra 		 */
114485dc28faSPeter Zijlstra 		if (pi_state->owner != current)
114585dc28faSPeter Zijlstra 			goto out_unlock;
114685dc28faSPeter Zijlstra 
114785dc28faSPeter Zijlstra 		get_pi_state(pi_state);
114885dc28faSPeter Zijlstra 		/*
114985dc28faSPeter Zijlstra 		 * By taking wait_lock while still holding hb->lock, we ensure
115085dc28faSPeter Zijlstra 		 * there is no point where we hold neither; and therefore
115185dc28faSPeter Zijlstra 		 * wake_futex_p() must observe a state consistent with what we
115285dc28faSPeter Zijlstra 		 * observed.
115385dc28faSPeter Zijlstra 		 *
115485dc28faSPeter Zijlstra 		 * In particular, this forces __rt_mutex_start_proxy_lock() to
115585dc28faSPeter Zijlstra 		 * complete such that we're guaranteed to observe the
115685dc28faSPeter Zijlstra 		 * rt_waiter. Also see the WARN in wake_futex_pi().
115785dc28faSPeter Zijlstra 		 */
115885dc28faSPeter Zijlstra 		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
115985dc28faSPeter Zijlstra 		spin_unlock(&hb->lock);
116085dc28faSPeter Zijlstra 
116185dc28faSPeter Zijlstra 		/* drops pi_state->pi_mutex.wait_lock */
116285dc28faSPeter Zijlstra 		ret = wake_futex_pi(uaddr, uval, pi_state);
116385dc28faSPeter Zijlstra 
116485dc28faSPeter Zijlstra 		put_pi_state(pi_state);
116585dc28faSPeter Zijlstra 
116685dc28faSPeter Zijlstra 		/*
116785dc28faSPeter Zijlstra 		 * Success, we're done! No tricky corner cases.
116885dc28faSPeter Zijlstra 		 */
116985dc28faSPeter Zijlstra 		if (!ret)
117085dc28faSPeter Zijlstra 			return ret;
117185dc28faSPeter Zijlstra 		/*
117285dc28faSPeter Zijlstra 		 * The atomic access to the futex value generated a
117385dc28faSPeter Zijlstra 		 * pagefault, so retry the user-access and the wakeup:
117485dc28faSPeter Zijlstra 		 */
117585dc28faSPeter Zijlstra 		if (ret == -EFAULT)
117685dc28faSPeter Zijlstra 			goto pi_faulted;
117785dc28faSPeter Zijlstra 		/*
117885dc28faSPeter Zijlstra 		 * An unconditional UNLOCK_PI op raced against a waiter
117985dc28faSPeter Zijlstra 		 * setting the FUTEX_WAITERS bit. Try again.
118085dc28faSPeter Zijlstra 		 */
118185dc28faSPeter Zijlstra 		if (ret == -EAGAIN)
118285dc28faSPeter Zijlstra 			goto pi_retry;
118385dc28faSPeter Zijlstra 		/*
118485dc28faSPeter Zijlstra 		 * wake_futex_pi has detected invalid state. Tell user
118585dc28faSPeter Zijlstra 		 * space.
118685dc28faSPeter Zijlstra 		 */
118785dc28faSPeter Zijlstra 		return ret;
118885dc28faSPeter Zijlstra 	}
118985dc28faSPeter Zijlstra 
119085dc28faSPeter Zijlstra 	/*
119185dc28faSPeter Zijlstra 	 * We have no kernel internal state, i.e. no waiters in the
119285dc28faSPeter Zijlstra 	 * kernel. Waiters which are about to queue themselves are stuck
119385dc28faSPeter Zijlstra 	 * on hb->lock. So we can safely ignore them. We do neither
119485dc28faSPeter Zijlstra 	 * preserve the WAITERS bit nor the OWNER_DIED one. We are the
119585dc28faSPeter Zijlstra 	 * owner.
119685dc28faSPeter Zijlstra 	 */
119785dc28faSPeter Zijlstra 	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
119885dc28faSPeter Zijlstra 		spin_unlock(&hb->lock);
119985dc28faSPeter Zijlstra 		switch (ret) {
120085dc28faSPeter Zijlstra 		case -EFAULT:
120185dc28faSPeter Zijlstra 			goto pi_faulted;
120285dc28faSPeter Zijlstra 
120385dc28faSPeter Zijlstra 		case -EAGAIN:
120485dc28faSPeter Zijlstra 			goto pi_retry;
120585dc28faSPeter Zijlstra 
120685dc28faSPeter Zijlstra 		default:
120785dc28faSPeter Zijlstra 			WARN_ON_ONCE(1);
120885dc28faSPeter Zijlstra 			return ret;
120985dc28faSPeter Zijlstra 		}
121085dc28faSPeter Zijlstra 	}
121185dc28faSPeter Zijlstra 
121285dc28faSPeter Zijlstra 	/*
121385dc28faSPeter Zijlstra 	 * If uval has changed, let user space handle it.
121485dc28faSPeter Zijlstra 	 */
121585dc28faSPeter Zijlstra 	ret = (curval == uval) ? 0 : -EAGAIN;
121685dc28faSPeter Zijlstra 
121785dc28faSPeter Zijlstra out_unlock:
121885dc28faSPeter Zijlstra 	spin_unlock(&hb->lock);
121985dc28faSPeter Zijlstra 	return ret;
122085dc28faSPeter Zijlstra 
122185dc28faSPeter Zijlstra pi_retry:
122285dc28faSPeter Zijlstra 	cond_resched();
122385dc28faSPeter Zijlstra 	goto retry;
122485dc28faSPeter Zijlstra 
122585dc28faSPeter Zijlstra pi_faulted:
122685dc28faSPeter Zijlstra 
122785dc28faSPeter Zijlstra 	ret = fault_in_user_writeable(uaddr);
122885dc28faSPeter Zijlstra 	if (!ret)
122985dc28faSPeter Zijlstra 		goto retry;
123085dc28faSPeter Zijlstra 
123185dc28faSPeter Zijlstra 	return ret;
123285dc28faSPeter Zijlstra }
123385dc28faSPeter Zijlstra 
1234