xref: /openbmc/linux/kernel/futex/pi.c (revision dfc66bef)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 
3 #include <linux/slab.h>
4 #include <linux/sched/task.h>
5 
6 #include "futex.h"
7 #include "../locking/rtmutex_common.h"
8 
9 /*
10  * PI code:
11  */
12 int refill_pi_state_cache(void)
13 {
14 	struct futex_pi_state *pi_state;
15 
16 	if (likely(current->pi_state_cache))
17 		return 0;
18 
19 	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
20 
21 	if (!pi_state)
22 		return -ENOMEM;
23 
24 	INIT_LIST_HEAD(&pi_state->list);
25 	/* pi_mutex gets initialized later */
26 	pi_state->owner = NULL;
27 	refcount_set(&pi_state->refcount, 1);
28 	pi_state->key = FUTEX_KEY_INIT;
29 
30 	current->pi_state_cache = pi_state;
31 
32 	return 0;
33 }
34 
35 static struct futex_pi_state *alloc_pi_state(void)
36 {
37 	struct futex_pi_state *pi_state = current->pi_state_cache;
38 
39 	WARN_ON(!pi_state);
40 	current->pi_state_cache = NULL;
41 
42 	return pi_state;
43 }
44 
45 static void pi_state_update_owner(struct futex_pi_state *pi_state,
46 				  struct task_struct *new_owner)
47 {
48 	struct task_struct *old_owner = pi_state->owner;
49 
50 	lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
51 
52 	if (old_owner) {
53 		raw_spin_lock(&old_owner->pi_lock);
54 		WARN_ON(list_empty(&pi_state->list));
55 		list_del_init(&pi_state->list);
56 		raw_spin_unlock(&old_owner->pi_lock);
57 	}
58 
59 	if (new_owner) {
60 		raw_spin_lock(&new_owner->pi_lock);
61 		WARN_ON(!list_empty(&pi_state->list));
62 		list_add(&pi_state->list, &new_owner->pi_state_list);
63 		pi_state->owner = new_owner;
64 		raw_spin_unlock(&new_owner->pi_lock);
65 	}
66 }
67 
68 void get_pi_state(struct futex_pi_state *pi_state)
69 {
70 	WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
71 }
72 
73 /*
74  * Drops a reference to the pi_state object and frees or caches it
75  * when the last reference is gone.
76  */
77 void put_pi_state(struct futex_pi_state *pi_state)
78 {
79 	if (!pi_state)
80 		return;
81 
82 	if (!refcount_dec_and_test(&pi_state->refcount))
83 		return;
84 
85 	/*
86 	 * If pi_state->owner is NULL, the owner is most probably dying
87 	 * and has cleaned up the pi_state already
88 	 */
89 	if (pi_state->owner) {
90 		unsigned long flags;
91 
92 		raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
93 		pi_state_update_owner(pi_state, NULL);
94 		rt_mutex_proxy_unlock(&pi_state->pi_mutex);
95 		raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
96 	}
97 
98 	if (current->pi_state_cache) {
99 		kfree(pi_state);
100 	} else {
101 		/*
102 		 * pi_state->list is already empty.
103 		 * clear pi_state->owner.
104 		 * refcount is at 0 - put it back to 1.
105 		 */
106 		pi_state->owner = NULL;
107 		refcount_set(&pi_state->refcount, 1);
108 		current->pi_state_cache = pi_state;
109 	}
110 }
111 
112 /*
113  * We need to check the following states:
114  *
115  *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
116  *
117  * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
118  * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
119  *
120  * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
121  *
122  * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
123  * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
124  *
125  * [6]  Found  | Found    | task      | 0         | 1      | Valid
126  *
127  * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
128  *
129  * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
130  * [9]  Found  | Found    | task      | 0         | 0      | Invalid
131  * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
132  *
133  * [1]	Indicates that the kernel can acquire the futex atomically. We
134  *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
135  *
136  * [2]	Valid, if TID does not belong to a kernel thread. If no matching
137  *      thread is found then it indicates that the owner TID has died.
138  *
139  * [3]	Invalid. The waiter is queued on a non PI futex
140  *
141  * [4]	Valid state after exit_robust_list(), which sets the user space
142  *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
143  *
144  * [5]	The user space value got manipulated between exit_robust_list()
145  *	and exit_pi_state_list()
146  *
147  * [6]	Valid state after exit_pi_state_list() which sets the new owner in
148  *	the pi_state but cannot access the user space value.
149  *
150  * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
151  *
152  * [8]	Owner and user space value match
153  *
154  * [9]	There is no transient state which sets the user space TID to 0
155  *	except exit_robust_list(), but this is indicated by the
156  *	FUTEX_OWNER_DIED bit. See [4]
157  *
158  * [10] There is no transient state which leaves owner and user space
159  *	TID out of sync. Except one error case where the kernel is denied
160  *	write access to the user address, see fixup_pi_state_owner().
161  *
162  *
163  * Serialization and lifetime rules:
164  *
165  * hb->lock:
166  *
167  *	hb -> futex_q, relation
168  *	futex_q -> pi_state, relation
169  *
170  *	(cannot be raw because hb can contain arbitrary amount
171  *	 of futex_q's)
172  *
173  * pi_mutex->wait_lock:
174  *
175  *	{uval, pi_state}
176  *
177  *	(and pi_mutex 'obviously')
178  *
179  * p->pi_lock:
180  *
181  *	p->pi_state_list -> pi_state->list, relation
182  *	pi_mutex->owner -> pi_state->owner, relation
183  *
184  * pi_state->refcount:
185  *
186  *	pi_state lifetime
187  *
188  *
189  * Lock order:
190  *
191  *   hb->lock
192  *     pi_mutex->wait_lock
193  *       p->pi_lock
194  *
195  */
196 
197 /*
198  * Validate that the existing waiter has a pi_state and sanity check
199  * the pi_state against the user space value. If correct, attach to
200  * it.
201  */
202 static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
203 			      struct futex_pi_state *pi_state,
204 			      struct futex_pi_state **ps)
205 {
206 	pid_t pid = uval & FUTEX_TID_MASK;
207 	u32 uval2;
208 	int ret;
209 
210 	/*
211 	 * Userspace might have messed up non-PI and PI futexes [3]
212 	 */
213 	if (unlikely(!pi_state))
214 		return -EINVAL;
215 
216 	/*
217 	 * We get here with hb->lock held, and having found a
218 	 * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
219 	 * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
220 	 * which in turn means that futex_lock_pi() still has a reference on
221 	 * our pi_state.
222 	 *
223 	 * The waiter holding a reference on @pi_state also protects against
224 	 * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
225 	 * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
226 	 * free pi_state before we can take a reference ourselves.
227 	 */
228 	WARN_ON(!refcount_read(&pi_state->refcount));
229 
230 	/*
231 	 * Now that we have a pi_state, we can acquire wait_lock
232 	 * and do the state validation.
233 	 */
234 	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
235 
236 	/*
237 	 * Since {uval, pi_state} is serialized by wait_lock, and our current
238 	 * uval was read without holding it, it can have changed. Verify it
239 	 * still is what we expect it to be, otherwise retry the entire
240 	 * operation.
241 	 */
242 	if (futex_get_value_locked(&uval2, uaddr))
243 		goto out_efault;
244 
245 	if (uval != uval2)
246 		goto out_eagain;
247 
248 	/*
249 	 * Handle the owner died case:
250 	 */
251 	if (uval & FUTEX_OWNER_DIED) {
252 		/*
253 		 * exit_pi_state_list sets owner to NULL and wakes the
254 		 * topmost waiter. The task which acquires the
255 		 * pi_state->rt_mutex will fixup owner.
256 		 */
257 		if (!pi_state->owner) {
258 			/*
259 			 * No pi state owner, but the user space TID
260 			 * is not 0. Inconsistent state. [5]
261 			 */
262 			if (pid)
263 				goto out_einval;
264 			/*
265 			 * Take a ref on the state and return success. [4]
266 			 */
267 			goto out_attach;
268 		}
269 
270 		/*
271 		 * If TID is 0, then either the dying owner has not
272 		 * yet executed exit_pi_state_list() or some waiter
273 		 * acquired the rtmutex in the pi state, but did not
274 		 * yet fixup the TID in user space.
275 		 *
276 		 * Take a ref on the state and return success. [6]
277 		 */
278 		if (!pid)
279 			goto out_attach;
280 	} else {
281 		/*
282 		 * If the owner died bit is not set, then the pi_state
283 		 * must have an owner. [7]
284 		 */
285 		if (!pi_state->owner)
286 			goto out_einval;
287 	}
288 
289 	/*
290 	 * Bail out if user space manipulated the futex value. If pi
291 	 * state exists then the owner TID must be the same as the
292 	 * user space TID. [9/10]
293 	 */
294 	if (pid != task_pid_vnr(pi_state->owner))
295 		goto out_einval;
296 
297 out_attach:
298 	get_pi_state(pi_state);
299 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
300 	*ps = pi_state;
301 	return 0;
302 
303 out_einval:
304 	ret = -EINVAL;
305 	goto out_error;
306 
307 out_eagain:
308 	ret = -EAGAIN;
309 	goto out_error;
310 
311 out_efault:
312 	ret = -EFAULT;
313 	goto out_error;
314 
315 out_error:
316 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
317 	return ret;
318 }
319 
320 static int handle_exit_race(u32 __user *uaddr, u32 uval,
321 			    struct task_struct *tsk)
322 {
323 	u32 uval2;
324 
325 	/*
326 	 * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
327 	 * caller that the alleged owner is busy.
328 	 */
329 	if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
330 		return -EBUSY;
331 
332 	/*
333 	 * Reread the user space value to handle the following situation:
334 	 *
335 	 * CPU0				CPU1
336 	 *
337 	 * sys_exit()			sys_futex()
338 	 *  do_exit()			 futex_lock_pi()
339 	 *                                futex_lock_pi_atomic()
340 	 *   exit_signals(tsk)		    No waiters:
341 	 *    tsk->flags |= PF_EXITING;	    *uaddr == 0x00000PID
342 	 *  mm_release(tsk)		    Set waiter bit
343 	 *   exit_robust_list(tsk) {	    *uaddr = 0x80000PID;
344 	 *      Set owner died		    attach_to_pi_owner() {
345 	 *    *uaddr = 0xC0000000;	     tsk = get_task(PID);
346 	 *   }				     if (!tsk->flags & PF_EXITING) {
347 	 *  ...				       attach();
348 	 *  tsk->futex_state =               } else {
349 	 *	FUTEX_STATE_DEAD;              if (tsk->futex_state !=
350 	 *					  FUTEX_STATE_DEAD)
351 	 *				         return -EAGAIN;
352 	 *				       return -ESRCH; <--- FAIL
353 	 *				     }
354 	 *
355 	 * Returning ESRCH unconditionally is wrong here because the
356 	 * user space value has been changed by the exiting task.
357 	 *
358 	 * The same logic applies to the case where the exiting task is
359 	 * already gone.
360 	 */
361 	if (futex_get_value_locked(&uval2, uaddr))
362 		return -EFAULT;
363 
364 	/* If the user space value has changed, try again. */
365 	if (uval2 != uval)
366 		return -EAGAIN;
367 
368 	/*
369 	 * The exiting task did not have a robust list, the robust list was
370 	 * corrupted or the user space value in *uaddr is simply bogus.
371 	 * Give up and tell user space.
372 	 */
373 	return -ESRCH;
374 }
375 
376 static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
377 				 struct futex_pi_state **ps)
378 {
379 	/*
380 	 * No existing pi state. First waiter. [2]
381 	 *
382 	 * This creates pi_state, we have hb->lock held, this means nothing can
383 	 * observe this state, wait_lock is irrelevant.
384 	 */
385 	struct futex_pi_state *pi_state = alloc_pi_state();
386 
387 	/*
388 	 * Initialize the pi_mutex in locked state and make @p
389 	 * the owner of it:
390 	 */
391 	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
392 
393 	/* Store the key for possible exit cleanups: */
394 	pi_state->key = *key;
395 
396 	WARN_ON(!list_empty(&pi_state->list));
397 	list_add(&pi_state->list, &p->pi_state_list);
398 	/*
399 	 * Assignment without holding pi_state->pi_mutex.wait_lock is safe
400 	 * because there is no concurrency as the object is not published yet.
401 	 */
402 	pi_state->owner = p;
403 
404 	*ps = pi_state;
405 }
406 /*
407  * Lookup the task for the TID provided from user space and attach to
408  * it after doing proper sanity checks.
409  */
410 static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
411 			      struct futex_pi_state **ps,
412 			      struct task_struct **exiting)
413 {
414 	pid_t pid = uval & FUTEX_TID_MASK;
415 	struct task_struct *p;
416 
417 	/*
418 	 * We are the first waiter - try to look up the real owner and attach
419 	 * the new pi_state to it, but bail out when TID = 0 [1]
420 	 *
421 	 * The !pid check is paranoid. None of the call sites should end up
422 	 * with pid == 0, but better safe than sorry. Let the caller retry
423 	 */
424 	if (!pid)
425 		return -EAGAIN;
426 	p = find_get_task_by_vpid(pid);
427 	if (!p)
428 		return handle_exit_race(uaddr, uval, NULL);
429 
430 	if (unlikely(p->flags & PF_KTHREAD)) {
431 		put_task_struct(p);
432 		return -EPERM;
433 	}
434 
435 	/*
436 	 * We need to look at the task state to figure out, whether the
437 	 * task is exiting. To protect against the change of the task state
438 	 * in futex_exit_release(), we do this protected by p->pi_lock:
439 	 */
440 	raw_spin_lock_irq(&p->pi_lock);
441 	if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
442 		/*
443 		 * The task is on the way out. When the futex state is
444 		 * FUTEX_STATE_DEAD, we know that the task has finished
445 		 * the cleanup:
446 		 */
447 		int ret = handle_exit_race(uaddr, uval, p);
448 
449 		raw_spin_unlock_irq(&p->pi_lock);
450 		/*
451 		 * If the owner task is between FUTEX_STATE_EXITING and
452 		 * FUTEX_STATE_DEAD then store the task pointer and keep
453 		 * the reference on the task struct. The calling code will
454 		 * drop all locks, wait for the task to reach
455 		 * FUTEX_STATE_DEAD and then drop the refcount. This is
456 		 * required to prevent a live lock when the current task
457 		 * preempted the exiting task between the two states.
458 		 */
459 		if (ret == -EBUSY)
460 			*exiting = p;
461 		else
462 			put_task_struct(p);
463 		return ret;
464 	}
465 
466 	__attach_to_pi_owner(p, key, ps);
467 	raw_spin_unlock_irq(&p->pi_lock);
468 
469 	put_task_struct(p);
470 
471 	return 0;
472 }
473 
474 static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
475 {
476 	int err;
477 	u32 curval;
478 
479 	if (unlikely(should_fail_futex(true)))
480 		return -EFAULT;
481 
482 	err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
483 	if (unlikely(err))
484 		return err;
485 
486 	/* If user space value changed, let the caller retry */
487 	return curval != uval ? -EAGAIN : 0;
488 }
489 
490 /**
491  * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
492  * @uaddr:		the pi futex user address
493  * @hb:			the pi futex hash bucket
494  * @key:		the futex key associated with uaddr and hb
495  * @ps:			the pi_state pointer where we store the result of the
496  *			lookup
497  * @task:		the task to perform the atomic lock work for.  This will
498  *			be "current" except in the case of requeue pi.
499  * @exiting:		Pointer to store the task pointer of the owner task
500  *			which is in the middle of exiting
501  * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
502  *
503  * Return:
504  *  -  0 - ready to wait;
505  *  -  1 - acquired the lock;
506  *  - <0 - error
507  *
508  * The hb->lock must be held by the caller.
509  *
510  * @exiting is only set when the return value is -EBUSY. If so, this holds
511  * a refcount on the exiting task on return and the caller needs to drop it
512  * after waiting for the exit to complete.
513  */
514 int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
515 			 union futex_key *key,
516 			 struct futex_pi_state **ps,
517 			 struct task_struct *task,
518 			 struct task_struct **exiting,
519 			 int set_waiters)
520 {
521 	u32 uval, newval, vpid = task_pid_vnr(task);
522 	struct futex_q *top_waiter;
523 	int ret;
524 
525 	/*
526 	 * Read the user space value first so we can validate a few
527 	 * things before proceeding further.
528 	 */
529 	if (futex_get_value_locked(&uval, uaddr))
530 		return -EFAULT;
531 
532 	if (unlikely(should_fail_futex(true)))
533 		return -EFAULT;
534 
535 	/*
536 	 * Detect deadlocks.
537 	 */
538 	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
539 		return -EDEADLK;
540 
541 	if ((unlikely(should_fail_futex(true))))
542 		return -EDEADLK;
543 
544 	/*
545 	 * Lookup existing state first. If it exists, try to attach to
546 	 * its pi_state.
547 	 */
548 	top_waiter = futex_top_waiter(hb, key);
549 	if (top_waiter)
550 		return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
551 
552 	/*
553 	 * No waiter and user TID is 0. We are here because the
554 	 * waiters or the owner died bit is set or called from
555 	 * requeue_cmp_pi or for whatever reason something took the
556 	 * syscall.
557 	 */
558 	if (!(uval & FUTEX_TID_MASK)) {
559 		/*
560 		 * We take over the futex. No other waiters and the user space
561 		 * TID is 0. We preserve the owner died bit.
562 		 */
563 		newval = uval & FUTEX_OWNER_DIED;
564 		newval |= vpid;
565 
566 		/* The futex requeue_pi code can enforce the waiters bit */
567 		if (set_waiters)
568 			newval |= FUTEX_WAITERS;
569 
570 		ret = lock_pi_update_atomic(uaddr, uval, newval);
571 		if (ret)
572 			return ret;
573 
574 		/*
575 		 * If the waiter bit was requested the caller also needs PI
576 		 * state attached to the new owner of the user space futex.
577 		 *
578 		 * @task is guaranteed to be alive and it cannot be exiting
579 		 * because it is either sleeping or waiting in
580 		 * futex_requeue_pi_wakeup_sync().
581 		 *
582 		 * No need to do the full attach_to_pi_owner() exercise
583 		 * because @task is known and valid.
584 		 */
585 		if (set_waiters) {
586 			raw_spin_lock_irq(&task->pi_lock);
587 			__attach_to_pi_owner(task, key, ps);
588 			raw_spin_unlock_irq(&task->pi_lock);
589 		}
590 		return 1;
591 	}
592 
593 	/*
594 	 * First waiter. Set the waiters bit before attaching ourself to
595 	 * the owner. If owner tries to unlock, it will be forced into
596 	 * the kernel and blocked on hb->lock.
597 	 */
598 	newval = uval | FUTEX_WAITERS;
599 	ret = lock_pi_update_atomic(uaddr, uval, newval);
600 	if (ret)
601 		return ret;
602 	/*
603 	 * If the update of the user space value succeeded, we try to
604 	 * attach to the owner. If that fails, no harm done, we only
605 	 * set the FUTEX_WAITERS bit in the user space variable.
606 	 */
607 	return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
608 }
609 
610 /*
611  * Caller must hold a reference on @pi_state.
612  */
613 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
614 {
615 	struct rt_mutex_waiter *top_waiter;
616 	struct task_struct *new_owner;
617 	bool postunlock = false;
618 	DEFINE_RT_WAKE_Q(wqh);
619 	u32 curval, newval;
620 	int ret = 0;
621 
622 	top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
623 	if (WARN_ON_ONCE(!top_waiter)) {
624 		/*
625 		 * As per the comment in futex_unlock_pi() this should not happen.
626 		 *
627 		 * When this happens, give up our locks and try again, giving
628 		 * the futex_lock_pi() instance time to complete, either by
629 		 * waiting on the rtmutex or removing itself from the futex
630 		 * queue.
631 		 */
632 		ret = -EAGAIN;
633 		goto out_unlock;
634 	}
635 
636 	new_owner = top_waiter->task;
637 
638 	/*
639 	 * We pass it to the next owner. The WAITERS bit is always kept
640 	 * enabled while there is PI state around. We cleanup the owner
641 	 * died bit, because we are the owner.
642 	 */
643 	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
644 
645 	if (unlikely(should_fail_futex(true))) {
646 		ret = -EFAULT;
647 		goto out_unlock;
648 	}
649 
650 	ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
651 	if (!ret && (curval != uval)) {
652 		/*
653 		 * If a unconditional UNLOCK_PI operation (user space did not
654 		 * try the TID->0 transition) raced with a waiter setting the
655 		 * FUTEX_WAITERS flag between get_user() and locking the hash
656 		 * bucket lock, retry the operation.
657 		 */
658 		if ((FUTEX_TID_MASK & curval) == uval)
659 			ret = -EAGAIN;
660 		else
661 			ret = -EINVAL;
662 	}
663 
664 	if (!ret) {
665 		/*
666 		 * This is a point of no return; once we modified the uval
667 		 * there is no going back and subsequent operations must
668 		 * not fail.
669 		 */
670 		pi_state_update_owner(pi_state, new_owner);
671 		postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
672 	}
673 
674 out_unlock:
675 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
676 
677 	if (postunlock)
678 		rt_mutex_postunlock(&wqh);
679 
680 	return ret;
681 }
682 
683 static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
684 				  struct task_struct *argowner)
685 {
686 	struct futex_pi_state *pi_state = q->pi_state;
687 	struct task_struct *oldowner, *newowner;
688 	u32 uval, curval, newval, newtid;
689 	int err = 0;
690 
691 	oldowner = pi_state->owner;
692 
693 	/*
694 	 * We are here because either:
695 	 *
696 	 *  - we stole the lock and pi_state->owner needs updating to reflect
697 	 *    that (@argowner == current),
698 	 *
699 	 * or:
700 	 *
701 	 *  - someone stole our lock and we need to fix things to point to the
702 	 *    new owner (@argowner == NULL).
703 	 *
704 	 * Either way, we have to replace the TID in the user space variable.
705 	 * This must be atomic as we have to preserve the owner died bit here.
706 	 *
707 	 * Note: We write the user space value _before_ changing the pi_state
708 	 * because we can fault here. Imagine swapped out pages or a fork
709 	 * that marked all the anonymous memory readonly for cow.
710 	 *
711 	 * Modifying pi_state _before_ the user space value would leave the
712 	 * pi_state in an inconsistent state when we fault here, because we
713 	 * need to drop the locks to handle the fault. This might be observed
714 	 * in the PID checks when attaching to PI state .
715 	 */
716 retry:
717 	if (!argowner) {
718 		if (oldowner != current) {
719 			/*
720 			 * We raced against a concurrent self; things are
721 			 * already fixed up. Nothing to do.
722 			 */
723 			return 0;
724 		}
725 
726 		if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
727 			/* We got the lock. pi_state is correct. Tell caller. */
728 			return 1;
729 		}
730 
731 		/*
732 		 * The trylock just failed, so either there is an owner or
733 		 * there is a higher priority waiter than this one.
734 		 */
735 		newowner = rt_mutex_owner(&pi_state->pi_mutex);
736 		/*
737 		 * If the higher priority waiter has not yet taken over the
738 		 * rtmutex then newowner is NULL. We can't return here with
739 		 * that state because it's inconsistent vs. the user space
740 		 * state. So drop the locks and try again. It's a valid
741 		 * situation and not any different from the other retry
742 		 * conditions.
743 		 */
744 		if (unlikely(!newowner)) {
745 			err = -EAGAIN;
746 			goto handle_err;
747 		}
748 	} else {
749 		WARN_ON_ONCE(argowner != current);
750 		if (oldowner == current) {
751 			/*
752 			 * We raced against a concurrent self; things are
753 			 * already fixed up. Nothing to do.
754 			 */
755 			return 1;
756 		}
757 		newowner = argowner;
758 	}
759 
760 	newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
761 	/* Owner died? */
762 	if (!pi_state->owner)
763 		newtid |= FUTEX_OWNER_DIED;
764 
765 	err = futex_get_value_locked(&uval, uaddr);
766 	if (err)
767 		goto handle_err;
768 
769 	for (;;) {
770 		newval = (uval & FUTEX_OWNER_DIED) | newtid;
771 
772 		err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
773 		if (err)
774 			goto handle_err;
775 
776 		if (curval == uval)
777 			break;
778 		uval = curval;
779 	}
780 
781 	/*
782 	 * We fixed up user space. Now we need to fix the pi_state
783 	 * itself.
784 	 */
785 	pi_state_update_owner(pi_state, newowner);
786 
787 	return argowner == current;
788 
789 	/*
790 	 * In order to reschedule or handle a page fault, we need to drop the
791 	 * locks here. In the case of a fault, this gives the other task
792 	 * (either the highest priority waiter itself or the task which stole
793 	 * the rtmutex) the chance to try the fixup of the pi_state. So once we
794 	 * are back from handling the fault we need to check the pi_state after
795 	 * reacquiring the locks and before trying to do another fixup. When
796 	 * the fixup has been done already we simply return.
797 	 *
798 	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
799 	 * drop hb->lock since the caller owns the hb -> futex_q relation.
800 	 * Dropping the pi_mutex->wait_lock requires the state revalidate.
801 	 */
802 handle_err:
803 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
804 	spin_unlock(q->lock_ptr);
805 
806 	switch (err) {
807 	case -EFAULT:
808 		err = fault_in_user_writeable(uaddr);
809 		break;
810 
811 	case -EAGAIN:
812 		cond_resched();
813 		err = 0;
814 		break;
815 
816 	default:
817 		WARN_ON_ONCE(1);
818 		break;
819 	}
820 
821 	spin_lock(q->lock_ptr);
822 	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
823 
824 	/*
825 	 * Check if someone else fixed it for us:
826 	 */
827 	if (pi_state->owner != oldowner)
828 		return argowner == current;
829 
830 	/* Retry if err was -EAGAIN or the fault in succeeded */
831 	if (!err)
832 		goto retry;
833 
834 	/*
835 	 * fault_in_user_writeable() failed so user state is immutable. At
836 	 * best we can make the kernel state consistent but user state will
837 	 * be most likely hosed and any subsequent unlock operation will be
838 	 * rejected due to PI futex rule [10].
839 	 *
840 	 * Ensure that the rtmutex owner is also the pi_state owner despite
841 	 * the user space value claiming something different. There is no
842 	 * point in unlocking the rtmutex if current is the owner as it
843 	 * would need to wait until the next waiter has taken the rtmutex
844 	 * to guarantee consistent state. Keep it simple. Userspace asked
845 	 * for this wreckaged state.
846 	 *
847 	 * The rtmutex has an owner - either current or some other
848 	 * task. See the EAGAIN loop above.
849 	 */
850 	pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
851 
852 	return err;
853 }
854 
855 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
856 				struct task_struct *argowner)
857 {
858 	struct futex_pi_state *pi_state = q->pi_state;
859 	int ret;
860 
861 	lockdep_assert_held(q->lock_ptr);
862 
863 	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
864 	ret = __fixup_pi_state_owner(uaddr, q, argowner);
865 	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
866 	return ret;
867 }
868 
869 /**
870  * fixup_pi_owner() - Post lock pi_state and corner case management
871  * @uaddr:	user address of the futex
872  * @q:		futex_q (contains pi_state and access to the rt_mutex)
873  * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
874  *
875  * After attempting to lock an rt_mutex, this function is called to cleanup
876  * the pi_state owner as well as handle race conditions that may allow us to
877  * acquire the lock. Must be called with the hb lock held.
878  *
879  * Return:
880  *  -  1 - success, lock taken;
881  *  -  0 - success, lock not taken;
882  *  - <0 - on error (-EFAULT)
883  */
884 int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
885 {
886 	if (locked) {
887 		/*
888 		 * Got the lock. We might not be the anticipated owner if we
889 		 * did a lock-steal - fix up the PI-state in that case:
890 		 *
891 		 * Speculative pi_state->owner read (we don't hold wait_lock);
892 		 * since we own the lock pi_state->owner == current is the
893 		 * stable state, anything else needs more attention.
894 		 */
895 		if (q->pi_state->owner != current)
896 			return fixup_pi_state_owner(uaddr, q, current);
897 		return 1;
898 	}
899 
900 	/*
901 	 * If we didn't get the lock; check if anybody stole it from us. In
902 	 * that case, we need to fix up the uval to point to them instead of
903 	 * us, otherwise bad things happen. [10]
904 	 *
905 	 * Another speculative read; pi_state->owner == current is unstable
906 	 * but needs our attention.
907 	 */
908 	if (q->pi_state->owner == current)
909 		return fixup_pi_state_owner(uaddr, q, NULL);
910 
911 	/*
912 	 * Paranoia check. If we did not take the lock, then we should not be
913 	 * the owner of the rt_mutex. Warn and establish consistent state.
914 	 */
915 	if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
916 		return fixup_pi_state_owner(uaddr, q, current);
917 
918 	return 0;
919 }
920 
921 /*
922  * Userspace tried a 0 -> TID atomic transition of the futex value
923  * and failed. The kernel side here does the whole locking operation:
924  * if there are waiters then it will block as a consequence of relying
925  * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
926  * a 0 value of the futex too.).
927  *
928  * Also serves as futex trylock_pi()'ing, and due semantics.
929  */
930 int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
931 {
932 	struct hrtimer_sleeper timeout, *to;
933 	struct task_struct *exiting = NULL;
934 	struct rt_mutex_waiter rt_waiter;
935 	struct futex_hash_bucket *hb;
936 	struct futex_q q = futex_q_init;
937 	int res, ret;
938 
939 	if (!IS_ENABLED(CONFIG_FUTEX_PI))
940 		return -ENOSYS;
941 
942 	if (refill_pi_state_cache())
943 		return -ENOMEM;
944 
945 	to = futex_setup_timer(time, &timeout, flags, 0);
946 
947 retry:
948 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
949 	if (unlikely(ret != 0))
950 		goto out;
951 
952 retry_private:
953 	hb = futex_q_lock(&q);
954 
955 	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
956 				   &exiting, 0);
957 	if (unlikely(ret)) {
958 		/*
959 		 * Atomic work succeeded and we got the lock,
960 		 * or failed. Either way, we do _not_ block.
961 		 */
962 		switch (ret) {
963 		case 1:
964 			/* We got the lock. */
965 			ret = 0;
966 			goto out_unlock_put_key;
967 		case -EFAULT:
968 			goto uaddr_faulted;
969 		case -EBUSY:
970 		case -EAGAIN:
971 			/*
972 			 * Two reasons for this:
973 			 * - EBUSY: Task is exiting and we just wait for the
974 			 *   exit to complete.
975 			 * - EAGAIN: The user space value changed.
976 			 */
977 			futex_q_unlock(hb);
978 			/*
979 			 * Handle the case where the owner is in the middle of
980 			 * exiting. Wait for the exit to complete otherwise
981 			 * this task might loop forever, aka. live lock.
982 			 */
983 			wait_for_owner_exiting(ret, exiting);
984 			cond_resched();
985 			goto retry;
986 		default:
987 			goto out_unlock_put_key;
988 		}
989 	}
990 
991 	WARN_ON(!q.pi_state);
992 
993 	/*
994 	 * Only actually queue now that the atomic ops are done:
995 	 */
996 	__futex_queue(&q, hb);
997 
998 	if (trylock) {
999 		ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
1000 		/* Fixup the trylock return value: */
1001 		ret = ret ? 0 : -EWOULDBLOCK;
1002 		goto no_block;
1003 	}
1004 
1005 	rt_mutex_init_waiter(&rt_waiter);
1006 
1007 	/*
1008 	 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
1009 	 * hold it while doing rt_mutex_start_proxy(), because then it will
1010 	 * include hb->lock in the blocking chain, even through we'll not in
1011 	 * fact hold it while blocking. This will lead it to report -EDEADLK
1012 	 * and BUG when futex_unlock_pi() interleaves with this.
1013 	 *
1014 	 * Therefore acquire wait_lock while holding hb->lock, but drop the
1015 	 * latter before calling __rt_mutex_start_proxy_lock(). This
1016 	 * interleaves with futex_unlock_pi() -- which does a similar lock
1017 	 * handoff -- such that the latter can observe the futex_q::pi_state
1018 	 * before __rt_mutex_start_proxy_lock() is done.
1019 	 */
1020 	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
1021 	spin_unlock(q.lock_ptr);
1022 	/*
1023 	 * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
1024 	 * such that futex_unlock_pi() is guaranteed to observe the waiter when
1025 	 * it sees the futex_q::pi_state.
1026 	 */
1027 	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
1028 	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
1029 
1030 	if (ret) {
1031 		if (ret == 1)
1032 			ret = 0;
1033 		goto cleanup;
1034 	}
1035 
1036 	if (unlikely(to))
1037 		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
1038 
1039 	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
1040 
1041 cleanup:
1042 	spin_lock(q.lock_ptr);
1043 	/*
1044 	 * If we failed to acquire the lock (deadlock/signal/timeout), we must
1045 	 * first acquire the hb->lock before removing the lock from the
1046 	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
1047 	 * lists consistent.
1048 	 *
1049 	 * In particular; it is important that futex_unlock_pi() can not
1050 	 * observe this inconsistency.
1051 	 */
1052 	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
1053 		ret = 0;
1054 
1055 no_block:
1056 	/*
1057 	 * Fixup the pi_state owner and possibly acquire the lock if we
1058 	 * haven't already.
1059 	 */
1060 	res = fixup_pi_owner(uaddr, &q, !ret);
1061 	/*
1062 	 * If fixup_pi_owner() returned an error, propagate that.  If it acquired
1063 	 * the lock, clear our -ETIMEDOUT or -EINTR.
1064 	 */
1065 	if (res)
1066 		ret = (res < 0) ? res : 0;
1067 
1068 	futex_unqueue_pi(&q);
1069 	spin_unlock(q.lock_ptr);
1070 	goto out;
1071 
1072 out_unlock_put_key:
1073 	futex_q_unlock(hb);
1074 
1075 out:
1076 	if (to) {
1077 		hrtimer_cancel(&to->timer);
1078 		destroy_hrtimer_on_stack(&to->timer);
1079 	}
1080 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
1081 
1082 uaddr_faulted:
1083 	futex_q_unlock(hb);
1084 
1085 	ret = fault_in_user_writeable(uaddr);
1086 	if (ret)
1087 		goto out;
1088 
1089 	if (!(flags & FLAGS_SHARED))
1090 		goto retry_private;
1091 
1092 	goto retry;
1093 }
1094 
1095 /*
1096  * Userspace attempted a TID -> 0 atomic transition, and failed.
1097  * This is the in-kernel slowpath: we look up the PI state (if any),
1098  * and do the rt-mutex unlock.
      *
      * @uaddr: user space address of the futex word
      * @flags: futex flags; only FLAGS_SHARED is looked at here, to build the key
      *
      * Return:
      *  - 0 when wake_futex_pi() succeeded, or when no waiter was queued and
      *    the futex word was changed from the value read to 0;
      *  - -ENOSYS when CONFIG_FUTEX_PI is not enabled;
      *  - -EFAULT when the initial get_user() of the futex word fails;
      *  - -EPERM when the TID stored in the futex word is not the caller's;
      *  - -EINVAL when the top waiter has no pi_state or the pi_state is not
      *    owned by current;
      *  - -EAGAIN when no waiter was queued and the futex word no longer held
      *    the value that was read;
      *  - any other error from get_futex_key(), wake_futex_pi(),
      *    futex_cmpxchg_value_locked() or fault_in_user_writeable() as is.
      *
      * -EFAULT and -EAGAIN coming back from wake_futex_pi() or
      * futex_cmpxchg_value_locked() are not returned directly; they restart the
      * operation through the pi_faulted and pi_retry labels.
1099  */
1100 int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
1101 {
1102 	u32 curval, uval, vpid = task_pid_vnr(current);
1103 	union futex_key key = FUTEX_KEY_INIT;
1104 	struct futex_hash_bucket *hb;
1105 	struct futex_q *top_waiter;
1106 	int ret;
1107 
1108 	if (!IS_ENABLED(CONFIG_FUTEX_PI))
1109 		return -ENOSYS;
1110 
1111 retry:
1112 	if (get_user(uval, uaddr))
1113 		return -EFAULT;
1114 	/*
1115 	 * We release only a lock we actually own:
1116 	 */
1117 	if ((uval & FUTEX_TID_MASK) != vpid)
1118 		return -EPERM;
1119 
1120 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
1121 	if (ret)
1122 		return ret;
1123 
1124 	hb = futex_hash(&key);
1125 	spin_lock(&hb->lock);
1126 
1127 	/*
1128 	 * Check waiters first. We do not trust user space values at
1129 	 * all and we at least want to know if user space fiddled
1130 	 * with the futex value instead of blindly unlocking.
1131 	 */
1132 	top_waiter = futex_top_waiter(hb, &key);
1133 	if (top_waiter) {
1134 		struct futex_pi_state *pi_state = top_waiter->pi_state;
1135 
     		/* Both consistency checks below bail out with -EINVAL. */
1136 		ret = -EINVAL;
1137 		if (!pi_state)
1138 			goto out_unlock;
1139 
1140 		/*
1141 		 * If current does not own the pi_state then the futex is
1142 		 * inconsistent and user space fiddled with the futex value.
1143 		 */
1144 		if (pi_state->owner != current)
1145 			goto out_unlock;
1146 
     		/*
     		 * Take a reference before hb->lock is released below; it is
     		 * dropped again by the put_pi_state() after wake_futex_pi().
     		 */
1147 		get_pi_state(pi_state);
1148 		/*
1149 		 * By taking wait_lock while still holding hb->lock, we ensure
1150 		 * there is no point where we hold neither; and therefore
1151 		 * wake_futex_pi() must observe a state consistent with what we
1152 		 * observed.
1153 		 *
1154 		 * In particular; this forces __rt_mutex_start_proxy_lock() to
1155 		 * complete such that we're guaranteed to observe the
1156 		 * rt_waiter. Also see the WARN in wake_futex_pi().
1157 		 */
1158 		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
1159 		spin_unlock(&hb->lock);
1160 
1161 		/* drops pi_state->pi_mutex.wait_lock */
1162 		ret = wake_futex_pi(uaddr, uval, pi_state);
1163 
1164 		put_pi_state(pi_state);
1165 
1166 		/*
1167 		 * Success, we're done! No tricky corner cases.
1168 		 */
1169 		if (!ret)
1170 			return ret;
1171 		/*
1172 		 * The atomic access to the futex value generated a
1173 		 * pagefault, so retry the user-access and the wakeup:
1174 		 */
1175 		if (ret == -EFAULT)
1176 			goto pi_faulted;
1177 		/*
1178 		 * An unconditional UNLOCK_PI op raced against a waiter
1179 		 * setting the FUTEX_WAITERS bit. Try again.
1180 		 */
1181 		if (ret == -EAGAIN)
1182 			goto pi_retry;
1183 		/*
1184 		 * wake_futex_pi has detected invalid state. Tell user
1185 		 * space.
1186 		 */
1187 		return ret;
1188 	}
1189 
1190 	/*
1191 	 * We have no kernel internal state, i.e. no waiters in the
1192 	 * kernel. Waiters which are about to queue themselves are stuck
1193 	 * on hb->lock. So we can safely ignore them. We preserve neither
1194 	 * the WAITERS bit nor the OWNER_DIED one. We are the
1195 	 * owner.
1196 	 */
1197 	if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
     		/*
     		 * hb->lock is released before dispatching on the error, so
     		 * every branch below leaves without the lock held.
     		 */
1198 		spin_unlock(&hb->lock);
1199 		switch (ret) {
1200 		case -EFAULT:
1201 			goto pi_faulted;
1202 
1203 		case -EAGAIN:
1204 			goto pi_retry;
1205 
1206 		default:
1207 			WARN_ON_ONCE(1);
1208 			return ret;
1209 		}
1210 	}
1211 
1212 	/*
1213 	 * If uval has changed, let user space handle it.
1214 	 */
1215 	ret = (curval == uval) ? 0 : -EAGAIN;
1216 
     	/* Common exit for the paths that still hold hb->lock. */
1217 out_unlock:
1218 	spin_unlock(&hb->lock);
1219 	return ret;
1220 
     	/* Reached with hb->lock already released: reschedule point, then redo. */
1221 pi_retry:
1222 	cond_resched();
1223 	goto retry;
1224 
     	/*
     	 * Reached with hb->lock already released. Restart from the top when
     	 * fault_in_user_writeable() returns 0, otherwise report its error.
     	 */
1225 pi_faulted:
1226 
1227 	ret = fault_in_user_writeable(uaddr);
1228 	if (!ret)
1229 		goto retry;
1230 
1231 	return ret;
1232 }
1233 
1234