xref: /openbmc/linux/kernel/locking/rtmutex.c (revision 829c6524)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * RT-Mutexes: simple blocking mutual exclusion locks with PI support
4  *
5  * started by Ingo Molnar and Thomas Gleixner.
6  *
7  *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
8  *  Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
9  *  Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
10  *  Copyright (C) 2006 Esben Nielsen
11  * Adaptive Spinlocks:
12  *  Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich,
13  *				     and Peter Morreale,
14  * Adaptive Spinlocks simplification:
15  *  Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com>
16  *
17  *  See Documentation/locking/rt-mutex-design.rst for details.
18  */
19 #include <linux/sched.h>
20 #include <linux/sched/debug.h>
21 #include <linux/sched/deadline.h>
22 #include <linux/sched/signal.h>
23 #include <linux/sched/rt.h>
24 #include <linux/sched/wake_q.h>
25 #include <linux/ww_mutex.h>
26 
27 #include <trace/events/lock.h>
28 
29 #include "rtmutex_common.h"
30 
31 #ifndef WW_RT
32 # define build_ww_mutex()	(false)
33 # define ww_container_of(rtm)	NULL
34 
35 static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter,
36 					struct rt_mutex *lock,
37 					struct ww_acquire_ctx *ww_ctx)
38 {
39 	return 0;
40 }
41 
42 static inline void __ww_mutex_check_waiters(struct rt_mutex *lock,
43 					    struct ww_acquire_ctx *ww_ctx)
44 {
45 }
46 
47 static inline void ww_mutex_lock_acquired(struct ww_mutex *lock,
48 					  struct ww_acquire_ctx *ww_ctx)
49 {
50 }
51 
52 static inline int __ww_mutex_check_kill(struct rt_mutex *lock,
53 					struct rt_mutex_waiter *waiter,
54 					struct ww_acquire_ctx *ww_ctx)
55 {
56 	return 0;
57 }
58 
59 #else
60 # define build_ww_mutex()	(true)
61 # define ww_container_of(rtm)	container_of(rtm, struct ww_mutex, base)
62 # include "ww_mutex.h"
63 #endif
64 
65 /*
66  * lock->owner state tracking:
67  *
68  * lock->owner holds the task_struct pointer of the owner. Bit 0
69  * is used to keep track of the "lock has waiters" state.
70  *
71  * owner	bit0
72  * NULL		0	lock is free (fast acquire possible)
73  * NULL		1	lock is free and has waiters and the top waiter
74  *				is going to take the lock*
75  * taskpointer	0	lock is held (fast release possible)
76  * taskpointer	1	lock is held and has waiters**
77  *
78  * The fast atomic compare exchange based acquire and release is only
79  * possible when bit 0 of lock->owner is 0.
80  *
81  * (*) It also can be a transitional state when grabbing the lock
82  * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
83  * we need to set the bit0 before looking at the lock, and the owner may be
84  * NULL in this small time, hence this can be a transitional state.
85  *
86  * (**) There is a small time when bit 0 is set but there are no
87  * waiters. This can happen when grabbing the lock in the slow path.
88  * To prevent a cmpxchg of the owner releasing the lock, we need to
89  * set this bit before looking at the lock.
90  */
91 
92 static __always_inline struct task_struct *
93 rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner)
94 {
95 	unsigned long val = (unsigned long)owner;
96 
97 	if (rt_mutex_has_waiters(lock))
98 		val |= RT_MUTEX_HAS_WAITERS;
99 
100 	return (struct task_struct *)val;
101 }
102 
103 static __always_inline void
104 rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner)
105 {
106 	/*
107 	 * lock->wait_lock is held but explicit acquire semantics are needed
108 	 * for a new lock owner so WRITE_ONCE is insufficient.
109 	 */
110 	xchg_acquire(&lock->owner, rt_mutex_owner_encode(lock, owner));
111 }
112 
113 static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock)
114 {
115 	/* lock->wait_lock is held so the unlock provides release semantics. */
116 	WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL));
117 }
118 
119 static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock)
120 {
121 	lock->owner = (struct task_struct *)
122 			((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
123 }
124 
125 static __always_inline void
126 fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock)
127 {
128 	unsigned long owner, *p = (unsigned long *) &lock->owner;
129 
130 	if (rt_mutex_has_waiters(lock))
131 		return;
132 
133 	/*
134 	 * The rbtree has no waiters enqueued, now make sure that the
135 	 * lock->owner still has the waiters bit set, otherwise the
136 	 * following can happen:
137 	 *
138 	 * CPU 0	CPU 1		CPU2
139 	 * l->owner=T1
140 	 *		rt_mutex_lock(l)
141 	 *		lock(l->lock)
142 	 *		l->owner = T1 | HAS_WAITERS;
143 	 *		enqueue(T2)
144 	 *		boost()
145 	 *		  unlock(l->lock)
146 	 *		block()
147 	 *
148 	 *				rt_mutex_lock(l)
149 	 *				lock(l->lock)
150 	 *				l->owner = T1 | HAS_WAITERS;
151 	 *				enqueue(T3)
152 	 *				boost()
153 	 *				  unlock(l->lock)
154 	 *				block()
155 	 *		signal(->T2)	signal(->T3)
156 	 *		lock(l->lock)
157 	 *		dequeue(T2)
158 	 *		deboost()
159 	 *		  unlock(l->lock)
160 	 *				lock(l->lock)
161 	 *				dequeue(T3)
162 	 *				 ==> wait list is empty
163 	 *				deboost()
164 	 *				 unlock(l->lock)
165 	 *		lock(l->lock)
166 	 *		fixup_rt_mutex_waiters()
167 	 *		  if (wait_list_empty(l) {
168 	 *		    l->owner = owner
169 	 *		    owner = l->owner & ~HAS_WAITERS;
170 	 *		      ==> l->owner = T1
171 	 *		  }
172 	 *				lock(l->lock)
173 	 * rt_mutex_unlock(l)		fixup_rt_mutex_waiters()
174 	 *				  if (wait_list_empty(l) {
175 	 *				    owner = l->owner & ~HAS_WAITERS;
176 	 * cmpxchg(l->owner, T1, NULL)
177 	 *  ===> Success (l->owner = NULL)
178 	 *
179 	 *				    l->owner = owner
180 	 *				      ==> l->owner = T1
181 	 *				  }
182 	 *
183 	 * With the check for the waiter bit in place T3 on CPU2 will not
184 	 * overwrite. All tasks fiddling with the waiters bit are
185 	 * serialized by l->lock, so nothing else can modify the waiters
186 	 * bit. If the bit is set then nothing can change l->owner either
187 	 * so the simple RMW is safe. The cmpxchg() will simply fail if it
188 	 * happens in the middle of the RMW because the waiters bit is
189 	 * still set.
190 	 */
191 	owner = READ_ONCE(*p);
192 	if (owner & RT_MUTEX_HAS_WAITERS) {
193 		/*
194 		 * See rt_mutex_set_owner() and rt_mutex_clear_owner() on
195 		 * why xchg_acquire() is used for updating owner for
196 		 * locking and WRITE_ONCE() for unlocking.
197 		 *
198 		 * WRITE_ONCE() would work for the acquire case too, but
199 		 * in case that the lock acquisition failed it might
200 		 * force other lockers into the slow path unnecessarily.
201 		 */
202 		if (acquire_lock)
203 			xchg_acquire(p, owner & ~RT_MUTEX_HAS_WAITERS);
204 		else
205 			WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS);
206 	}
207 }
208 
209 /*
210  * We can speed up the acquire/release, if there's no debugging state to be
211  * set up.
212  */
213 #ifndef CONFIG_DEBUG_RT_MUTEXES
214 static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
215 						     struct task_struct *old,
216 						     struct task_struct *new)
217 {
218 	return try_cmpxchg_acquire(&lock->owner, &old, new);
219 }
220 
221 static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
222 						     struct task_struct *old,
223 						     struct task_struct *new)
224 {
225 	return try_cmpxchg_release(&lock->owner, &old, new);
226 }
227 
228 /*
229  * Callers must hold the ->wait_lock -- which is the whole purpose as we force
230  * all future threads that attempt to [Rmw] the lock to the slowpath. As such
231  * relaxed semantics suffice.
232  */
233 static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
234 {
235 	unsigned long owner, *p = (unsigned long *) &lock->owner;
236 
237 	do {
238 		owner = *p;
239 	} while (cmpxchg_relaxed(p, owner,
240 				 owner | RT_MUTEX_HAS_WAITERS) != owner);
241 
242 	/*
243 	 * The cmpxchg loop above is relaxed to avoid back-to-back ACQUIRE
244 	 * operations in the event of contention. Ensure the successful
245 	 * cmpxchg is visible.
246 	 */
247 	smp_mb__after_atomic();
248 }
249 
250 /*
251  * Safe fastpath aware unlock:
252  * 1) Clear the waiters bit
253  * 2) Drop lock->wait_lock
254  * 3) Try to unlock the lock with cmpxchg
255  */
256 static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
257 						 unsigned long flags)
258 	__releases(lock->wait_lock)
259 {
260 	struct task_struct *owner = rt_mutex_owner(lock);
261 
262 	clear_rt_mutex_waiters(lock);
263 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
264 	/*
265 	 * If a new waiter comes in between the unlock and the cmpxchg
266 	 * we have two situations:
267 	 *
268 	 * unlock(wait_lock);
269 	 *					lock(wait_lock);
270 	 * cmpxchg(p, owner, 0) == owner
271 	 *					mark_rt_mutex_waiters(lock);
272 	 *					acquire(lock);
273 	 * or:
274 	 *
275 	 * unlock(wait_lock);
276 	 *					lock(wait_lock);
277 	 *					mark_rt_mutex_waiters(lock);
278 	 *
279 	 * cmpxchg(p, owner, 0) != owner
280 	 *					enqueue_waiter();
281 	 *					unlock(wait_lock);
282 	 * lock(wait_lock);
283 	 * wake waiter();
284 	 * unlock(wait_lock);
285 	 *					lock(wait_lock);
286 	 *					acquire(lock);
287 	 */
288 	return rt_mutex_cmpxchg_release(lock, owner, NULL);
289 }
290 
291 #else
292 static __always_inline bool rt_mutex_cmpxchg_acquire(struct rt_mutex_base *lock,
293 						     struct task_struct *old,
294 						     struct task_struct *new)
295 {
296 	return false;
297 
298 }
299 
300 static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock,
301 						     struct task_struct *old,
302 						     struct task_struct *new)
303 {
304 	return false;
305 }
306 
307 static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock)
308 {
309 	lock->owner = (struct task_struct *)
310 			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
311 }
312 
313 /*
314  * Simple slow path only version: lock->owner is protected by lock->wait_lock.
315  */
316 static __always_inline bool unlock_rt_mutex_safe(struct rt_mutex_base *lock,
317 						 unsigned long flags)
318 	__releases(lock->wait_lock)
319 {
320 	lock->owner = NULL;
321 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
322 	return true;
323 }
324 #endif
325 
326 static __always_inline int __waiter_prio(struct task_struct *task)
327 {
328 	int prio = task->prio;
329 
330 	if (!rt_prio(prio))
331 		return DEFAULT_PRIO;
332 
333 	return prio;
334 }
335 
336 /*
337  * Update the waiter->tree copy of the sort keys.
338  */
339 static __always_inline void
340 waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
341 {
342 	lockdep_assert_held(&waiter->lock->wait_lock);
343 	lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry));
344 
345 	waiter->tree.prio = __waiter_prio(task);
346 	waiter->tree.deadline = task->dl.deadline;
347 }
348 
349 /*
350  * Update the waiter->pi_tree copy of the sort keys (from the tree copy).
351  */
352 static __always_inline void
353 waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task)
354 {
355 	lockdep_assert_held(&waiter->lock->wait_lock);
356 	lockdep_assert_held(&task->pi_lock);
357 	lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry));
358 
359 	waiter->pi_tree.prio = waiter->tree.prio;
360 	waiter->pi_tree.deadline = waiter->tree.deadline;
361 }
362 
363 /*
364  * Only use with rt_waiter_node_{less,equal}()
365  */
366 #define task_to_waiter_node(p)	\
367 	&(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline }
368 #define task_to_waiter(p)	\
369 	&(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) }
370 
371 static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left,
372 					       struct rt_waiter_node *right)
373 {
374 	if (left->prio < right->prio)
375 		return 1;
376 
377 	/*
378 	 * If both waiters have dl_prio(), we check the deadlines of the
379 	 * associated tasks.
380 	 * If left waiter has a dl_prio(), and we didn't return 1 above,
381 	 * then right waiter has a dl_prio() too.
382 	 */
383 	if (dl_prio(left->prio))
384 		return dl_time_before(left->deadline, right->deadline);
385 
386 	return 0;
387 }
388 
389 static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left,
390 						 struct rt_waiter_node *right)
391 {
392 	if (left->prio != right->prio)
393 		return 0;
394 
395 	/*
396 	 * If both waiters have dl_prio(), we check the deadlines of the
397 	 * associated tasks.
398 	 * If left waiter has a dl_prio(), and we didn't return 0 above,
399 	 * then right waiter has a dl_prio() too.
400 	 */
401 	if (dl_prio(left->prio))
402 		return left->deadline == right->deadline;
403 
404 	return 1;
405 }
406 
407 static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter,
408 				  struct rt_mutex_waiter *top_waiter)
409 {
410 	if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree))
411 		return true;
412 
413 #ifdef RT_MUTEX_BUILD_SPINLOCKS
414 	/*
415 	 * Note that RT tasks are excluded from same priority (lateral)
416 	 * steals to prevent the introduction of an unbounded latency.
417 	 */
418 	if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio))
419 		return false;
420 
421 	return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree);
422 #else
423 	return false;
424 #endif
425 }
426 
427 #define __node_2_waiter(node) \
428 	rb_entry((node), struct rt_mutex_waiter, tree.entry)
429 
430 static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b)
431 {
432 	struct rt_mutex_waiter *aw = __node_2_waiter(a);
433 	struct rt_mutex_waiter *bw = __node_2_waiter(b);
434 
435 	if (rt_waiter_node_less(&aw->tree, &bw->tree))
436 		return 1;
437 
438 	if (!build_ww_mutex())
439 		return 0;
440 
441 	if (rt_waiter_node_less(&bw->tree, &aw->tree))
442 		return 0;
443 
444 	/* NOTE: relies on waiter->ww_ctx being set before insertion */
445 	if (aw->ww_ctx) {
446 		if (!bw->ww_ctx)
447 			return 1;
448 
449 		return (signed long)(aw->ww_ctx->stamp -
450 				     bw->ww_ctx->stamp) < 0;
451 	}
452 
453 	return 0;
454 }
455 
456 static __always_inline void
457 rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
458 {
459 	lockdep_assert_held(&lock->wait_lock);
460 
461 	rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less);
462 }
463 
464 static __always_inline void
465 rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter)
466 {
467 	lockdep_assert_held(&lock->wait_lock);
468 
469 	if (RB_EMPTY_NODE(&waiter->tree.entry))
470 		return;
471 
472 	rb_erase_cached(&waiter->tree.entry, &lock->waiters);
473 	RB_CLEAR_NODE(&waiter->tree.entry);
474 }
475 
476 #define __node_2_rt_node(node) \
477 	rb_entry((node), struct rt_waiter_node, entry)
478 
479 static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b)
480 {
481 	return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b));
482 }
483 
484 static __always_inline void
485 rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
486 {
487 	lockdep_assert_held(&task->pi_lock);
488 
489 	rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less);
490 }
491 
492 static __always_inline void
493 rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
494 {
495 	lockdep_assert_held(&task->pi_lock);
496 
497 	if (RB_EMPTY_NODE(&waiter->pi_tree.entry))
498 		return;
499 
500 	rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters);
501 	RB_CLEAR_NODE(&waiter->pi_tree.entry);
502 }
503 
504 static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock,
505 						 struct task_struct *p)
506 {
507 	struct task_struct *pi_task = NULL;
508 
509 	lockdep_assert_held(&lock->wait_lock);
510 	lockdep_assert(rt_mutex_owner(lock) == p);
511 	lockdep_assert_held(&p->pi_lock);
512 
513 	if (task_has_pi_waiters(p))
514 		pi_task = task_top_pi_waiter(p)->task;
515 
516 	rt_mutex_setprio(p, pi_task);
517 }
518 
519 /* RT mutex specific wake_q wrappers */
520 static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh,
521 						     struct task_struct *task,
522 						     unsigned int wake_state)
523 {
524 	if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) {
525 		if (IS_ENABLED(CONFIG_PROVE_LOCKING))
526 			WARN_ON_ONCE(wqh->rtlock_task);
527 		get_task_struct(task);
528 		wqh->rtlock_task = task;
529 	} else {
530 		wake_q_add(&wqh->head, task);
531 	}
532 }
533 
534 static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
535 						struct rt_mutex_waiter *w)
536 {
537 	rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state);
538 }
539 
540 static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
541 {
542 	if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) {
543 		wake_up_state(wqh->rtlock_task, TASK_RTLOCK_WAIT);
544 		put_task_struct(wqh->rtlock_task);
545 		wqh->rtlock_task = NULL;
546 	}
547 
548 	if (!wake_q_empty(&wqh->head))
549 		wake_up_q(&wqh->head);
550 
551 	/* Pairs with preempt_disable() in mark_wakeup_next_waiter() */
552 	preempt_enable();
553 }
554 
555 /*
556  * Deadlock detection is conditional:
557  *
558  * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
559  * if the detect argument is == RT_MUTEX_FULL_CHAINWALK.
560  *
561  * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always
562  * conducted independent of the detect argument.
563  *
564  * If the waiter argument is NULL this indicates the deboost path and
565  * deadlock detection is disabled independent of the detect argument
566  * and the config settings.
567  */
568 static __always_inline bool
569 rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
570 			      enum rtmutex_chainwalk chwalk)
571 {
572 	if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES))
573 		return waiter != NULL;
574 	return chwalk == RT_MUTEX_FULL_CHAINWALK;
575 }
576 
577 static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_struct *p)
578 {
579 	return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
580 }
581 
582 /*
583  * Adjust the priority chain. Also used for deadlock detection.
584  * Decreases task's usage by one - may thus free the task.
585  *
586  * @task:	the task owning the mutex (owner) for which a chain walk is
587  *		probably needed
588  * @chwalk:	do we have to carry out deadlock detection?
589  * @orig_lock:	the mutex (can be NULL if we are walking the chain to recheck
590  *		things for a task that has just got its priority adjusted, and
591  *		is waiting on a mutex)
592  * @next_lock:	the mutex on which the owner of @orig_lock was blocked before
593  *		we dropped its pi_lock. Is never dereferenced, only used for
594  *		comparison to detect lock chain changes.
595  * @orig_waiter: rt_mutex_waiter struct for the task that has just donated
596  *		its priority to the mutex owner (can be NULL in the case
597  *		depicted above or if the top waiter is gone away and we are
598  *		actually deboosting the owner)
599  * @top_task:	the current top waiter
600  *
601  * Returns 0 or -EDEADLK.
602  *
603  * Chain walk basics and protection scope
604  *
605  * [R] refcount on task
606  * [Pn] task->pi_lock held
607  * [L] rtmutex->wait_lock held
608  *
609  * Normal locking order:
610  *
611  *   rtmutex->wait_lock
612  *     task->pi_lock
613  *
614  * Step	Description				Protected by
615  *	function arguments:
616  *	@task					[R]
617  *	@orig_lock if != NULL			@top_task is blocked on it
618  *	@next_lock				Unprotected. Cannot be
619  *						dereferenced. Only used for
620  *						comparison.
621  *	@orig_waiter if != NULL			@top_task is blocked on it
622  *	@top_task				current, or in case of proxy
623  *						locking protected by calling
624  *						code
625  *	again:
626  *	  loop_sanity_check();
627  *	retry:
628  * [1]	  lock(task->pi_lock);			[R] acquire [P1]
629  * [2]	  waiter = task->pi_blocked_on;		[P1]
630  * [3]	  check_exit_conditions_1();		[P1]
631  * [4]	  lock = waiter->lock;			[P1]
632  * [5]	  if (!try_lock(lock->wait_lock)) {	[P1] try to acquire [L]
633  *	    unlock(task->pi_lock);		release [P1]
634  *	    goto retry;
635  *	  }
636  * [6]	  check_exit_conditions_2();		[P1] + [L]
637  * [7]	  requeue_lock_waiter(lock, waiter);	[P1] + [L]
638  * [8]	  unlock(task->pi_lock);		release [P1]
639  *	  put_task_struct(task);		release [R]
640  * [9]	  check_exit_conditions_3();		[L]
641  * [10]	  task = owner(lock);			[L]
642  *	  get_task_struct(task);		[L] acquire [R]
643  *	  lock(task->pi_lock);			[L] acquire [P2]
644  * [11]	  requeue_pi_waiter(tsk, waiters(lock));[P2] + [L]
645  * [12]	  check_exit_conditions_4();		[P2] + [L]
646  * [13]	  unlock(task->pi_lock);		release [P2]
647  *	  unlock(lock->wait_lock);		release [L]
648  *	  goto again;
649  *
650  * Where P1 is the blocking task and P2 is the lock owner; going up one step
651  * the owner becomes the next blocked task etc..
652  *
653 *
654  */
655 static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
656 					      enum rtmutex_chainwalk chwalk,
657 					      struct rt_mutex_base *orig_lock,
658 					      struct rt_mutex_base *next_lock,
659 					      struct rt_mutex_waiter *orig_waiter,
660 					      struct task_struct *top_task)
661 {
662 	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
663 	struct rt_mutex_waiter *prerequeue_top_waiter;
664 	int ret = 0, depth = 0;
665 	struct rt_mutex_base *lock;
666 	bool detect_deadlock;
667 	bool requeue = true;
668 
669 	detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
670 
671 	/*
672 	 * The (de)boosting is a step by step approach with a lot of
673 	 * pitfalls. We want this to be preemptible and we want hold a
674 	 * maximum of two locks per step. So we have to check
675 	 * carefully whether things change under us.
676 	 */
677  again:
678 	/*
679 	 * We limit the lock chain length for each invocation.
680 	 */
681 	if (++depth > max_lock_depth) {
682 		static int prev_max;
683 
684 		/*
685 		 * Print this only once. If the admin changes the limit,
686 		 * print a new message when reaching the limit again.
687 		 */
688 		if (prev_max != max_lock_depth) {
689 			prev_max = max_lock_depth;
690 			printk(KERN_WARNING "Maximum lock depth %d reached "
691 			       "task: %s (%d)\n", max_lock_depth,
692 			       top_task->comm, task_pid_nr(top_task));
693 		}
694 		put_task_struct(task);
695 
696 		return -EDEADLK;
697 	}
698 
699 	/*
700 	 * We are fully preemptible here and only hold the refcount on
701 	 * @task. So everything can have changed under us since the
702 	 * caller or our own code below (goto retry/again) dropped all
703 	 * locks.
704 	 */
705  retry:
706 	/*
707 	 * [1] Task cannot go away as we did a get_task() before !
708 	 */
709 	raw_spin_lock_irq(&task->pi_lock);
710 
711 	/*
712 	 * [2] Get the waiter on which @task is blocked on.
713 	 */
714 	waiter = task->pi_blocked_on;
715 
716 	/*
717 	 * [3] check_exit_conditions_1() protected by task->pi_lock.
718 	 */
719 
720 	/*
721 	 * Check whether the end of the boosting chain has been
722 	 * reached or the state of the chain has changed while we
723 	 * dropped the locks.
724 	 */
725 	if (!waiter)
726 		goto out_unlock_pi;
727 
728 	/*
729 	 * Check the orig_waiter state. After we dropped the locks,
730 	 * the previous owner of the lock might have released the lock.
731 	 */
732 	if (orig_waiter && !rt_mutex_owner(orig_lock))
733 		goto out_unlock_pi;
734 
735 	/*
736 	 * We dropped all locks after taking a refcount on @task, so
737 	 * the task might have moved on in the lock chain or even left
738 	 * the chain completely and blocks now on an unrelated lock or
739 	 * on @orig_lock.
740 	 *
741 	 * We stored the lock on which @task was blocked in @next_lock,
742 	 * so we can detect the chain change.
743 	 */
744 	if (next_lock != waiter->lock)
745 		goto out_unlock_pi;
746 
747 	/*
748 	 * There could be 'spurious' loops in the lock graph due to ww_mutex,
749 	 * consider:
750 	 *
751 	 *   P1: A, ww_A, ww_B
752 	 *   P2: ww_B, ww_A
753 	 *   P3: A
754 	 *
755 	 * P3 should not return -EDEADLK because it gets trapped in the cycle
756 	 * created by P1 and P2 (which will resolve -- and runs into
757 	 * max_lock_depth above). Therefore disable detect_deadlock such that
758 	 * the below termination condition can trigger once all relevant tasks
759 	 * are boosted.
760 	 *
761 	 * Even when we start with ww_mutex we can disable deadlock detection,
762 	 * since we would supress a ww_mutex induced deadlock at [6] anyway.
763 	 * Supressing it here however is not sufficient since we might still
764 	 * hit [6] due to adjustment driven iteration.
765 	 *
766 	 * NOTE: if someone were to create a deadlock between 2 ww_classes we'd
767 	 * utterly fail to report it; lockdep should.
768 	 */
769 	if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock)
770 		detect_deadlock = false;
771 
772 	/*
773 	 * Drop out, when the task has no waiters. Note,
774 	 * top_waiter can be NULL, when we are in the deboosting
775 	 * mode!
776 	 */
777 	if (top_waiter) {
778 		if (!task_has_pi_waiters(task))
779 			goto out_unlock_pi;
780 		/*
781 		 * If deadlock detection is off, we stop here if we
782 		 * are not the top pi waiter of the task. If deadlock
783 		 * detection is enabled we continue, but stop the
784 		 * requeueing in the chain walk.
785 		 */
786 		if (top_waiter != task_top_pi_waiter(task)) {
787 			if (!detect_deadlock)
788 				goto out_unlock_pi;
789 			else
790 				requeue = false;
791 		}
792 	}
793 
794 	/*
795 	 * If the waiter priority is the same as the task priority
796 	 * then there is no further priority adjustment necessary.  If
797 	 * deadlock detection is off, we stop the chain walk. If its
798 	 * enabled we continue, but stop the requeueing in the chain
799 	 * walk.
800 	 */
801 	if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) {
802 		if (!detect_deadlock)
803 			goto out_unlock_pi;
804 		else
805 			requeue = false;
806 	}
807 
808 	/*
809 	 * [4] Get the next lock; per holding task->pi_lock we can't unblock
810 	 * and guarantee @lock's existence.
811 	 */
812 	lock = waiter->lock;
813 	/*
814 	 * [5] We need to trylock here as we are holding task->pi_lock,
815 	 * which is the reverse lock order versus the other rtmutex
816 	 * operations.
817 	 *
818 	 * Per the above, holding task->pi_lock guarantees lock exists, so
819 	 * inverting this lock order is infeasible from a life-time
820 	 * perspective.
821 	 */
822 	if (!raw_spin_trylock(&lock->wait_lock)) {
823 		raw_spin_unlock_irq(&task->pi_lock);
824 		cpu_relax();
825 		goto retry;
826 	}
827 
828 	/*
829 	 * [6] check_exit_conditions_2() protected by task->pi_lock and
830 	 * lock->wait_lock.
831 	 *
832 	 * Deadlock detection. If the lock is the same as the original
833 	 * lock which caused us to walk the lock chain or if the
834 	 * current lock is owned by the task which initiated the chain
835 	 * walk, we detected a deadlock.
836 	 */
837 	if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
838 		ret = -EDEADLK;
839 
840 		/*
841 		 * When the deadlock is due to ww_mutex; also see above. Don't
842 		 * report the deadlock and instead let the ww_mutex wound/die
843 		 * logic pick which of the contending threads gets -EDEADLK.
844 		 *
845 		 * NOTE: assumes the cycle only contains a single ww_class; any
846 		 * other configuration and we fail to report; also, see
847 		 * lockdep.
848 		 */
849 		if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx)
850 			ret = 0;
851 
852 		raw_spin_unlock(&lock->wait_lock);
853 		goto out_unlock_pi;
854 	}
855 
856 	/*
857 	 * If we just follow the lock chain for deadlock detection, no
858 	 * need to do all the requeue operations. To avoid a truckload
859 	 * of conditionals around the various places below, just do the
860 	 * minimum chain walk checks.
861 	 */
862 	if (!requeue) {
863 		/*
864 		 * No requeue[7] here. Just release @task [8]
865 		 */
866 		raw_spin_unlock(&task->pi_lock);
867 		put_task_struct(task);
868 
869 		/*
870 		 * [9] check_exit_conditions_3 protected by lock->wait_lock.
871 		 * If there is no owner of the lock, end of chain.
872 		 */
873 		if (!rt_mutex_owner(lock)) {
874 			raw_spin_unlock_irq(&lock->wait_lock);
875 			return 0;
876 		}
877 
878 		/* [10] Grab the next task, i.e. owner of @lock */
879 		task = get_task_struct(rt_mutex_owner(lock));
880 		raw_spin_lock(&task->pi_lock);
881 
882 		/*
883 		 * No requeue [11] here. We just do deadlock detection.
884 		 *
885 		 * [12] Store whether owner is blocked
886 		 * itself. Decision is made after dropping the locks
887 		 */
888 		next_lock = task_blocked_on_lock(task);
889 		/*
890 		 * Get the top waiter for the next iteration
891 		 */
892 		top_waiter = rt_mutex_top_waiter(lock);
893 
894 		/* [13] Drop locks */
895 		raw_spin_unlock(&task->pi_lock);
896 		raw_spin_unlock_irq(&lock->wait_lock);
897 
898 		/* If owner is not blocked, end of chain. */
899 		if (!next_lock)
900 			goto out_put_task;
901 		goto again;
902 	}
903 
904 	/*
905 	 * Store the current top waiter before doing the requeue
906 	 * operation on @lock. We need it for the boost/deboost
907 	 * decision below.
908 	 */
909 	prerequeue_top_waiter = rt_mutex_top_waiter(lock);
910 
911 	/* [7] Requeue the waiter in the lock waiter tree. */
912 	rt_mutex_dequeue(lock, waiter);
913 
914 	/*
915 	 * Update the waiter prio fields now that we're dequeued.
916 	 *
917 	 * These values can have changed through either:
918 	 *
919 	 *   sys_sched_set_scheduler() / sys_sched_setattr()
920 	 *
921 	 * or
922 	 *
923 	 *   DL CBS enforcement advancing the effective deadline.
924 	 */
925 	waiter_update_prio(waiter, task);
926 
927 	rt_mutex_enqueue(lock, waiter);
928 
929 	/*
930 	 * [8] Release the (blocking) task in preparation for
931 	 * taking the owner task in [10].
932 	 *
933 	 * Since we hold lock->waiter_lock, task cannot unblock, even if we
934 	 * release task->pi_lock.
935 	 */
936 	raw_spin_unlock(&task->pi_lock);
937 	put_task_struct(task);
938 
939 	/*
940 	 * [9] check_exit_conditions_3 protected by lock->wait_lock.
941 	 *
942 	 * We must abort the chain walk if there is no lock owner even
943 	 * in the dead lock detection case, as we have nothing to
944 	 * follow here. This is the end of the chain we are walking.
945 	 */
946 	if (!rt_mutex_owner(lock)) {
947 		/*
948 		 * If the requeue [7] above changed the top waiter,
949 		 * then we need to wake the new top waiter up to try
950 		 * to get the lock.
951 		 */
952 		top_waiter = rt_mutex_top_waiter(lock);
953 		if (prerequeue_top_waiter != top_waiter)
954 			wake_up_state(top_waiter->task, top_waiter->wake_state);
955 		raw_spin_unlock_irq(&lock->wait_lock);
956 		return 0;
957 	}
958 
959 	/*
960 	 * [10] Grab the next task, i.e. the owner of @lock
961 	 *
962 	 * Per holding lock->wait_lock and checking for !owner above, there
963 	 * must be an owner and it cannot go away.
964 	 */
965 	task = get_task_struct(rt_mutex_owner(lock));
966 	raw_spin_lock(&task->pi_lock);
967 
968 	/* [11] requeue the pi waiters if necessary */
969 	if (waiter == rt_mutex_top_waiter(lock)) {
970 		/*
971 		 * The waiter became the new top (highest priority)
972 		 * waiter on the lock. Replace the previous top waiter
973 		 * in the owner tasks pi waiters tree with this waiter
974 		 * and adjust the priority of the owner.
975 		 */
976 		rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
977 		waiter_clone_prio(waiter, task);
978 		rt_mutex_enqueue_pi(task, waiter);
979 		rt_mutex_adjust_prio(lock, task);
980 
981 	} else if (prerequeue_top_waiter == waiter) {
982 		/*
983 		 * The waiter was the top waiter on the lock, but is
984 		 * no longer the top priority waiter. Replace waiter in
985 		 * the owner tasks pi waiters tree with the new top
986 		 * (highest priority) waiter and adjust the priority
987 		 * of the owner.
988 		 * The new top waiter is stored in @waiter so that
989 		 * @waiter == @top_waiter evaluates to true below and
990 		 * we continue to deboost the rest of the chain.
991 		 */
992 		rt_mutex_dequeue_pi(task, waiter);
993 		waiter = rt_mutex_top_waiter(lock);
994 		waiter_clone_prio(waiter, task);
995 		rt_mutex_enqueue_pi(task, waiter);
996 		rt_mutex_adjust_prio(lock, task);
997 	} else {
998 		/*
999 		 * Nothing changed. No need to do any priority
1000 		 * adjustment.
1001 		 */
1002 	}
1003 
1004 	/*
1005 	 * [12] check_exit_conditions_4() protected by task->pi_lock
1006 	 * and lock->wait_lock. The actual decisions are made after we
1007 	 * dropped the locks.
1008 	 *
1009 	 * Check whether the task which owns the current lock is pi
1010 	 * blocked itself. If yes we store a pointer to the lock for
1011 	 * the lock chain change detection above. After we dropped
1012 	 * task->pi_lock next_lock cannot be dereferenced anymore.
1013 	 */
1014 	next_lock = task_blocked_on_lock(task);
1015 	/*
1016 	 * Store the top waiter of @lock for the end of chain walk
1017 	 * decision below.
1018 	 */
1019 	top_waiter = rt_mutex_top_waiter(lock);
1020 
1021 	/* [13] Drop the locks */
1022 	raw_spin_unlock(&task->pi_lock);
1023 	raw_spin_unlock_irq(&lock->wait_lock);
1024 
1025 	/*
1026 	 * Make the actual exit decisions [12], based on the stored
1027 	 * values.
1028 	 *
1029 	 * We reached the end of the lock chain. Stop right here. No
1030 	 * point to go back just to figure that out.
1031 	 */
1032 	if (!next_lock)
1033 		goto out_put_task;
1034 
1035 	/*
1036 	 * If the current waiter is not the top waiter on the lock,
1037 	 * then we can stop the chain walk here if we are not in full
1038 	 * deadlock detection mode.
1039 	 */
1040 	if (!detect_deadlock && waiter != top_waiter)
1041 		goto out_put_task;
1042 
1043 	goto again;
1044 
1045  out_unlock_pi:
1046 	raw_spin_unlock_irq(&task->pi_lock);
1047  out_put_task:
1048 	put_task_struct(task);
1049 
1050 	return ret;
1051 }
1052 
1053 /*
1054  * Try to take an rt-mutex
1055  *
1056  * Must be called with lock->wait_lock held and interrupts disabled
1057  *
1058  * @lock:   The lock to be acquired.
1059  * @task:   The task which wants to acquire the lock
1060  * @waiter: The waiter that is queued to the lock's wait tree if the
1061  *	    callsite called task_blocked_on_lock(), otherwise NULL
1062  */
1063 static int __sched
1064 try_to_take_rt_mutex(struct rt_mutex_base *lock, struct task_struct *task,
1065 		     struct rt_mutex_waiter *waiter)
1066 {
1067 	lockdep_assert_held(&lock->wait_lock);
1068 
1069 	/*
1070 	 * Before testing whether we can acquire @lock, we set the
1071 	 * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
1072 	 * other tasks which try to modify @lock into the slow path
1073 	 * and they serialize on @lock->wait_lock.
1074 	 *
1075 	 * The RT_MUTEX_HAS_WAITERS bit can have a transitional state
1076 	 * as explained at the top of this file if and only if:
1077 	 *
1078 	 * - There is a lock owner. The caller must fixup the
1079 	 *   transient state if it does a trylock or leaves the lock
1080 	 *   function due to a signal or timeout.
1081 	 *
1082 	 * - @task acquires the lock and there are no other
1083 	 *   waiters. This is undone in rt_mutex_set_owner(@task) at
1084 	 *   the end of this function.
1085 	 */
1086 	mark_rt_mutex_waiters(lock);
1087 
1088 	/*
1089 	 * If @lock has an owner, give up.
1090 	 */
1091 	if (rt_mutex_owner(lock))
1092 		return 0;
1093 
1094 	/*
1095 	 * If @waiter != NULL, @task has already enqueued the waiter
1096 	 * into @lock waiter tree. If @waiter == NULL then this is a
1097 	 * trylock attempt.
1098 	 */
1099 	if (waiter) {
1100 		struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
1101 
1102 		/*
1103 		 * If waiter is the highest priority waiter of @lock,
1104 		 * or allowed to steal it, take it over.
1105 		 */
1106 		if (waiter == top_waiter || rt_mutex_steal(waiter, top_waiter)) {
1107 			/*
1108 			 * We can acquire the lock. Remove the waiter from the
1109 			 * lock waiters tree.
1110 			 */
1111 			rt_mutex_dequeue(lock, waiter);
1112 		} else {
1113 			return 0;
1114 		}
1115 	} else {
1116 		/*
1117 		 * If the lock has waiters already we check whether @task is
1118 		 * eligible to take over the lock.
1119 		 *
1120 		 * If there are no other waiters, @task can acquire
1121 		 * the lock.  @task->pi_blocked_on is NULL, so it does
1122 		 * not need to be dequeued.
1123 		 */
1124 		if (rt_mutex_has_waiters(lock)) {
1125 			/* Check whether the trylock can steal it. */
1126 			if (!rt_mutex_steal(task_to_waiter(task),
1127 					    rt_mutex_top_waiter(lock)))
1128 				return 0;
1129 
1130 			/*
1131 			 * The current top waiter stays enqueued. We
1132 			 * don't have to change anything in the lock
1133 			 * waiters order.
1134 			 */
1135 		} else {
1136 			/*
1137 			 * No waiters. Take the lock without the
1138 			 * pi_lock dance.@task->pi_blocked_on is NULL
1139 			 * and we have no waiters to enqueue in @task
1140 			 * pi waiters tree.
1141 			 */
1142 			goto takeit;
1143 		}
1144 	}
1145 
1146 	/*
1147 	 * Clear @task->pi_blocked_on. Requires protection by
1148 	 * @task->pi_lock. Redundant operation for the @waiter == NULL
1149 	 * case, but conditionals are more expensive than a redundant
1150 	 * store.
1151 	 */
1152 	raw_spin_lock(&task->pi_lock);
1153 	task->pi_blocked_on = NULL;
1154 	/*
1155 	 * Finish the lock acquisition. @task is the new owner. If
1156 	 * other waiters exist we have to insert the highest priority
1157 	 * waiter into @task->pi_waiters tree.
1158 	 */
1159 	if (rt_mutex_has_waiters(lock))
1160 		rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
1161 	raw_spin_unlock(&task->pi_lock);
1162 
1163 takeit:
1164 	/*
1165 	 * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
1166 	 * are still waiters or clears it.
1167 	 */
1168 	rt_mutex_set_owner(lock, task);
1169 
1170 	return 1;
1171 }
1172 
1173 /*
1174  * Task blocks on lock.
1175  *
1176  * Prepare waiter and propagate pi chain
1177  *
1178  * This must be called with lock->wait_lock held and interrupts disabled
1179  */
1180 static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
1181 					   struct rt_mutex_waiter *waiter,
1182 					   struct task_struct *task,
1183 					   struct ww_acquire_ctx *ww_ctx,
1184 					   enum rtmutex_chainwalk chwalk)
1185 {
1186 	struct task_struct *owner = rt_mutex_owner(lock);
1187 	struct rt_mutex_waiter *top_waiter = waiter;
1188 	struct rt_mutex_base *next_lock;
1189 	int chain_walk = 0, res;
1190 
1191 	lockdep_assert_held(&lock->wait_lock);
1192 
1193 	/*
1194 	 * Early deadlock detection. We really don't want the task to
1195 	 * enqueue on itself just to untangle the mess later. It's not
1196 	 * only an optimization. We drop the locks, so another waiter
1197 	 * can come in before the chain walk detects the deadlock. So
1198 	 * the other will detect the deadlock and return -EDEADLOCK,
1199 	 * which is wrong, as the other waiter is not in a deadlock
1200 	 * situation.
1201 	 *
1202 	 * Except for ww_mutex, in that case the chain walk must already deal
1203 	 * with spurious cycles, see the comments at [3] and [6].
1204 	 */
1205 	if (owner == task && !(build_ww_mutex() && ww_ctx))
1206 		return -EDEADLK;
1207 
1208 	raw_spin_lock(&task->pi_lock);
1209 	waiter->task = task;
1210 	waiter->lock = lock;
1211 	waiter_update_prio(waiter, task);
1212 	waiter_clone_prio(waiter, task);
1213 
1214 	/* Get the top priority waiter on the lock */
1215 	if (rt_mutex_has_waiters(lock))
1216 		top_waiter = rt_mutex_top_waiter(lock);
1217 	rt_mutex_enqueue(lock, waiter);
1218 
1219 	task->pi_blocked_on = waiter;
1220 
1221 	raw_spin_unlock(&task->pi_lock);
1222 
1223 	if (build_ww_mutex() && ww_ctx) {
1224 		struct rt_mutex *rtm;
1225 
1226 		/* Check whether the waiter should back out immediately */
1227 		rtm = container_of(lock, struct rt_mutex, rtmutex);
1228 		res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx);
1229 		if (res) {
1230 			raw_spin_lock(&task->pi_lock);
1231 			rt_mutex_dequeue(lock, waiter);
1232 			task->pi_blocked_on = NULL;
1233 			raw_spin_unlock(&task->pi_lock);
1234 			return res;
1235 		}
1236 	}
1237 
1238 	if (!owner)
1239 		return 0;
1240 
1241 	raw_spin_lock(&owner->pi_lock);
1242 	if (waiter == rt_mutex_top_waiter(lock)) {
1243 		rt_mutex_dequeue_pi(owner, top_waiter);
1244 		rt_mutex_enqueue_pi(owner, waiter);
1245 
1246 		rt_mutex_adjust_prio(lock, owner);
1247 		if (owner->pi_blocked_on)
1248 			chain_walk = 1;
1249 	} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
1250 		chain_walk = 1;
1251 	}
1252 
1253 	/* Store the lock on which owner is blocked or NULL */
1254 	next_lock = task_blocked_on_lock(owner);
1255 
1256 	raw_spin_unlock(&owner->pi_lock);
1257 	/*
1258 	 * Even if full deadlock detection is on, if the owner is not
1259 	 * blocked itself, we can avoid finding this out in the chain
1260 	 * walk.
1261 	 */
1262 	if (!chain_walk || !next_lock)
1263 		return 0;
1264 
1265 	/*
1266 	 * The owner can't disappear while holding a lock,
1267 	 * so the owner struct is protected by wait_lock.
1268 	 * Gets dropped in rt_mutex_adjust_prio_chain()!
1269 	 */
1270 	get_task_struct(owner);
1271 
1272 	raw_spin_unlock_irq(&lock->wait_lock);
1273 
1274 	res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
1275 					 next_lock, waiter, task);
1276 
1277 	raw_spin_lock_irq(&lock->wait_lock);
1278 
1279 	return res;
1280 }
1281 
1282 /*
1283  * Remove the top waiter from the current tasks pi waiter tree and
1284  * queue it up.
1285  *
1286  * Called with lock->wait_lock held and interrupts disabled.
1287  */
1288 static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh,
1289 					    struct rt_mutex_base *lock)
1290 {
1291 	struct rt_mutex_waiter *waiter;
1292 
1293 	lockdep_assert_held(&lock->wait_lock);
1294 
1295 	raw_spin_lock(&current->pi_lock);
1296 
1297 	waiter = rt_mutex_top_waiter(lock);
1298 
1299 	/*
1300 	 * Remove it from current->pi_waiters and deboost.
1301 	 *
1302 	 * We must in fact deboost here in order to ensure we call
1303 	 * rt_mutex_setprio() to update p->pi_top_task before the
1304 	 * task unblocks.
1305 	 */
1306 	rt_mutex_dequeue_pi(current, waiter);
1307 	rt_mutex_adjust_prio(lock, current);
1308 
1309 	/*
1310 	 * As we are waking up the top waiter, and the waiter stays
1311 	 * queued on the lock until it gets the lock, this lock
1312 	 * obviously has waiters. Just set the bit here and this has
1313 	 * the added benefit of forcing all new tasks into the
1314 	 * slow path making sure no task of lower priority than
1315 	 * the top waiter can steal this lock.
1316 	 */
1317 	lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
1318 
1319 	/*
1320 	 * We deboosted before waking the top waiter task such that we don't
1321 	 * run two tasks with the 'same' priority (and ensure the
1322 	 * p->pi_top_task pointer points to a blocked task). This however can
1323 	 * lead to priority inversion if we would get preempted after the
1324 	 * deboost but before waking our donor task, hence the preempt_disable()
1325 	 * before unlock.
1326 	 *
1327 	 * Pairs with preempt_enable() in rt_mutex_wake_up_q();
1328 	 */
1329 	preempt_disable();
1330 	rt_mutex_wake_q_add(wqh, waiter);
1331 	raw_spin_unlock(&current->pi_lock);
1332 }
1333 
1334 static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock)
1335 {
1336 	int ret = try_to_take_rt_mutex(lock, current, NULL);
1337 
1338 	/*
1339 	 * try_to_take_rt_mutex() sets the lock waiters bit
1340 	 * unconditionally. Clean this up.
1341 	 */
1342 	fixup_rt_mutex_waiters(lock, true);
1343 
1344 	return ret;
1345 }
1346 
1347 /*
1348  * Slow path try-lock function:
1349  */
1350 static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock)
1351 {
1352 	unsigned long flags;
1353 	int ret;
1354 
1355 	/*
1356 	 * If the lock already has an owner we fail to get the lock.
1357 	 * This can be done without taking the @lock->wait_lock as
1358 	 * it is only being read, and this is a trylock anyway.
1359 	 */
1360 	if (rt_mutex_owner(lock))
1361 		return 0;
1362 
1363 	/*
1364 	 * The mutex has currently no owner. Lock the wait lock and try to
1365 	 * acquire the lock. We use irqsave here to support early boot calls.
1366 	 */
1367 	raw_spin_lock_irqsave(&lock->wait_lock, flags);
1368 
1369 	ret = __rt_mutex_slowtrylock(lock);
1370 
1371 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1372 
1373 	return ret;
1374 }
1375 
1376 static __always_inline int __rt_mutex_trylock(struct rt_mutex_base *lock)
1377 {
1378 	if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
1379 		return 1;
1380 
1381 	return rt_mutex_slowtrylock(lock);
1382 }
1383 
1384 /*
1385  * Slow path to release a rt-mutex.
1386  */
1387 static void __sched rt_mutex_slowunlock(struct rt_mutex_base *lock)
1388 {
1389 	DEFINE_RT_WAKE_Q(wqh);
1390 	unsigned long flags;
1391 
1392 	/* irqsave required to support early boot calls */
1393 	raw_spin_lock_irqsave(&lock->wait_lock, flags);
1394 
1395 	debug_rt_mutex_unlock(lock);
1396 
1397 	/*
1398 	 * We must be careful here if the fast path is enabled. If we
1399 	 * have no waiters queued we cannot set owner to NULL here
1400 	 * because of:
1401 	 *
1402 	 * foo->lock->owner = NULL;
1403 	 *			rtmutex_lock(foo->lock);   <- fast path
1404 	 *			free = atomic_dec_and_test(foo->refcnt);
1405 	 *			rtmutex_unlock(foo->lock); <- fast path
1406 	 *			if (free)
1407 	 *				kfree(foo);
1408 	 * raw_spin_unlock(foo->lock->wait_lock);
1409 	 *
1410 	 * So for the fastpath enabled kernel:
1411 	 *
1412 	 * Nothing can set the waiters bit as long as we hold
1413 	 * lock->wait_lock. So we do the following sequence:
1414 	 *
1415 	 *	owner = rt_mutex_owner(lock);
1416 	 *	clear_rt_mutex_waiters(lock);
1417 	 *	raw_spin_unlock(&lock->wait_lock);
1418 	 *	if (cmpxchg(&lock->owner, owner, 0) == owner)
1419 	 *		return;
1420 	 *	goto retry;
1421 	 *
1422 	 * The fastpath disabled variant is simple as all access to
1423 	 * lock->owner is serialized by lock->wait_lock:
1424 	 *
1425 	 *	lock->owner = NULL;
1426 	 *	raw_spin_unlock(&lock->wait_lock);
1427 	 */
1428 	while (!rt_mutex_has_waiters(lock)) {
1429 		/* Drops lock->wait_lock ! */
1430 		if (unlock_rt_mutex_safe(lock, flags) == true)
1431 			return;
1432 		/* Relock the rtmutex and try again */
1433 		raw_spin_lock_irqsave(&lock->wait_lock, flags);
1434 	}
1435 
1436 	/*
1437 	 * The wakeup next waiter path does not suffer from the above
1438 	 * race. See the comments there.
1439 	 *
1440 	 * Queue the next waiter for wakeup once we release the wait_lock.
1441 	 */
1442 	mark_wakeup_next_waiter(&wqh, lock);
1443 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1444 
1445 	rt_mutex_wake_up_q(&wqh);
1446 }
1447 
1448 static __always_inline void __rt_mutex_unlock(struct rt_mutex_base *lock)
1449 {
1450 	if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
1451 		return;
1452 
1453 	rt_mutex_slowunlock(lock);
1454 }
1455 
1456 #ifdef CONFIG_SMP
1457 static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
1458 				  struct rt_mutex_waiter *waiter,
1459 				  struct task_struct *owner)
1460 {
1461 	bool res = true;
1462 
1463 	rcu_read_lock();
1464 	for (;;) {
1465 		/* If owner changed, trylock again. */
1466 		if (owner != rt_mutex_owner(lock))
1467 			break;
1468 		/*
1469 		 * Ensure that @owner is dereferenced after checking that
1470 		 * the lock owner still matches @owner. If that fails,
1471 		 * @owner might point to freed memory. If it still matches,
1472 		 * the rcu_read_lock() ensures the memory stays valid.
1473 		 */
1474 		barrier();
1475 		/*
1476 		 * Stop spinning when:
1477 		 *  - the lock owner has been scheduled out
1478 		 *  - current is not longer the top waiter
1479 		 *  - current is requested to reschedule (redundant
1480 		 *    for CONFIG_PREEMPT_RCU=y)
1481 		 *  - the VCPU on which owner runs is preempted
1482 		 */
1483 		if (!owner_on_cpu(owner) || need_resched() ||
1484 		    !rt_mutex_waiter_is_top_waiter(lock, waiter)) {
1485 			res = false;
1486 			break;
1487 		}
1488 		cpu_relax();
1489 	}
1490 	rcu_read_unlock();
1491 	return res;
1492 }
1493 #else
1494 static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
1495 				  struct rt_mutex_waiter *waiter,
1496 				  struct task_struct *owner)
1497 {
1498 	return false;
1499 }
1500 #endif
1501 
1502 #ifdef RT_MUTEX_BUILD_MUTEX
1503 /*
1504  * Functions required for:
1505  *	- rtmutex, futex on all kernels
1506  *	- mutex and rwsem substitutions on RT kernels
1507  */
1508 
1509 /*
1510  * Remove a waiter from a lock and give up
1511  *
1512  * Must be called with lock->wait_lock held and interrupts disabled. It must
1513  * have just failed to try_to_take_rt_mutex().
1514  */
1515 static void __sched remove_waiter(struct rt_mutex_base *lock,
1516 				  struct rt_mutex_waiter *waiter)
1517 {
1518 	bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
1519 	struct task_struct *owner = rt_mutex_owner(lock);
1520 	struct rt_mutex_base *next_lock;
1521 
1522 	lockdep_assert_held(&lock->wait_lock);
1523 
1524 	raw_spin_lock(&current->pi_lock);
1525 	rt_mutex_dequeue(lock, waiter);
1526 	current->pi_blocked_on = NULL;
1527 	raw_spin_unlock(&current->pi_lock);
1528 
1529 	/*
1530 	 * Only update priority if the waiter was the highest priority
1531 	 * waiter of the lock and there is an owner to update.
1532 	 */
1533 	if (!owner || !is_top_waiter)
1534 		return;
1535 
1536 	raw_spin_lock(&owner->pi_lock);
1537 
1538 	rt_mutex_dequeue_pi(owner, waiter);
1539 
1540 	if (rt_mutex_has_waiters(lock))
1541 		rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
1542 
1543 	rt_mutex_adjust_prio(lock, owner);
1544 
1545 	/* Store the lock on which owner is blocked or NULL */
1546 	next_lock = task_blocked_on_lock(owner);
1547 
1548 	raw_spin_unlock(&owner->pi_lock);
1549 
1550 	/*
1551 	 * Don't walk the chain, if the owner task is not blocked
1552 	 * itself.
1553 	 */
1554 	if (!next_lock)
1555 		return;
1556 
1557 	/* gets dropped in rt_mutex_adjust_prio_chain()! */
1558 	get_task_struct(owner);
1559 
1560 	raw_spin_unlock_irq(&lock->wait_lock);
1561 
1562 	rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
1563 				   next_lock, NULL, current);
1564 
1565 	raw_spin_lock_irq(&lock->wait_lock);
1566 }
1567 
1568 /**
1569  * rt_mutex_slowlock_block() - Perform the wait-wake-try-to-take loop
1570  * @lock:		 the rt_mutex to take
1571  * @ww_ctx:		 WW mutex context pointer
1572  * @state:		 the state the task should block in (TASK_INTERRUPTIBLE
1573  *			 or TASK_UNINTERRUPTIBLE)
1574  * @timeout:		 the pre-initialized and started timer, or NULL for none
1575  * @waiter:		 the pre-initialized rt_mutex_waiter
1576  *
1577  * Must be called with lock->wait_lock held and interrupts disabled
1578  */
1579 static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
1580 					   struct ww_acquire_ctx *ww_ctx,
1581 					   unsigned int state,
1582 					   struct hrtimer_sleeper *timeout,
1583 					   struct rt_mutex_waiter *waiter)
1584 {
1585 	struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
1586 	struct task_struct *owner;
1587 	int ret = 0;
1588 
1589 	for (;;) {
1590 		/* Try to acquire the lock: */
1591 		if (try_to_take_rt_mutex(lock, current, waiter))
1592 			break;
1593 
1594 		if (timeout && !timeout->task) {
1595 			ret = -ETIMEDOUT;
1596 			break;
1597 		}
1598 		if (signal_pending_state(state, current)) {
1599 			ret = -EINTR;
1600 			break;
1601 		}
1602 
1603 		if (build_ww_mutex() && ww_ctx) {
1604 			ret = __ww_mutex_check_kill(rtm, waiter, ww_ctx);
1605 			if (ret)
1606 				break;
1607 		}
1608 
1609 		if (waiter == rt_mutex_top_waiter(lock))
1610 			owner = rt_mutex_owner(lock);
1611 		else
1612 			owner = NULL;
1613 		raw_spin_unlock_irq(&lock->wait_lock);
1614 
1615 		if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
1616 			schedule();
1617 
1618 		raw_spin_lock_irq(&lock->wait_lock);
1619 		set_current_state(state);
1620 	}
1621 
1622 	__set_current_state(TASK_RUNNING);
1623 	return ret;
1624 }
1625 
1626 static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
1627 					     struct rt_mutex_waiter *w)
1628 {
1629 	/*
1630 	 * If the result is not -EDEADLOCK or the caller requested
1631 	 * deadlock detection, nothing to do here.
1632 	 */
1633 	if (res != -EDEADLOCK || detect_deadlock)
1634 		return;
1635 
1636 	if (build_ww_mutex() && w->ww_ctx)
1637 		return;
1638 
1639 	/*
1640 	 * Yell loudly and stop the task right here.
1641 	 */
1642 	WARN(1, "rtmutex deadlock detected\n");
1643 	while (1) {
1644 		set_current_state(TASK_INTERRUPTIBLE);
1645 		schedule();
1646 	}
1647 }
1648 
1649 /**
1650  * __rt_mutex_slowlock - Locking slowpath invoked with lock::wait_lock held
1651  * @lock:	The rtmutex to block lock
1652  * @ww_ctx:	WW mutex context pointer
1653  * @state:	The task state for sleeping
1654  * @chwalk:	Indicator whether full or partial chainwalk is requested
1655  * @waiter:	Initializer waiter for blocking
1656  */
1657 static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
1658 				       struct ww_acquire_ctx *ww_ctx,
1659 				       unsigned int state,
1660 				       enum rtmutex_chainwalk chwalk,
1661 				       struct rt_mutex_waiter *waiter)
1662 {
1663 	struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
1664 	struct ww_mutex *ww = ww_container_of(rtm);
1665 	int ret;
1666 
1667 	lockdep_assert_held(&lock->wait_lock);
1668 
1669 	/* Try to acquire the lock again: */
1670 	if (try_to_take_rt_mutex(lock, current, NULL)) {
1671 		if (build_ww_mutex() && ww_ctx) {
1672 			__ww_mutex_check_waiters(rtm, ww_ctx);
1673 			ww_mutex_lock_acquired(ww, ww_ctx);
1674 		}
1675 		return 0;
1676 	}
1677 
1678 	set_current_state(state);
1679 
1680 	trace_contention_begin(lock, LCB_F_RT);
1681 
1682 	ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk);
1683 	if (likely(!ret))
1684 		ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
1685 
1686 	if (likely(!ret)) {
1687 		/* acquired the lock */
1688 		if (build_ww_mutex() && ww_ctx) {
1689 			if (!ww_ctx->is_wait_die)
1690 				__ww_mutex_check_waiters(rtm, ww_ctx);
1691 			ww_mutex_lock_acquired(ww, ww_ctx);
1692 		}
1693 	} else {
1694 		__set_current_state(TASK_RUNNING);
1695 		remove_waiter(lock, waiter);
1696 		rt_mutex_handle_deadlock(ret, chwalk, waiter);
1697 	}
1698 
1699 	/*
1700 	 * try_to_take_rt_mutex() sets the waiter bit
1701 	 * unconditionally. We might have to fix that up.
1702 	 */
1703 	fixup_rt_mutex_waiters(lock, true);
1704 
1705 	trace_contention_end(lock, ret);
1706 
1707 	return ret;
1708 }
1709 
1710 static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
1711 					     struct ww_acquire_ctx *ww_ctx,
1712 					     unsigned int state)
1713 {
1714 	struct rt_mutex_waiter waiter;
1715 	int ret;
1716 
1717 	rt_mutex_init_waiter(&waiter);
1718 	waiter.ww_ctx = ww_ctx;
1719 
1720 	ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK,
1721 				  &waiter);
1722 
1723 	debug_rt_mutex_free_waiter(&waiter);
1724 	return ret;
1725 }
1726 
1727 /*
1728  * rt_mutex_slowlock - Locking slowpath invoked when fast path fails
1729  * @lock:	The rtmutex to block lock
1730  * @ww_ctx:	WW mutex context pointer
1731  * @state:	The task state for sleeping
1732  */
1733 static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
1734 				     struct ww_acquire_ctx *ww_ctx,
1735 				     unsigned int state)
1736 {
1737 	unsigned long flags;
1738 	int ret;
1739 
1740 	/*
1741 	 * Technically we could use raw_spin_[un]lock_irq() here, but this can
1742 	 * be called in early boot if the cmpxchg() fast path is disabled
1743 	 * (debug, no architecture support). In this case we will acquire the
1744 	 * rtmutex with lock->wait_lock held. But we cannot unconditionally
1745 	 * enable interrupts in that early boot case. So we need to use the
1746 	 * irqsave/restore variants.
1747 	 */
1748 	raw_spin_lock_irqsave(&lock->wait_lock, flags);
1749 	ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
1750 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1751 
1752 	return ret;
1753 }
1754 
1755 static __always_inline int __rt_mutex_lock(struct rt_mutex_base *lock,
1756 					   unsigned int state)
1757 {
1758 	if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
1759 		return 0;
1760 
1761 	return rt_mutex_slowlock(lock, NULL, state);
1762 }
1763 #endif /* RT_MUTEX_BUILD_MUTEX */
1764 
1765 #ifdef RT_MUTEX_BUILD_SPINLOCKS
1766 /*
1767  * Functions required for spin/rw_lock substitution on RT kernels
1768  */
1769 
1770 /**
1771  * rtlock_slowlock_locked - Slow path lock acquisition for RT locks
1772  * @lock:	The underlying RT mutex
1773  */
1774 static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
1775 {
1776 	struct rt_mutex_waiter waiter;
1777 	struct task_struct *owner;
1778 
1779 	lockdep_assert_held(&lock->wait_lock);
1780 
1781 	if (try_to_take_rt_mutex(lock, current, NULL))
1782 		return;
1783 
1784 	rt_mutex_init_rtlock_waiter(&waiter);
1785 
1786 	/* Save current state and set state to TASK_RTLOCK_WAIT */
1787 	current_save_and_set_rtlock_wait_state();
1788 
1789 	trace_contention_begin(lock, LCB_F_RT);
1790 
1791 	task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK);
1792 
1793 	for (;;) {
1794 		/* Try to acquire the lock again */
1795 		if (try_to_take_rt_mutex(lock, current, &waiter))
1796 			break;
1797 
1798 		if (&waiter == rt_mutex_top_waiter(lock))
1799 			owner = rt_mutex_owner(lock);
1800 		else
1801 			owner = NULL;
1802 		raw_spin_unlock_irq(&lock->wait_lock);
1803 
1804 		if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
1805 			schedule_rtlock();
1806 
1807 		raw_spin_lock_irq(&lock->wait_lock);
1808 		set_current_state(TASK_RTLOCK_WAIT);
1809 	}
1810 
1811 	/* Restore the task state */
1812 	current_restore_rtlock_saved_state();
1813 
1814 	/*
1815 	 * try_to_take_rt_mutex() sets the waiter bit unconditionally.
1816 	 * We might have to fix that up:
1817 	 */
1818 	fixup_rt_mutex_waiters(lock, true);
1819 	debug_rt_mutex_free_waiter(&waiter);
1820 
1821 	trace_contention_end(lock, 0);
1822 }
1823 
1824 static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
1825 {
1826 	unsigned long flags;
1827 
1828 	raw_spin_lock_irqsave(&lock->wait_lock, flags);
1829 	rtlock_slowlock_locked(lock);
1830 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
1831 }
1832 
1833 #endif /* RT_MUTEX_BUILD_SPINLOCKS */
1834