xref: /openbmc/linux/kernel/rcu/tree_plugin.h (revision 3d37ef41)
1 /* SPDX-License-Identifier: GPL-2.0+ */
2 /*
3  * Read-Copy Update mechanism for mutual exclusion (tree-based version)
4  * Internal non-public definitions that provide either classic
5  * or preemptible semantics.
6  *
7  * Copyright Red Hat, 2009
8  * Copyright IBM Corporation, 2009
9  *
10  * Author: Ingo Molnar <mingo@elte.hu>
11  *	   Paul E. McKenney <paulmck@linux.ibm.com>
12  */
13 
14 #include "../locking/rtmutex_common.h"
15 
16 #ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool __read_mostly rcu_nocb_poll;    /* Offload kthreads are to poll. */
/*
 * CONFIG_RCU_NOCB_CPU variant: return lockdep_is_held() for the
 * ->nocb_lock of @rdp, unmodified.
 */
static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
{
	return lockdep_is_held(&rdp->nocb_lock);
}
23 
24 static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
25 {
26 	/* Race on early boot between thread creation and assignment */
27 	if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
28 		return true;
29 
30 	if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
31 		if (in_task())
32 			return true;
33 	return false;
34 }
35 
36 static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
37 {
38 	return (timer_curr_running(&rdp->nocb_timer) && !in_irq());
39 }
40 #else
/* !CONFIG_RCU_NOCB_CPU stub: always returns 0. */
static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
{
	return 0;
}
45 
/* !CONFIG_RCU_NOCB_CPU stub: always returns false. */
static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
{
	return false;
}
50 
/* !CONFIG_RCU_NOCB_CPU stub: always returns false. */
static inline bool rcu_running_nocb_timer(struct rcu_data *rdp)
{
	return false;
}
55 
56 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
57 
/*
 * Return the result of rcu_segcblist_is_offloaded() for @rdp's callback
 * list.  Before doing so, RCU_LOCKDEP_WARN() is handed the negation of
 * the "safe to read" conditions described in the comment below.
 */
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
{
	/*
	 * In order to read the offloaded state of an rdp in a safe
	 * and stable way and prevent its value from being changed
	 * under us, we must either hold the barrier mutex, the cpu
	 * hotplug lock (read or write) or the nocb lock. Local
	 * non-preemptible reads are also safe. NOCB kthreads and
	 * timers have their own means of synchronization against the
	 * offloaded state updaters.
	 */
	RCU_LOCKDEP_WARN(
		!(lockdep_is_held(&rcu_state.barrier_mutex) ||
		  (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
		  rcu_lockdep_is_held_nocb(rdp) ||
		  (rdp == this_cpu_ptr(&rcu_data) &&
		   !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
		  rcu_current_is_nocb_kthread(rdp) ||
		  rcu_running_nocb_timer(rdp)),
		"Unsafe read of RCU_NOCB offloaded state"
	);

	return rcu_segcblist_is_offloaded(&rdp->cblist);
}
82 
83 /*
84  * Check the RCU kernel configuration parameters and print informative
85  * messages about anything out of the ordinary.
86  */
87 static void __init rcu_bootup_announce_oddness(void)
88 {
89 	if (IS_ENABLED(CONFIG_RCU_TRACE))
90 		pr_info("\tRCU event tracing is enabled.\n");
91 	if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
92 	    (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
93 		pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n",
94 			RCU_FANOUT);
95 	if (rcu_fanout_exact)
96 		pr_info("\tHierarchical RCU autobalancing is disabled.\n");
97 	if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
98 		pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
99 	if (IS_ENABLED(CONFIG_PROVE_RCU))
100 		pr_info("\tRCU lockdep checking is enabled.\n");
101 	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
102 		pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n");
103 	if (RCU_NUM_LVLS >= 4)
104 		pr_info("\tFour(or more)-level hierarchy is enabled.\n");
105 	if (RCU_FANOUT_LEAF != 16)
106 		pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
107 			RCU_FANOUT_LEAF);
108 	if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
109 		pr_info("\tBoot-time adjustment of leaf fanout to %d.\n",
110 			rcu_fanout_leaf);
111 	if (nr_cpu_ids != NR_CPUS)
112 		pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
113 #ifdef CONFIG_RCU_BOOST
114 	pr_info("\tRCU priority boosting: priority %d delay %d ms.\n",
115 		kthread_prio, CONFIG_RCU_BOOST_DELAY);
116 #endif
117 	if (blimit != DEFAULT_RCU_BLIMIT)
118 		pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
119 	if (qhimark != DEFAULT_RCU_QHIMARK)
120 		pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
121 	if (qlowmark != DEFAULT_RCU_QLOMARK)
122 		pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
123 	if (qovld != DEFAULT_RCU_QOVLD)
124 		pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld);
125 	if (jiffies_till_first_fqs != ULONG_MAX)
126 		pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
127 	if (jiffies_till_next_fqs != ULONG_MAX)
128 		pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
129 	if (jiffies_till_sched_qs != ULONG_MAX)
130 		pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n", jiffies_till_sched_qs);
131 	if (rcu_kick_kthreads)
132 		pr_info("\tKick kthreads if too-long grace period.\n");
133 	if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
134 		pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
135 	if (gp_preinit_delay)
136 		pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
137 	if (gp_init_delay)
138 		pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
139 	if (gp_cleanup_delay)
140 		pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
141 	if (!use_softirq)
142 		pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
143 	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
144 		pr_info("\tRCU debug extended QS entry/exit.\n");
145 	rcupdate_announce_bootup_oddness();
146 }
147 
148 #ifdef CONFIG_PREEMPT_RCU
149 
/* Forward declarations for functions that are called below before any definition appears. */
static void rcu_report_exp_rnp(struct rcu_node *rnp, bool wake);
static void rcu_read_unlock_special(struct task_struct *t);
152 
153 /*
154  * Tell them what RCU they are running.
155  */
static void __init rcu_bootup_announce(void)
{
	/* Banner printed by the CONFIG_PREEMPT_RCU build of this file. */
	pr_info("Preemptible hierarchical RCU implementation.\n");
	/* Follow the banner with any out-of-the-ordinary settings. */
	rcu_bootup_announce_oddness();
}
161 
/*
 * Flags for rcu_preempt_ctxt_queue() decision table.  That function ORs
 * these together into its blkd_state value: the *_TASKS bits reflect
 * whether the rcu_node's ->gp_tasks / ->exp_tasks pointers are non-NULL,
 * and the *_BLKD bits reflect whether the CPU's ->grpmask bit is set in
 * the rcu_node's ->qsmask / ->expmask.
 */
#define RCU_GP_TASKS	0x8	/* rnp->gp_tasks is non-NULL. */
#define RCU_EXP_TASKS	0x4	/* rnp->exp_tasks is non-NULL. */
#define RCU_GP_BLKD	0x2	/* rnp->qsmask has this CPU's ->grpmask bit set. */
#define RCU_EXP_BLKD	0x1	/* rnp->expmask has this CPU's ->grpmask bit set. */
167 
168 /*
169  * Queues a task preempted within an RCU-preempt read-side critical
170  * section into the appropriate location within the ->blkd_tasks list,
171  * depending on the states of any ongoing normal and expedited grace
172  * periods.  The ->gp_tasks pointer indicates which element the normal
173  * grace period is waiting on (NULL if none), and the ->exp_tasks pointer
174  * indicates which element the expedited grace period is waiting on (again,
175  * NULL if none).  If a grace period is waiting on a given element in the
176  * ->blkd_tasks list, it also waits on all subsequent elements.  Thus,
177  * adding a task to the tail of the list blocks any grace period that is
178  * already waiting on one of the elements.  In contrast, adding a task
179  * to the head of the list won't block any grace period that is already
180  * waiting on one of the elements.
181  *
182  * This queuing is imprecise, and can sometimes make an ongoing grace
183  * period wait for a task that is not strictly speaking blocking it.
184  * Given the choice, we needlessly block a normal grace period rather than
185  * blocking an expedited grace period.
186  *
187  * Note that an endless sequence of expedited grace periods still cannot
188  * indefinitely postpone a normal grace period.  Eventually, all of the
189  * fixed number of preempted tasks blocking the normal grace period that are
190  * not also blocking the expedited grace period will resume and complete
191  * their RCU read-side critical sections.  At that point, the ->gp_tasks
192  * pointer will equal the ->exp_tasks pointer, at which point the end of
193  * the corresponding expedited grace period will also be the end of the
194  * normal grace period.
195  */
static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
	__releases(rnp->lock) /* But leaves interrupts disabled. */
{
	/*
	 * Snapshot the four decision-table inputs as a bitmask of the
	 * RCU_{GP,EXP}_TASKS and RCU_{GP,EXP}_BLKD flags.  The switch
	 * below and the pointer updates after it all key off this value.
	 */
	int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
			 (rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
			 (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) +
			 (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
	struct task_struct *t = current;

	/* Caller must hold rnp's lock, and rnp must be rdp's leaf node. */
	raw_lockdep_assert_held_rcu_node(rnp);
	WARN_ON_ONCE(rdp->mynode != rnp);
	WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
	/* RCU better not be waiting on newly onlined CPUs! */
	WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask &
		     rdp->grpmask);

	/*
	 * Decide where to queue the newly blocked task.  In theory,
	 * this could be an if-statement.  In practice, when I tried
	 * that, it was quite messy.
	 */
	switch (blkd_state) {
	case 0:
	case                RCU_EXP_TASKS:
	case                RCU_EXP_TASKS + RCU_GP_BLKD:
	case RCU_GP_TASKS:
	case RCU_GP_TASKS + RCU_EXP_TASKS:

		/*
		 * Blocking neither GP, or first task blocking the normal
		 * GP but not blocking the already-waiting expedited GP.
		 * Queue at the head of the list to avoid unnecessarily
		 * blocking the already-waiting GPs.
		 */
		list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
		break;

	case                                              RCU_EXP_BLKD:
	case                                RCU_GP_BLKD:
	case                                RCU_GP_BLKD + RCU_EXP_BLKD:
	case RCU_GP_TASKS +                               RCU_EXP_BLKD:
	case RCU_GP_TASKS +                 RCU_GP_BLKD + RCU_EXP_BLKD:
	case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:

		/*
		 * First task arriving that blocks either GP, or first task
		 * arriving that blocks the expedited GP (with the normal
		 * GP already waiting), or a task arriving that blocks
		 * both GPs with both GPs already waiting.  Queue at the
		 * tail of the list to avoid any GP waiting on any of the
		 * already queued tasks that are not blocking it.
		 */
		list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
		break;

	case                RCU_EXP_TASKS +               RCU_EXP_BLKD:
	case                RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
	case RCU_GP_TASKS + RCU_EXP_TASKS +               RCU_EXP_BLKD:

		/*
		 * Second or subsequent task blocking the expedited GP.
		 * The task either does not block the normal GP, or is the
		 * first task blocking the normal GP.  Queue just after
		 * the first task blocking the expedited GP.
		 */
		list_add(&t->rcu_node_entry, rnp->exp_tasks);
		break;

	case RCU_GP_TASKS +                 RCU_GP_BLKD:
	case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD:

		/*
		 * Second or subsequent task blocking the normal GP.
		 * The task does not block the expedited GP. Queue just
		 * after the first task blocking the normal GP.
		 */
		list_add(&t->rcu_node_entry, rnp->gp_tasks);
		break;

	default:

		/* Yet another exercise in excessive paranoia. */
		WARN_ON_ONCE(1);
		break;
	}

	/*
	 * We have now queued the task.  If it was the first one to
	 * block either grace period, update the ->gp_tasks and/or
	 * ->exp_tasks pointers, respectively, to reference the newly
	 * blocked tasks.
	 */
	if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
		WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry);
		WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
	}
	if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
		WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry);
	/* The masks must still agree with the snapshot taken on entry. */
	WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
		     !(rnp->qsmask & rdp->grpmask));
	WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
		     !(rnp->expmask & rdp->grpmask));
	raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */

	/*
	 * Report the quiescent state for the expedited GP.  This expedited
	 * GP should not be able to end until we report, so there should be
	 * no need to check for a subsequent expedited GP.  (Though we are
	 * still in a quiescent state in any case.)
	 */
	if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs)
		rcu_report_exp_rdp(rdp);
	else
		WARN_ON_ONCE(rdp->exp_deferred_qs);
}
311 
312 /*
313  * Record a preemptible-RCU quiescent state for the specified CPU.
314  * Note that this does not necessarily mean that the task currently running
315  * on the CPU is in a quiescent state:  Instead, it means that the current
316  * grace period need not wait on any RCU read-side critical section that
317  * starts later on this CPU.  It also means that if the current task is
318  * in an RCU read-side critical section, it has already added itself to
319  * some leaf rcu_node structure's ->blkd_tasks list.  In addition to the
320  * current task, there might be any number of other tasks blocked while
321  * in an RCU read-side critical section.
322  *
323  * Callers to this function must disable preemption.
324  */
static void rcu_qs(void)
{
	RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n");
	/* Act only while this CPU's ->cpu_no_qs.s is still non-zero. */
	if (__this_cpu_read(rcu_data.cpu_no_qs.s)) {
		trace_rcu_grace_period(TPS("rcu_preempt"),
				       __this_cpu_read(rcu_data.gp_seq),
				       TPS("cpuqs"));
		/* Clear the per-CPU flag first, then the per-task request. */
		__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
		barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
		WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);
	}
}
337 
338 /*
339  * We have entered the scheduler, and the current task might soon be
340  * context-switched away from.  If this task is in an RCU read-side
341  * critical section, we will no longer be able to rely on the CPU to
342  * record that fact, so we enqueue the task on the blkd_tasks list.
343  * The task will dequeue itself when it exits the outermost enclosing
344  * RCU read-side critical section.  Therefore, the current grace period
345  * cannot be permitted to complete until the blkd_tasks list entries
346  * predating the current grace period drain, in other words, until
347  * rnp->gp_tasks becomes NULL.
348  *
349  * Caller must disable interrupts.
350  */
void rcu_note_context_switch(bool preempt)
{
	struct task_struct *t = current;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	struct rcu_node *rnp;

	trace_rcu_utilization(TPS("Start context switch"));
	lockdep_assert_irqs_disabled();
	/* Warn on a non-preemption switch from within a read-side critical section. */
	WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0);
	/* Queue the task only if it is in a reader and not already marked ->blocked. */
	if (rcu_preempt_depth() > 0 &&
	    !t->rcu_read_unlock_special.b.blocked) {

		/* Possibly blocking in an RCU read-side critical section. */
		rnp = rdp->mynode;
		raw_spin_lock_rcu_node(rnp);
		t->rcu_read_unlock_special.b.blocked = true;
		t->rcu_blocked_node = rnp;

		/*
		 * Verify the CPU's sanity, trace the preemption, and
		 * then queue the task as required based on the states
		 * of any ongoing and expedited grace periods.
		 */
		WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
		WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
		trace_rcu_preempt_task(rcu_state.name,
				       t->pid,
				       (rnp->qsmask & rdp->grpmask)
				       ? rnp->gp_seq
				       : rcu_seq_snap(&rnp->gp_seq));
		/* Annotated __releases(rnp->lock): the lock is dropped in there. */
		rcu_preempt_ctxt_queue(rnp, rdp);
	} else {
		rcu_preempt_deferred_qs(t);
	}

	/*
	 * Either we were not in an RCU read-side critical section to
	 * begin with, or we have now recorded that critical section
	 * globally.  Either way, we can now note a quiescent state
	 * for this CPU.  Again, if we were in an RCU read-side critical
	 * section, and if that critical section was blocking the current
	 * grace period, then the fact that the task has been enqueued
	 * means that we continue to block the current grace period.
	 */
	rcu_qs();
	if (rdp->exp_deferred_qs)
		rcu_report_exp_rdp(rdp);
	rcu_tasks_qs(current, preempt);
	trace_rcu_utilization(TPS("End context switch"));
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
402 
403 /*
404  * Check for preempted RCU readers blocking the current grace period
405  * for the specified rcu_node structure.  If the caller needs a reliable
406  * answer, it must hold the rcu_node's ->lock.
407  */
408 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
409 {
410 	return READ_ONCE(rnp->gp_tasks) != NULL;
411 }
412 
/*
 * Limit value for ->rcu_read_lock_nesting.  The lock and unlock paths
 * warn when the depth exceeds it, under CONFIG_PROVE_LOCKING only.
 */
#define RCU_NEST_PMAX (INT_MAX / 2)
415 
416 static void rcu_preempt_read_enter(void)
417 {
418 	current->rcu_read_lock_nesting++;
419 }
420 
421 static int rcu_preempt_read_exit(void)
422 {
423 	return --current->rcu_read_lock_nesting;
424 }
425 
426 static void rcu_preempt_depth_set(int val)
427 {
428 	current->rcu_read_lock_nesting = val;
429 }
430 
431 /*
432  * Preemptible RCU implementation for rcu_read_lock().
433  * Just increment ->rcu_read_lock_nesting, shared state will be updated
434  * if we block.
435  */
void __rcu_read_lock(void)
{
	rcu_preempt_read_enter();
	/* Nesting-depth sanity check, CONFIG_PROVE_LOCKING builds only. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
		WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX);
	/*
	 * Strict-GP builds set ->need_qs on every entry once the GP kthread
	 * pointer is non-NULL.  NOTE(review): presumably this makes the
	 * outermost __rcu_read_unlock() see a non-zero
	 * ->rcu_read_unlock_special.s -- confirm against union rcu_special.
	 */
	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread)
		WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true);
	barrier();  /* critical section after entry code. */
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);
446 
447 /*
448  * Preemptible RCU implementation for rcu_read_unlock().
449  * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
450  * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
451  * invoke rcu_read_unlock_special() to clean up after a context switch
452  * in an RCU read-side critical section and other special cases.
453  */
void __rcu_read_unlock(void)
{
	struct task_struct *t = current;

	barrier();  // critical section before exit code.
	/* A zero result means this was the outermost rcu_read_unlock(). */
	if (rcu_preempt_read_exit() == 0) {
		barrier();  // critical-section exit before .s check.
		/* Slow path only when some special bit is set for this task. */
		if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
			rcu_read_unlock_special(t);
	}
	/* Depth sanity check (underflow or excessive nesting), PROVE_LOCKING only. */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		int rrln = rcu_preempt_depth();

		WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX);
	}
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
471 
472 /*
473  * Advance a ->blkd_tasks-list pointer to the next entry, instead
474  * returning NULL if at the end of the list.
475  */
476 static struct list_head *rcu_next_node_entry(struct task_struct *t,
477 					     struct rcu_node *rnp)
478 {
479 	struct list_head *np;
480 
481 	np = t->rcu_node_entry.next;
482 	if (np == &rnp->blkd_tasks)
483 		np = NULL;
484 	return np;
485 }
486 
487 /*
488  * Return true if the specified rcu_node structure has tasks that were
489  * preempted within an RCU read-side critical section.
490  */
491 static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
492 {
493 	return !list_empty(&rnp->blkd_tasks);
494 }
495 
496 /*
497  * Report deferred quiescent states.  The deferral time can
498  * be quite short, for example, in the case of the call from
499  * rcu_read_unlock_special().
500  */
static void
rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
{
	bool empty_exp;
	bool empty_norm;
	bool empty_exp_now;
	struct list_head *np;
	bool drop_boost_mutex = false;
	struct rcu_data *rdp;
	struct rcu_node *rnp;
	union rcu_special special;

	/*
	 * If RCU core is waiting for this CPU to exit its critical section,
	 * report the fact that it has exited.  Because irqs are disabled,
	 * t->rcu_read_unlock_special cannot change.
	 */
	special = t->rcu_read_unlock_special;
	rdp = this_cpu_ptr(&rcu_data);
	/* Nothing pending for either the task or the CPU: restore irqs and leave. */
	if (!special.s && !rdp->exp_deferred_qs) {
		local_irq_restore(flags);
		return;
	}
	/* Clear the task's special state; the local snapshot drives what follows. */
	t->rcu_read_unlock_special.s = 0;
	if (special.b.need_qs) {
		if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
			rcu_report_qs_rdp(rdp);
			udelay(rcu_unlock_delay);
		} else {
			rcu_qs();
		}
	}

	/*
	 * Respond to a request by an expedited grace period for a
	 * quiescent state from this CPU.  Note that requests from
	 * tasks are handled when removing the task from the
	 * blocked-tasks list below.
	 */
	if (rdp->exp_deferred_qs)
		rcu_report_exp_rdp(rdp);

	/* Clean up if blocked during RCU read-side critical section. */
	if (special.b.blocked) {

		/*
		 * Remove this task from the list it blocked on.  The task
		 * now remains queued on the rcu_node corresponding to the
		 * CPU it first blocked on, so there is no longer any need
		 * to loop.  Retain a WARN_ON_ONCE() out of sheer paranoia.
		 */
		rnp = t->rcu_blocked_node;
		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
		WARN_ON_ONCE(rnp != t->rcu_blocked_node);
		WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
		/* Record the normal and expedited "empty" states before dequeueing. */
		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
		WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
			     (!empty_norm || rnp->qsmask));
		empty_exp = sync_rcu_exp_done(rnp);
		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
		/* Fetch the successor before unlinking, for the pointer fix-ups below. */
		np = rcu_next_node_entry(t, rnp);
		list_del_init(&t->rcu_node_entry);
		t->rcu_blocked_node = NULL;
		trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
						rnp->gp_seq, t->pid);
		/* Any pointer that referenced this task advances to its successor. */
		if (&t->rcu_node_entry == rnp->gp_tasks)
			WRITE_ONCE(rnp->gp_tasks, np);
		if (&t->rcu_node_entry == rnp->exp_tasks)
			WRITE_ONCE(rnp->exp_tasks, np);
		if (IS_ENABLED(CONFIG_RCU_BOOST)) {
			/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
			drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
			if (&t->rcu_node_entry == rnp->boost_tasks)
				WRITE_ONCE(rnp->boost_tasks, np);
		}

		/*
		 * If this was the last task on the current list, and if
		 * we aren't waiting on any CPUs, report the quiescent state.
		 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
		 * so we must take a snapshot of the expedited state.
		 */
		empty_exp_now = sync_rcu_exp_done(rnp);
		if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
			trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
							 rnp->gp_seq,
							 0, rnp->qsmask,
							 rnp->level,
							 rnp->grplo,
							 rnp->grphi,
							 !!rnp->gp_tasks);
			rcu_report_unblock_qs_rnp(rnp, flags);
		} else {
			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		}

		/* Unboost if we were boosted. */
		if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
			rt_mutex_futex_unlock(&rnp->boost_mtx);

		/*
		 * If this was the last task on the expedited lists,
		 * then we need to report up the rcu_node hierarchy.
		 */
		if (!empty_exp && empty_exp_now)
			rcu_report_exp_rnp(rnp, true);
	} else {
		/* Task was never queued, so there is only the irq state to restore. */
		local_irq_restore(flags);
	}
}
611 
612 /*
613  * Is a deferred quiescent-state pending, and are we also not in
614  * an RCU read-side critical section?  It is the caller's responsibility
615  * to ensure it is otherwise safe to report any deferred quiescent
616  * states.  The reason for this is that it is safe to report a
617  * quiescent state during context switch even though preemption
618  * is disabled.  This function cannot be expected to understand these
619  * nuances, so the caller must handle them.
620  */
621 static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
622 {
623 	return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
624 		READ_ONCE(t->rcu_read_unlock_special.s)) &&
625 	       rcu_preempt_depth() == 0;
626 }
627 
628 /*
629  * Report a deferred quiescent state if needed and safe to do so.
630  * As with rcu_preempt_need_deferred_qs(), "safe" involves only
631  * not being in an RCU read-side critical section.  The caller must
632  * evaluate safety in terms of interrupt, softirq, and preemption
633  * disabling.
634  */
static void rcu_preempt_deferred_qs(struct task_struct *t)
{
	unsigned long flags;

	if (rcu_preempt_need_deferred_qs(t)) {
		/* The callee restores the irq state saved here. */
		local_irq_save(flags);
		rcu_preempt_deferred_qs_irqrestore(t, flags);
	}
}
644 
645 /*
646  * Minimal handler to give the scheduler a chance to re-evaluate.
647  */
648 static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
649 {
650 	struct rcu_data *rdp;
651 
652 	rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
653 	rdp->defer_qs_iw_pending = false;
654 }
655 
656 /*
657  * Handle special cases during rcu_read_unlock(), such as needing to
658  * notify RCU core processing or task having blocked during the RCU
659  * read-side critical section.
660  */
static void rcu_read_unlock_special(struct task_struct *t)
{
	unsigned long flags;
	bool irqs_were_disabled;
	/* Sampled on entry: was preemption or softirq disabled in the caller? */
	bool preempt_bh_were_disabled =
			!!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));

	/* NMI handlers cannot block and cannot safely manipulate state. */
	if (in_nmi())
		return;

	local_irq_save(flags);
	irqs_were_disabled = irqs_disabled_flags(flags);
	/* If anything was disabled in the caller, defer the report instead. */
	if (preempt_bh_were_disabled || irqs_were_disabled) {
		bool expboost; // Expedited GP in flight or possible boosting.
		struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
		struct rcu_node *rnp = rdp->mynode;

		expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
			   (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
			   IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ||
			   (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
			    t->rcu_blocked_node);
		// Need to defer quiescent state until everything is enabled.
		if (use_softirq && (in_irq() || (expboost && !irqs_were_disabled))) {
			// Using softirq, safe to awaken, and either the
			// wakeup is free or there is either an expedited
			// GP in flight or a potential need to deboost.
			raise_softirq_irqoff(RCU_SOFTIRQ);
		} else {
			// Enabling BH or preempt does reschedule, so...
			// Also if no expediting and no possible deboosting,
			// slow is OK.  Plus nohz_full CPUs eventually get
			// tick enabled.
			set_tsk_need_resched(current);
			set_preempt_need_resched();
			if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
			    expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
				// Get scheduler to re-evaluate and call hooks.
				// If !IRQ_WORK, FQS scan will eventually IPI.
				init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler);
				rdp->defer_qs_iw_pending = true;
				irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
			}
		}
		local_irq_restore(flags);
		return;
	}
	/* Nothing was disabled: report now; the callee restores the irq state. */
	rcu_preempt_deferred_qs_irqrestore(t, flags);
}
711 
712 /*
713  * Check that the list of blocked tasks for the newly completed grace
714  * period is in fact empty.  It is a serious bug to complete a grace
715  * period that still has RCU readers blocked!  This function must be
716  * invoked -before- updating this rnp's ->gp_seq.
717  *
718  * Also, if there are blocked tasks on the list, they automatically
719  * block the newly created grace period, so set up ->gp_tasks accordingly.
720  */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
	struct task_struct *t;

	RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
	raw_lockdep_assert_held_rcu_node(rnp);
	/* ->gp_tasks should be NULL here; if not, warn and dump up to 10 entries. */
	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
		dump_blkd_tasks(rnp, 10);
	/* Start ->gp_tasks at the head of ->blkd_tasks for the new grace period. */
	if (rcu_preempt_has_tasks(rnp) &&
	    (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
		WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
		t = container_of(rnp->gp_tasks, struct task_struct,
				 rcu_node_entry);
		trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
						rnp->gp_seq, t->pid);
	}
	/* ->qsmask is expected to be zero at this point. */
	WARN_ON_ONCE(rnp->qsmask);
}
739 
740 /*
741  * Check for a quiescent state from the current CPU, including voluntary
742  * context switches for Tasks RCU.  When a task blocks, the task is
743  * recorded in the corresponding CPU's rcu_node structure, which is checked
744  * elsewhere, hence this function need only check for quiescent states
745  * related to the current CPU, not to those related to tasks.
746  */
static void rcu_flavor_sched_clock_irq(int user)
{
	struct task_struct *t = current;

	lockdep_assert_irqs_disabled();
	/* Interrupted user mode or idle: record that for Tasks RCU. */
	if (user || rcu_is_cpu_rrupt_from_idle()) {
		rcu_note_voluntary_context_switch(current);
	}
	/* In a reader, or with preemption/softirq disabled: cannot report now. */
	if (rcu_preempt_depth() > 0 ||
	    (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
		/* No QS, force context switch if deferred. */
		if (rcu_preempt_need_deferred_qs(t)) {
			set_tsk_need_resched(t);
			set_preempt_need_resched();
		}
	} else if (rcu_preempt_need_deferred_qs(t)) {
		rcu_preempt_deferred_qs(t); /* Report deferred QS. */
		return;
	} else if (!WARN_ON_ONCE(rcu_preempt_depth())) {
		rcu_qs(); /* Report immediate QS. */
		return;
	}

	/* If GP is oldish, ask for help from rcu_read_unlock_special(). */
	/* "Oldish" here means more than HZ jiffies past rcu_state.gp_start. */
	if (rcu_preempt_depth() > 0 &&
	    __this_cpu_read(rcu_data.core_needs_qs) &&
	    __this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
	    !t->rcu_read_unlock_special.b.need_qs &&
	    time_after(jiffies, rcu_state.gp_start + HZ))
		t->rcu_read_unlock_special.b.need_qs = true;
}
778 
779 /*
780  * Check for a task exiting while in a preemptible-RCU read-side
781  * critical section, clean up if so.  No need to issue warnings, as
782  * debug_check_no_locks_held() already does this if lockdep is enabled.
783  * Besides, if this function does anything other than just immediately
784  * return, there was a bug of some sort.  Spewing warnings from this
785  * function is like as not to simply obscure important prior warnings.
786  */
void exit_rcu(void)
{
	struct task_struct *t = current;

	if (unlikely(!list_empty(&current->rcu_node_entry))) {
		/*
		 * Still on a blocked-tasks list: force the depth to 1 and
		 * set ->blocked, so that the __rcu_read_unlock() below is
		 * the outermost unlock and finds a special bit set.
		 */
		rcu_preempt_depth_set(1);
		barrier();
		WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
	} else if (unlikely(rcu_preempt_depth())) {
		/* Not queued but still nested: collapse the depth to 1. */
		rcu_preempt_depth_set(1);
	} else {
		/* Not in a read-side critical section: nothing to clean up. */
		return;
	}
	__rcu_read_unlock();
	rcu_preempt_deferred_qs(current);
}
803 
804 /*
805  * Dump the blocked-tasks state, but limit the list dump to the
806  * specified number of elements.
807  */
808 static void
809 dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
810 {
811 	int cpu;
812 	int i;
813 	struct list_head *lhp;
814 	bool onl;
815 	struct rcu_data *rdp;
816 	struct rcu_node *rnp1;
817 
818 	raw_lockdep_assert_held_rcu_node(rnp);
819 	pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
820 		__func__, rnp->grplo, rnp->grphi, rnp->level,
821 		(long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs);
822 	for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
823 		pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
824 			__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
825 	pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
826 		__func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks),
827 		READ_ONCE(rnp->exp_tasks));
828 	pr_info("%s: ->blkd_tasks", __func__);
829 	i = 0;
830 	list_for_each(lhp, &rnp->blkd_tasks) {
831 		pr_cont(" %p", lhp);
832 		if (++i >= ncheck)
833 			break;
834 	}
835 	pr_cont("\n");
836 	for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
837 		rdp = per_cpu_ptr(&rcu_data, cpu);
838 		onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
839 		pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
840 			cpu, ".o"[onl],
841 			(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
842 			(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
843 	}
844 }
845 
846 #else /* #ifdef CONFIG_PREEMPT_RCU */
847 
848 /*
849  * If strict grace periods are enabled, and if the calling
850  * __rcu_read_unlock() marks the beginning of a quiescent state, immediately
851  * report that quiescent state and, if requested, spin for a bit.
852  */
853 void rcu_read_unlock_strict(void)
854 {
855 	struct rcu_data *rdp;
856 
857 	if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ||
858 	   irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
859 		return;
860 	rdp = this_cpu_ptr(&rcu_data);
861 	rcu_report_qs_rdp(rdp);
862 	udelay(rcu_unlock_delay);
863 }
864 EXPORT_SYMBOL_GPL(rcu_read_unlock_strict);
865 
866 /*
867  * Tell them what RCU they are running.
868  */
869 static void __init rcu_bootup_announce(void)
870 {
871 	pr_info("Hierarchical RCU implementation.\n");
872 	rcu_bootup_announce_oddness();
873 }
874 
875 /*
876  * Note a quiescent state for PREEMPTION=n.  Because we do not need to know
877  * how many quiescent states passed, just if there was at least one since
878  * the start of the grace period, this just sets a flag.  The caller must
879  * have disabled preemption.
880  */
881 static void rcu_qs(void)
882 {
883 	RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!");
884 	if (!__this_cpu_read(rcu_data.cpu_no_qs.s))
885 		return;
886 	trace_rcu_grace_period(TPS("rcu_sched"),
887 			       __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs"));
888 	__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
889 	if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
890 		return;
891 	__this_cpu_write(rcu_data.cpu_no_qs.b.exp, false);
892 	rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
893 }
894 
895 /*
896  * Register an urgently needed quiescent state.  If there is an
897  * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
898  * dyntick-idle quiescent state visible to other CPUs, which will in
899  * some cases serve for expedited as well as normal grace periods.
900  * Either way, register a lightweight quiescent state.
901  */
902 void rcu_all_qs(void)
903 {
904 	unsigned long flags;
905 
906 	if (!raw_cpu_read(rcu_data.rcu_urgent_qs))
907 		return;
908 	preempt_disable();
909 	/* Load rcu_urgent_qs before other flags. */
910 	if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
911 		preempt_enable();
912 		return;
913 	}
914 	this_cpu_write(rcu_data.rcu_urgent_qs, false);
915 	if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
916 		local_irq_save(flags);
917 		rcu_momentary_dyntick_idle();
918 		local_irq_restore(flags);
919 	}
920 	rcu_qs();
921 	preempt_enable();
922 }
923 EXPORT_SYMBOL_GPL(rcu_all_qs);
924 
925 /*
926  * Note a PREEMPTION=n context switch. The caller must have disabled interrupts.
927  */
928 void rcu_note_context_switch(bool preempt)
929 {
930 	trace_rcu_utilization(TPS("Start context switch"));
931 	rcu_qs();
932 	/* Load rcu_urgent_qs before other flags. */
933 	if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs)))
934 		goto out;
935 	this_cpu_write(rcu_data.rcu_urgent_qs, false);
936 	if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
937 		rcu_momentary_dyntick_idle();
938 	rcu_tasks_qs(current, preempt);
939 out:
940 	trace_rcu_utilization(TPS("End context switch"));
941 }
942 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
943 
/*
 * Without preemptible RCU no reader can ever be preempted, so no reader
 * can be blocking the current grace period: always report zero.
 */
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
	return 0;
}
952 
953 /*
954  * Because there is no preemptible RCU, there can be no readers blocked.
955  */
956 static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
957 {
958 	return false;
959 }
960 
961 /*
962  * Because there is no preemptible RCU, there can be no deferred quiescent
963  * states.
964  */
965 static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
966 {
967 	return false;
968 }
/* No preemptible RCU, hence no deferred quiescent state to report. */
static void rcu_preempt_deferred_qs(struct task_struct *t)
{
}
970 
971 /*
972  * Because there is no preemptible RCU, there can be no readers blocked,
973  * so there is no need to check for blocked tasks.  So check only for
974  * bogus qsmask values.
975  */
976 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
977 {
978 	WARN_ON_ONCE(rnp->qsmask);
979 }
980 
/*
 * Scheduling-clock hook: detect the non-context-switch quiescent states,
 * namely execution in user mode and in the idle loop.
 */
static void rcu_flavor_sched_clock_irq(int user)
{
	/* Interrupted kernel code that was not idle: no quiescent state. */
	if (!user && !rcu_is_cpu_rrupt_from_idle())
		return;

	/*
	 * The interrupt was taken from user mode or from the idle loop
	 * and is not nested, so this CPU is in a quiescent state: note
	 * it.
	 *
	 * No memory barrier is needed because rcu_qs() touches only
	 * CPU-local variables that other CPUs neither access nor modify,
	 * at least not while the corresponding CPU is online.
	 */
	rcu_qs();
}
1004 
/*
 * A task cannot exit inside a preemptible-RCU read-side critical section
 * when preemptible RCU does not exist, so there is nothing to clean up.
 */
void exit_rcu(void)
{
}
1012 
1013 /*
1014  * Dump the guaranteed-empty blocked-tasks state.  Trust but verify.
1015  */
1016 static void
1017 dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
1018 {
1019 	WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
1020 }
1021 
1022 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
1023 
/*
 * Setup hook for the per-CPU rcuc kthread.  With CONFIG_RCU_BOOST the
 * calling kthread is moved to SCHED_FIFO at priority kthread_prio;
 * otherwise this is a no-op.  @cpu is not used.
 */
static void rcu_cpu_kthread_setup(unsigned int cpu)
{
#ifdef CONFIG_RCU_BOOST
	struct sched_param sp;

	sp.sched_priority = kthread_prio;
	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
#endif /* #ifdef CONFIG_RCU_BOOST */
}
1036 
1037 #ifdef CONFIG_RCU_BOOST
1038 
1039 /*
1040  * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1041  * or ->boost_tasks, advancing the pointer to the next task in the
1042  * ->blkd_tasks list.
1043  *
1044  * Note that irqs must be enabled: boosting the task can block.
1045  * Returns 1 if there are more tasks needing to be boosted.
1046  */
1047 static int rcu_boost(struct rcu_node *rnp)
1048 {
1049 	unsigned long flags;
1050 	struct task_struct *t;
1051 	struct list_head *tb;
1052 
1053 	if (READ_ONCE(rnp->exp_tasks) == NULL &&
1054 	    READ_ONCE(rnp->boost_tasks) == NULL)
1055 		return 0;  /* Nothing left to boost. */
1056 
1057 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
1058 
1059 	/*
1060 	 * Recheck under the lock: all tasks in need of boosting
1061 	 * might exit their RCU read-side critical sections on their own.
1062 	 */
1063 	if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
1064 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1065 		return 0;
1066 	}
1067 
1068 	/*
1069 	 * Preferentially boost tasks blocking expedited grace periods.
1070 	 * This cannot starve the normal grace periods because a second
1071 	 * expedited grace period must boost all blocked tasks, including
1072 	 * those blocking the pre-existing normal grace period.
1073 	 */
1074 	if (rnp->exp_tasks != NULL)
1075 		tb = rnp->exp_tasks;
1076 	else
1077 		tb = rnp->boost_tasks;
1078 
1079 	/*
1080 	 * We boost task t by manufacturing an rt_mutex that appears to
1081 	 * be held by task t.  We leave a pointer to that rt_mutex where
1082 	 * task t can find it, and task t will release the mutex when it
1083 	 * exits its outermost RCU read-side critical section.  Then
1084 	 * simply acquiring this artificial rt_mutex will boost task
1085 	 * t's priority.  (Thanks to tglx for suggesting this approach!)
1086 	 *
1087 	 * Note that task t must acquire rnp->lock to remove itself from
1088 	 * the ->blkd_tasks list, which it will do from exit() if from
1089 	 * nowhere else.  We therefore are guaranteed that task t will
1090 	 * stay around at least until we drop rnp->lock.  Note that
1091 	 * rnp->lock also resolves races between our priority boosting
1092 	 * and task t's exiting its outermost RCU read-side critical
1093 	 * section.
1094 	 */
1095 	t = container_of(tb, struct task_struct, rcu_node_entry);
1096 	rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
1097 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1098 	/* Lock only for side effect: boosts task t's priority. */
1099 	rt_mutex_lock(&rnp->boost_mtx);
1100 	rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
1101 
1102 	return READ_ONCE(rnp->exp_tasks) != NULL ||
1103 	       READ_ONCE(rnp->boost_tasks) != NULL;
1104 }
1105 
1106 /*
1107  * Priority-boosting kthread, one per leaf rcu_node.
1108  */
1109 static int rcu_boost_kthread(void *arg)
1110 {
1111 	struct rcu_node *rnp = (struct rcu_node *)arg;
1112 	int spincnt = 0;
1113 	int more2boost;
1114 
1115 	trace_rcu_utilization(TPS("Start boost kthread@init"));
1116 	for (;;) {
1117 		WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING);
1118 		trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
1119 		rcu_wait(READ_ONCE(rnp->boost_tasks) ||
1120 			 READ_ONCE(rnp->exp_tasks));
1121 		trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
1122 		WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING);
1123 		more2boost = rcu_boost(rnp);
1124 		if (more2boost)
1125 			spincnt++;
1126 		else
1127 			spincnt = 0;
1128 		if (spincnt > 10) {
1129 			WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING);
1130 			trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
1131 			schedule_timeout_idle(2);
1132 			trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
1133 			spincnt = 0;
1134 		}
1135 	}
1136 	/* NOTREACHED */
1137 	trace_rcu_utilization(TPS("End boost kthread@notreached"));
1138 	return 0;
1139 }
1140 
1141 /*
1142  * Check to see if it is time to start boosting RCU readers that are
1143  * blocking the current grace period, and, if so, tell the per-rcu_node
1144  * kthread to start boosting them.  If there is an expedited grace
1145  * period in progress, it is always time to boost.
1146  *
1147  * The caller must hold rnp->lock, which this function releases.
1148  * The ->boost_kthread_task is immortal, so we don't need to worry
1149  * about it going away.
1150  */
1151 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1152 	__releases(rnp->lock)
1153 {
1154 	raw_lockdep_assert_held_rcu_node(rnp);
1155 	if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
1156 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1157 		return;
1158 	}
1159 	if (rnp->exp_tasks != NULL ||
1160 	    (rnp->gp_tasks != NULL &&
1161 	     rnp->boost_tasks == NULL &&
1162 	     rnp->qsmask == 0 &&
1163 	     (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) {
1164 		if (rnp->exp_tasks == NULL)
1165 			WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks);
1166 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1167 		rcu_wake_cond(rnp->boost_kthread_task,
1168 			      READ_ONCE(rnp->boost_kthread_status));
1169 	} else {
1170 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1171 	}
1172 }
1173 
1174 /*
1175  * Is the current CPU running the RCU-callbacks kthread?
1176  * Caller must have preemption disabled.
1177  */
1178 static bool rcu_is_callbacks_kthread(void)
1179 {
1180 	return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
1181 }
1182 
/*
 * CONFIG_RCU_BOOST_DELAY converted to jiffies, rounded up (the HZ/1000
 * scaling suggests the Kconfig value is in milliseconds -- TODO confirm).
 * Added to jiffies by rcu_preempt_boost_start_gp() to form ->boost_time.
 */
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
1184 
1185 /*
1186  * Do priority-boost accounting for the start of a new grace period.
1187  */
1188 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1189 {
1190 	rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
1191 }
1192 
1193 /*
1194  * Create an RCU-boost kthread for the specified node if one does not
1195  * already exist.  We only create this kthread for preemptible RCU.
1196  * Returns zero if all is well, a negated errno otherwise.
1197  */
1198 static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
1199 {
1200 	int rnp_index = rnp - rcu_get_root();
1201 	unsigned long flags;
1202 	struct sched_param sp;
1203 	struct task_struct *t;
1204 
1205 	if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
1206 		return;
1207 
1208 	if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
1209 		return;
1210 
1211 	rcu_state.boost = 1;
1212 
1213 	if (rnp->boost_kthread_task != NULL)
1214 		return;
1215 
1216 	t = kthread_create(rcu_boost_kthread, (void *)rnp,
1217 			   "rcub/%d", rnp_index);
1218 	if (WARN_ON_ONCE(IS_ERR(t)))
1219 		return;
1220 
1221 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
1222 	rnp->boost_kthread_task = t;
1223 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1224 	sp.sched_priority = kthread_prio;
1225 	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1226 	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1227 }
1228 
1229 /*
1230  * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1231  * served by the rcu_node in question.  The CPU hotplug lock is still
1232  * held, so the value of rnp->qsmaskinit will be stable.
1233  *
1234  * We don't include outgoingcpu in the affinity set, use -1 if there is
1235  * no outgoing CPU.  If there are no CPUs left in the affinity set,
1236  * this function allows the kthread to execute on any CPU.
1237  */
1238 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1239 {
1240 	struct task_struct *t = rnp->boost_kthread_task;
1241 	unsigned long mask = rcu_rnp_online_cpus(rnp);
1242 	cpumask_var_t cm;
1243 	int cpu;
1244 
1245 	if (!t)
1246 		return;
1247 	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
1248 		return;
1249 	for_each_leaf_node_possible_cpu(rnp, cpu)
1250 		if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
1251 		    cpu != outgoingcpu)
1252 			cpumask_set_cpu(cpu, cm);
1253 	if (cpumask_weight(cm) == 0)
1254 		cpumask_setall(cm);
1255 	set_cpus_allowed_ptr(t, cm);
1256 	free_cpumask_var(cm);
1257 }
1258 
1259 /*
1260  * Spawn boost kthreads -- called as soon as the scheduler is running.
1261  */
1262 static void __init rcu_spawn_boost_kthreads(void)
1263 {
1264 	struct rcu_node *rnp;
1265 
1266 	rcu_for_each_leaf_node(rnp)
1267 		rcu_spawn_one_boost_kthread(rnp);
1268 }
1269 
1270 static void rcu_prepare_kthreads(int cpu)
1271 {
1272 	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
1273 	struct rcu_node *rnp = rdp->mynode;
1274 
1275 	/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
1276 	if (rcu_scheduler_fully_active)
1277 		rcu_spawn_one_boost_kthread(rnp);
1278 }
1279 
1280 #else /* #ifdef CONFIG_RCU_BOOST */
1281 
1282 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
1283 	__releases(rnp->lock)
1284 {
1285 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1286 }
1287 
1288 static bool rcu_is_callbacks_kthread(void)
1289 {
1290 	return false;
1291 }
1292 
/* !CONFIG_RCU_BOOST stub: no boost accounting at grace-period start. */
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
1296 
/* !CONFIG_RCU_BOOST stub: there is no boost kthread whose affinity to set. */
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
1300 
1301 static void __init rcu_spawn_boost_kthreads(void)
1302 {
1303 }
1304 
/* !CONFIG_RCU_BOOST stub: nothing to prepare for the incoming CPU. */
static void rcu_prepare_kthreads(int cpu)
{
}
1308 
1309 #endif /* #else #ifdef CONFIG_RCU_BOOST */
1310 
1311 #if !defined(CONFIG_RCU_FAST_NO_HZ)
1312 
1313 /*
1314  * Check to see if any future non-offloaded RCU-related work will need
1315  * to be done by the current CPU, even if none need be done immediately,
1316  * returning 1 if so.  This function is part of the RCU implementation;
1317  * it is -not- an exported member of the RCU API.
1318  *
1319  * Because we not have RCU_FAST_NO_HZ, just check whether or not this
1320  * CPU has RCU callbacks queued.
1321  */
1322 int rcu_needs_cpu(u64 basemono, u64 *nextevt)
1323 {
1324 	*nextevt = KTIME_MAX;
1325 	return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
1326 		!rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
1327 }
1328 
/*
 * CONFIG_RCU_FAST_NO_HZ=n: there is no idle-entry work to undo, so
 * leaving idle requires nothing here.
 */
static void rcu_cleanup_after_idle(void)
{
}
1336 
/*
 * CONFIG_RCU_FAST_NO_HZ=n: the idle-entry grace-period work is empty.
 */
static void rcu_prepare_for_idle(void)
{
}
1344 
1345 #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1346 
/*
 * This code is invoked when a CPU goes idle, at which point we want
 * to have the CPU do everything required for RCU so that it can enter
 * the energy-efficient dyntick-idle mode.
 *
 * The following preprocessor symbol controls this:
 *
 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
 *	to sleep in dyntick-idle mode with RCU callbacks pending.  This
 *	is sized to be roughly one RCU grace period.  Those energy-efficiency
 *	benchmarkers who might otherwise be tempted to set this to a large
 *	number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
 *	system.  And if you are -that- concerned about energy efficiency,
 *	just power the system down and be done with it!
 *
 * The value below works well in practice.  If future workloads require
 * adjustment, it can be converted into a kernel config parameter, though
 * making the state machine smarter might be a better option.
 */
#define RCU_IDLE_GP_DELAY 4		/* Roughly one grace period. */

/*
 * Run-time copy of RCU_IDLE_GP_DELAY, exposed as a module parameter.
 * rcu_needs_cpu() uses it both as the delay and as the alignment passed
 * to round_up().
 * NOTE(review): confirm which values round_up() accepts as an alignment
 * before writing anything other than the default here.
 */
static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
module_param(rcu_idle_gp_delay, int, 0644);
1370 
1371 /*
1372  * Try to advance callbacks on the current CPU, but only if it has been
1373  * awhile since the last time we did so.  Afterwards, if there are any
1374  * callbacks ready for immediate invocation, return true.
1375  */
1376 static bool __maybe_unused rcu_try_advance_all_cbs(void)
1377 {
1378 	bool cbs_ready = false;
1379 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1380 	struct rcu_node *rnp;
1381 
1382 	/* Exit early if we advanced recently. */
1383 	if (jiffies == rdp->last_advance_all)
1384 		return false;
1385 	rdp->last_advance_all = jiffies;
1386 
1387 	rnp = rdp->mynode;
1388 
1389 	/*
1390 	 * Don't bother checking unless a grace period has
1391 	 * completed since we last checked and there are
1392 	 * callbacks not yet ready to invoke.
1393 	 */
1394 	if ((rcu_seq_completed_gp(rdp->gp_seq,
1395 				  rcu_seq_current(&rnp->gp_seq)) ||
1396 	     unlikely(READ_ONCE(rdp->gpwrap))) &&
1397 	    rcu_segcblist_pend_cbs(&rdp->cblist))
1398 		note_gp_changes(rdp);
1399 
1400 	if (rcu_segcblist_ready_cbs(&rdp->cblist))
1401 		cbs_ready = true;
1402 	return cbs_ready;
1403 }
1404 
1405 /*
1406  * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
1407  * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
1408  * caller about what to set the timeout.
1409  *
1410  * The caller must have disabled interrupts.
1411  */
1412 int rcu_needs_cpu(u64 basemono, u64 *nextevt)
1413 {
1414 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1415 	unsigned long dj;
1416 
1417 	lockdep_assert_irqs_disabled();
1418 
1419 	/* If no non-offloaded callbacks, RCU doesn't need the CPU. */
1420 	if (rcu_segcblist_empty(&rdp->cblist) ||
1421 	    rcu_rdp_is_offloaded(rdp)) {
1422 		*nextevt = KTIME_MAX;
1423 		return 0;
1424 	}
1425 
1426 	/* Attempt to advance callbacks. */
1427 	if (rcu_try_advance_all_cbs()) {
1428 		/* Some ready to invoke, so initiate later invocation. */
1429 		invoke_rcu_core();
1430 		return 1;
1431 	}
1432 	rdp->last_accelerate = jiffies;
1433 
1434 	/* Request timer and round. */
1435 	dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;
1436 
1437 	*nextevt = basemono + dj * TICK_NSEC;
1438 	return 0;
1439 }
1440 
1441 /*
1442  * Prepare a CPU for idle from an RCU perspective.  The first major task is to
1443  * sense whether nohz mode has been enabled or disabled via sysfs.  The second
1444  * major task is to accelerate (that is, assign grace-period numbers to) any
1445  * recently arrived callbacks.
1446  *
1447  * The caller must have disabled interrupts.
1448  */
1449 static void rcu_prepare_for_idle(void)
1450 {
1451 	bool needwake;
1452 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1453 	struct rcu_node *rnp;
1454 	int tne;
1455 
1456 	lockdep_assert_irqs_disabled();
1457 	if (rcu_rdp_is_offloaded(rdp))
1458 		return;
1459 
1460 	/* Handle nohz enablement switches conservatively. */
1461 	tne = READ_ONCE(tick_nohz_active);
1462 	if (tne != rdp->tick_nohz_enabled_snap) {
1463 		if (!rcu_segcblist_empty(&rdp->cblist))
1464 			invoke_rcu_core(); /* force nohz to see update. */
1465 		rdp->tick_nohz_enabled_snap = tne;
1466 		return;
1467 	}
1468 	if (!tne)
1469 		return;
1470 
1471 	/*
1472 	 * If we have not yet accelerated this jiffy, accelerate all
1473 	 * callbacks on this CPU.
1474 	 */
1475 	if (rdp->last_accelerate == jiffies)
1476 		return;
1477 	rdp->last_accelerate = jiffies;
1478 	if (rcu_segcblist_pend_cbs(&rdp->cblist)) {
1479 		rnp = rdp->mynode;
1480 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
1481 		needwake = rcu_accelerate_cbs(rnp, rdp);
1482 		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
1483 		if (needwake)
1484 			rcu_gp_kthread_wake();
1485 	}
1486 }
1487 
1488 /*
1489  * Clean up for exit from idle.  Attempt to advance callbacks based on
1490  * any grace periods that elapsed while the CPU was idle, and if any
1491  * callbacks are now ready to invoke, initiate invocation.
1492  */
1493 static void rcu_cleanup_after_idle(void)
1494 {
1495 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
1496 
1497 	lockdep_assert_irqs_disabled();
1498 	if (rcu_rdp_is_offloaded(rdp))
1499 		return;
1500 	if (rcu_try_advance_all_cbs())
1501 		invoke_rcu_core();
1502 }
1503 
1504 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1505 
1506 #ifdef CONFIG_RCU_NOCB_CPU
1507 
1508 /*
1509  * Offload callback processing from the boot-time-specified set of CPUs
1510  * specified by rcu_nocb_mask.  For the CPUs in the set, there are kthreads
1511  * created that pull the callbacks from the corresponding CPU, wait for
1512  * a grace period to elapse, and invoke the callbacks.  These kthreads
1513  * are organized into GP kthreads, which manage incoming callbacks, wait for
1514  * grace periods, and awaken CB kthreads, and the CB kthreads, which only
1515  * invoke callbacks.  Each GP kthread invokes its own CBs.  The no-CBs CPUs
1516  * do a wake_up() on their GP kthread when they insert a callback into any
1517  * empty list, unless the rcu_nocb_poll boot parameter has been specified,
1518  * in which case each kthread actively polls its CPU.  (Which isn't so great
1519  * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
1520  *
1521  * This is intended to be used in conjunction with Frederic Weisbecker's
1522  * adaptive-idle work, which would seriously reduce OS jitter on CPUs
1523  * running CPU-bound user-mode computations.
1524  *
1525  * Offloading of callbacks can also be used as an energy-efficiency
1526  * measure because CPUs with no RCU callbacks queued are more aggressive
1527  * about entering dyntick-idle mode.
1528  */
1529 
1530 
1531 /*
1532  * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
1533  * If the list is invalid, a warning is emitted and all CPUs are offloaded.
1534  */
1535 static int __init rcu_nocb_setup(char *str)
1536 {
1537 	alloc_bootmem_cpumask_var(&rcu_nocb_mask);
1538 	if (!strcasecmp(str, "all"))		/* legacy: use "0-N" instead */
1539 		cpumask_setall(rcu_nocb_mask);
1540 	else
1541 		if (cpulist_parse(str, rcu_nocb_mask)) {
1542 			pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
1543 			cpumask_setall(rcu_nocb_mask);
1544 		}
1545 	return 1;
1546 }
1547 __setup("rcu_nocbs=", rcu_nocb_setup);
1548 
1549 static int __init parse_rcu_nocb_poll(char *arg)
1550 {
1551 	rcu_nocb_poll = true;
1552 	return 0;
1553 }
1554 early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
1555 
/*
 * Don't bother bypassing ->cblist if the call_rcu() rate is low.
 * After all, the main point of bypassing is to avoid lock contention
 * on ->nocb_lock, which can happen only at high call_rcu() rates.
 *
 * The default works out to 16 * 1000 / HZ direct enqueues per jiffy
 * (integer division).  The module parameter is registered with
 * permission bits 0.
 */
static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
module_param(nocb_nobypass_lim_per_jiffy, int, 0);
1563 
1564 /*
1565  * Acquire the specified rcu_data structure's ->nocb_bypass_lock.  If the
1566  * lock isn't immediately available, increment ->nocb_lock_contended to
1567  * flag the contention.
1568  */
1569 static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
1570 	__acquires(&rdp->nocb_bypass_lock)
1571 {
1572 	lockdep_assert_irqs_disabled();
1573 	if (raw_spin_trylock(&rdp->nocb_bypass_lock))
1574 		return;
1575 	atomic_inc(&rdp->nocb_lock_contended);
1576 	WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
1577 	smp_mb__after_atomic(); /* atomic_inc() before lock. */
1578 	raw_spin_lock(&rdp->nocb_bypass_lock);
1579 	smp_mb__before_atomic(); /* atomic_dec() after lock. */
1580 	atomic_dec(&rdp->nocb_lock_contended);
1581 }
1582 
1583 /*
1584  * Spinwait until the specified rcu_data structure's ->nocb_lock is
1585  * not contended.  Please note that this is extremely special-purpose,
1586  * relying on the fact that at most two kthreads and one CPU contend for
1587  * this lock, and also that the two kthreads are guaranteed to have frequent
1588  * grace-period-duration time intervals between successive acquisitions
1589  * of the lock.  This allows us to use an extremely simple throttling
1590  * mechanism, and further to apply it only to the CPU doing floods of
1591  * call_rcu() invocations.  Don't try this at home!
1592  */
1593 static void rcu_nocb_wait_contended(struct rcu_data *rdp)
1594 {
1595 	WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
1596 	while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
1597 		cpu_relax();
1598 }
1599 
1600 /*
1601  * Conditionally acquire the specified rcu_data structure's
1602  * ->nocb_bypass_lock.
1603  */
1604 static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
1605 {
1606 	lockdep_assert_irqs_disabled();
1607 	return raw_spin_trylock(&rdp->nocb_bypass_lock);
1608 }
1609 
1610 /*
1611  * Release the specified rcu_data structure's ->nocb_bypass_lock.
1612  */
1613 static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
1614 	__releases(&rdp->nocb_bypass_lock)
1615 {
1616 	lockdep_assert_irqs_disabled();
1617 	raw_spin_unlock(&rdp->nocb_bypass_lock);
1618 }
1619 
1620 /*
1621  * Acquire the specified rcu_data structure's ->nocb_lock, but only
1622  * if it corresponds to a no-CBs CPU.
1623  */
1624 static void rcu_nocb_lock(struct rcu_data *rdp)
1625 {
1626 	lockdep_assert_irqs_disabled();
1627 	if (!rcu_rdp_is_offloaded(rdp))
1628 		return;
1629 	raw_spin_lock(&rdp->nocb_lock);
1630 }
1631 
1632 /*
1633  * Release the specified rcu_data structure's ->nocb_lock, but only
1634  * if it corresponds to a no-CBs CPU.
1635  */
1636 static void rcu_nocb_unlock(struct rcu_data *rdp)
1637 {
1638 	if (rcu_rdp_is_offloaded(rdp)) {
1639 		lockdep_assert_irqs_disabled();
1640 		raw_spin_unlock(&rdp->nocb_lock);
1641 	}
1642 }
1643 
1644 /*
1645  * Release the specified rcu_data structure's ->nocb_lock and restore
1646  * interrupts, but only if it corresponds to a no-CBs CPU.
1647  */
1648 static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
1649 				       unsigned long flags)
1650 {
1651 	if (rcu_rdp_is_offloaded(rdp)) {
1652 		lockdep_assert_irqs_disabled();
1653 		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1654 	} else {
1655 		local_irq_restore(flags);
1656 	}
1657 }
1658 
1659 /* Lockdep check that ->cblist may be safely accessed. */
1660 static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
1661 {
1662 	lockdep_assert_irqs_disabled();
1663 	if (rcu_rdp_is_offloaded(rdp))
1664 		lockdep_assert_held(&rdp->nocb_lock);
1665 }
1666 
/*
 * Grace-period-end hook: wake every no-CBs kthread that was waiting on
 * the given wait queue for the just-ended grace period.
 */
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
	swake_up_all(sq);
}
1675 
1676 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
1677 {
1678 	return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
1679 }
1680 
1681 static void rcu_init_one_nocb(struct rcu_node *rnp)
1682 {
1683 	init_swait_queue_head(&rnp->nocb_gp_wq[0]);
1684 	init_swait_queue_head(&rnp->nocb_gp_wq[1]);
1685 }
1686 
1687 /* Is the specified CPU a no-CBs CPU? */
1688 bool rcu_is_nocb_cpu(int cpu)
1689 {
1690 	if (cpumask_available(rcu_nocb_mask))
1691 		return cpumask_test_cpu(cpu, rcu_nocb_mask);
1692 	return false;
1693 }
1694 
1695 /*
1696  * Kick the GP kthread for this NOCB group.  Caller holds ->nocb_lock
1697  * and this function releases it.
1698  */
1699 static bool wake_nocb_gp(struct rcu_data *rdp, bool force,
1700 			 unsigned long flags)
1701 	__releases(rdp->nocb_lock)
1702 {
1703 	bool needwake = false;
1704 	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
1705 
1706 	lockdep_assert_held(&rdp->nocb_lock);
1707 	if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
1708 		rcu_nocb_unlock_irqrestore(rdp, flags);
1709 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1710 				    TPS("AlreadyAwake"));
1711 		return false;
1712 	}
1713 
1714 	if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) {
1715 		WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
1716 		del_timer(&rdp->nocb_timer);
1717 	}
1718 	rcu_nocb_unlock_irqrestore(rdp, flags);
1719 	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
1720 	if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
1721 		WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
1722 		needwake = true;
1723 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
1724 	}
1725 	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
1726 	if (needwake)
1727 		wake_up_process(rdp_gp->nocb_gp_kthread);
1728 
1729 	return needwake;
1730 }
1731 
1732 /*
1733  * Arrange to wake the GP kthread for this NOCB group at some future
1734  * time when it is safe to do so.
1735  */
1736 static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
1737 			       const char *reason)
1738 {
1739 	if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF)
1740 		return;
1741 	if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
1742 		mod_timer(&rdp->nocb_timer, jiffies + 1);
1743 	if (rdp->nocb_defer_wakeup < waketype)
1744 		WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
1745 	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
1746 }
1747 
1748 /*
1749  * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
1750  * However, if there is a callback to be enqueued and if ->nocb_bypass
1751  * proves to be initially empty, just return false because the no-CB GP
1752  * kthread may need to be awakened in this case.
1753  *
1754  * Note that this function always returns true if rhp is NULL.
1755  */
1756 static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
1757 				     unsigned long j)
1758 {
1759 	struct rcu_cblist rcl;
1760 
1761 	WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
1762 	rcu_lockdep_assert_cblist_protected(rdp);
1763 	lockdep_assert_held(&rdp->nocb_bypass_lock);
1764 	if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
1765 		raw_spin_unlock(&rdp->nocb_bypass_lock);
1766 		return false;
1767 	}
1768 	/* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
1769 	if (rhp)
1770 		rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
1771 	rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
1772 	rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
1773 	WRITE_ONCE(rdp->nocb_bypass_first, j);
1774 	rcu_nocb_bypass_unlock(rdp);
1775 	return true;
1776 }
1777 
/*
 * Locking wrapper around rcu_nocb_do_flush_bypass().
 *
 * For an rdp that is not offloaded there is nothing to flush and true is
 * returned right away.  Otherwise ->nocb_bypass_lock is acquired and the
 * flush (plus optional enqueue of @rhp) is delegated to
 * rcu_nocb_do_flush_bypass(), whose result is returned: false only when
 * @rhp is non-NULL and ->nocb_bypass was empty.
 */
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
				  unsigned long j)
{
	if (rcu_rdp_is_offloaded(rdp)) {
		rcu_lockdep_assert_cblist_protected(rdp);
		rcu_nocb_bypass_lock(rdp);
		return rcu_nocb_do_flush_bypass(rdp, rhp, j);
	}
	return true;
}
1795 
1796 /*
1797  * If the ->nocb_bypass_lock is immediately available, flush the
1798  * ->nocb_bypass queue into ->cblist.
1799  */
1800 static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
1801 {
1802 	rcu_lockdep_assert_cblist_protected(rdp);
1803 	if (!rcu_rdp_is_offloaded(rdp) ||
1804 	    !rcu_nocb_bypass_trylock(rdp))
1805 		return;
1806 	WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
1807 }
1808 
1809 /*
1810  * See whether it is appropriate to use the ->nocb_bypass list in order
1811  * to control contention on ->nocb_lock.  A limited number of direct
1812  * enqueues are permitted into ->cblist per jiffy.  If ->nocb_bypass
1813  * is non-empty, further callbacks must be placed into ->nocb_bypass,
1814  * otherwise rcu_barrier() breaks.  Use rcu_nocb_flush_bypass() to switch
1815  * back to direct use of ->cblist.  However, ->nocb_bypass should not be
1816  * used if ->cblist is empty, because otherwise callbacks can be stranded
1817  * on ->nocb_bypass because we cannot count on the current CPU ever again
1818  * invoking call_rcu().  The general rule is that if ->nocb_bypass is
1819  * non-empty, the corresponding no-CBs grace-period kthread must not be
1820  * in an indefinite sleep state.
1821  *
1822  * Finally, it is not permitted to use the bypass during early boot,
1823  * as doing so would confuse the auto-initialization code.  Besides
1824  * which, there is no point in worrying about lock contention while
1825  * there is only one CPU in operation.
 *
 * Must be called with interrupts disabled; @flags is the saved interrupt
 * state that is restored on the paths that return true.
 *
 * Returns false when the caller must enqueue @rhp itself.  In that case
 * *@was_alldone tells whether ->cblist had no pending callbacks, and,
 * unless the rdp is not offloaded at all, ->nocb_lock has been acquired
 * and is still held on return.
 *
 * Returns true when @rhp has already been queued here, either flushed
 * into ->cblist together with the bypass contents or appended to
 * ->nocb_bypass.  On these paths ->nocb_lock is not held on return and
 * the interrupt state has been restored from @flags.
1826  */
1827 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
1828 				bool *was_alldone, unsigned long flags)
1829 {
1830 	unsigned long c;
1831 	unsigned long cur_gp_seq;
1832 	unsigned long j = jiffies;
	// Unlocked snapshot of the bypass length; it is re-read under
	// ->nocb_bypass_lock further down before the bypass enqueue.
1833 	long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
1834 
1835 	lockdep_assert_irqs_disabled();
1836 
1837 	// Pure softirq/rcuc based processing: no bypassing, no
1838 	// locking.
1839 	if (!rcu_rdp_is_offloaded(rdp)) {
1840 		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1841 		return false;
1842 	}
1843 
1844 	// In the process of (de-)offloading: no bypassing, but
1845 	// locking.
1846 	if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
1847 		rcu_nocb_lock(rdp);
1848 		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1849 		return false; /* Not completely offloaded, no bypassing. */
1850 	}
1851 
1852 	// Don't use ->nocb_bypass during early boot.
1853 	if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
1854 		rcu_nocb_lock(rdp);
1855 		WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
1856 		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1857 		return false;
1858 	}
1859 
1860 	// If we have advanced to a new jiffy, reset counts to allow
1861 	// moving back from ->nocb_bypass to ->cblist.
1862 	if (j == rdp->nocb_nobypass_last) {
1863 		c = rdp->nocb_nobypass_count + 1;
1864 	} else {
1865 		WRITE_ONCE(rdp->nocb_nobypass_last, j);
		// New jiffy: lower the count by one jiffy's allowance,
		// clamping the result to [0, nocb_nobypass_lim_per_jiffy].
1866 		c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
1867 		if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
1868 				 nocb_nobypass_lim_per_jiffy))
1869 			c = 0;
1870 		else if (c > nocb_nobypass_lim_per_jiffy)
1871 			c = nocb_nobypass_lim_per_jiffy;
1872 	}
1873 	WRITE_ONCE(rdp->nocb_nobypass_count, c);
1874 
1875 	// If there hasn't yet been all that many ->cblist enqueues
1876 	// this jiffy, tell the caller to enqueue onto ->cblist.  But flush
1877 	// ->nocb_bypass first.
1878 	if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
1879 		rcu_nocb_lock(rdp);
1880 		*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1881 		if (*was_alldone)
1882 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1883 					    TPS("FirstQ"));
1884 		WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
1885 		WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
1886 		return false; // Caller must enqueue the callback.
1887 	}
1888 
1889 	// If ->nocb_bypass has been used too long or is too full,
1890 	// flush ->nocb_bypass to ->cblist.
1891 	if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
1892 	    ncbs >= qhimark) {
1893 		rcu_nocb_lock(rdp);
		// A false return means the bypass was empty by the time it
		// was locked, so rhp was not enqueued by the flush.
1894 		if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
1895 			*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1896 			if (*was_alldone)
1897 				trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1898 						    TPS("FirstQ"));
1899 			WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
1900 			return false; // Caller must enqueue the callback.
1901 		}
		// At most once per jiffy, advance callbacks whose grace
		// period has already completed.
1902 		if (j != rdp->nocb_gp_adv_time &&
1903 		    rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
1904 		    rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
1905 			rcu_advance_cbs_nowake(rdp->mynode, rdp);
1906 			rdp->nocb_gp_adv_time = j;
1907 		}
1908 		rcu_nocb_unlock_irqrestore(rdp, flags);
1909 		return true; // Callback already enqueued.
1910 	}
1911 
1912 	// We need to use the bypass.
1913 	rcu_nocb_wait_contended(rdp);
1914 	rcu_nocb_bypass_lock(rdp);
1915 	ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
1916 	rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
1917 	rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
1918 	if (!ncbs) {
1919 		WRITE_ONCE(rdp->nocb_bypass_first, j);
1920 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
1921 	}
1922 	rcu_nocb_bypass_unlock(rdp);
1923 	smp_mb(); /* Order enqueue before wake. */
1924 	if (ncbs) {
1925 		local_irq_restore(flags);
1926 	} else {
1927 		// No-CBs GP kthread might be indefinitely asleep, if so, wake.
1928 		rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
1929 		if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
1930 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1931 					    TPS("FirstBQwake"));
			// Releases ->nocb_lock and restores flags.
1932 			__call_rcu_nocb_wake(rdp, true, flags);
1933 		} else {
1934 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1935 					    TPS("FirstBQnoWake"));
1936 			rcu_nocb_unlock_irqrestore(rdp, flags);
1937 		}
1938 	}
1939 	return true; // Callback already enqueued.
1940 }
1941 
1942 /*
1943  * Awaken the no-CBs grace-period kthread if needed, either due to it
1944  * legitimately being asleep or due to overload conditions.
1945  *
1946  * If warranted, also wake up the kthread servicing this CPU's queues.
 *
 * Called with ->nocb_lock held.  Every path through this function
 * releases that lock and restores the interrupt state from @flags.
 *
 * @was_alldone: true if the caller found no pending callbacks before its
 *	enqueue.  In that case the GP kthread is awakened right away when
 *	@flags says interrupts were enabled, otherwise the wakeup is
 *	deferred through wake_nocb_gp_defer().
1947  */
1948 static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
1949 				 unsigned long flags)
1950 				 __releases(rdp->nocb_lock)
1951 {
1952 	unsigned long cur_gp_seq;
1953 	unsigned long j;
1954 	long len;
1955 	struct task_struct *t;
1956 
1957 	// If we are being polled or there is no kthread, just leave.
1958 	t = READ_ONCE(rdp->nocb_gp_kthread);
1959 	if (rcu_nocb_poll || !t) {
1960 		rcu_nocb_unlock_irqrestore(rdp, flags);
1961 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1962 				    TPS("WakeNotPoll"));
1963 		return;
1964 	}
1965 	// Need to actually do a wakeup.
1966 	len = rcu_segcblist_n_cbs(&rdp->cblist);
1967 	if (was_alldone) {
1968 		rdp->qlen_last_fqs_check = len;
1969 		if (!irqs_disabled_flags(flags)) {
1970 			/* ... if queue was empty ... */
			/* wake_nocb_gp() drops ->nocb_lock for us. */
1971 			wake_nocb_gp(rdp, false, flags);
1972 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1973 					    TPS("WakeEmpty"));
1974 		} else {
1975 			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
1976 					   TPS("WakeEmptyIsDeferred"));
1977 			rcu_nocb_unlock_irqrestore(rdp, flags);
1978 		}
1979 	} else if (len > rdp->qlen_last_fqs_check + qhimark) {
1980 		/* ... or if many callbacks queued. */
1981 		rdp->qlen_last_fqs_check = len;
1982 		j = jiffies;
		/* At most once per jiffy, advance callbacks whose GP is done. */
1983 		if (j != rdp->nocb_gp_adv_time &&
1984 		    rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
1985 		    rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
1986 			rcu_advance_cbs_nowake(rdp->mynode, rdp);
1987 			rdp->nocb_gp_adv_time = j;
1988 		}
1989 		smp_mb(); /* Enqueue before timer_pending(). */
1990 		if ((rdp->nocb_cb_sleep ||
1991 		     !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
1992 		    !timer_pending(&rdp->nocb_bypass_timer))
1993 			wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
1994 					   TPS("WakeOvfIsDeferred"));
1995 		rcu_nocb_unlock_irqrestore(rdp, flags);
1996 	} else {
1997 		rcu_nocb_unlock_irqrestore(rdp, flags);
1998 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
1999 	}
2000 	return;
2001 }
2002 
2003 /* Wake up the no-CBs GP kthread to flush ->nocb_bypass. */
2004 static void do_nocb_bypass_wakeup_timer(struct timer_list *t)
2005 {
2006 	unsigned long flags;
2007 	struct rcu_data *rdp = from_timer(rdp, t, nocb_bypass_timer);
2008 
2009 	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
2010 	rcu_nocb_lock_irqsave(rdp, flags);
2011 	smp_mb__after_spinlock(); /* Timer expire before wakeup. */
2012 	__call_rcu_nocb_wake(rdp, true, flags);
2013 }
2014 
2015 /*
2016  * Check if we ignore this rdp.
2017  *
2018  * We check that without holding the nocb lock but
2019  * we make sure not to miss a freshly offloaded rdp
2020  * with the current ordering:
2021  *
2022  *  rdp_offload_toggle()        nocb_gp_enabled_cb()
2023  * -------------------------   ----------------------------
2024  *    WRITE flags                 LOCK nocb_gp_lock
2025  *    LOCK nocb_gp_lock           READ/WRITE nocb_gp_sleep
2026  *    READ/WRITE nocb_gp_sleep    UNLOCK nocb_gp_lock
2027  *    UNLOCK nocb_gp_lock         READ flags
2028  */
2029 static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
2030 {
2031 	u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
2032 
2033 	return rcu_segcblist_test_flags(&rdp->cblist, flags);
2034 }
2035 
2036 static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
2037 						     bool *needwake_state)
2038 {
2039 	struct rcu_segcblist *cblist = &rdp->cblist;
2040 
2041 	if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
2042 		if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
2043 			rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
2044 			if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
2045 				*needwake_state = true;
2046 		}
2047 		return false;
2048 	}
2049 
2050 	/*
2051 	 * De-offloading. Clear our flag and notify the de-offload worker.
2052 	 * We will ignore this rdp until it ever gets re-offloaded.
2053 	 */
2054 	WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
2055 	rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
2056 	if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
2057 		*needwake_state = true;
2058 	return true;
2059 }
2060 
2061 
2062 /*
2063  * No-CBs GP kthreads come here to wait for additional callbacks to show up
2064  * or for grace periods to end.
 *
 * @my_rdp is the rcu_data of the group's GP kthread; the scan starts
 * there and follows the ->nocb_next_cb_rdp links.  One call does a single
 * scan followed by a single sleep:
 *
 *  - when polling (rcu_nocb_poll), sleep for one jiffy;
 *  - when no scanned rdp needs a grace period, sleep on
 *    my_rdp->nocb_gp_wq until ->nocb_gp_sleep is cleared;
 *  - otherwise sleep on the rcu_node's ->nocb_gp_wq[] until the earliest
 *    needed grace period is done or ->nocb_gp_sleep is cleared.
 *
 * When not polling, ->nocb_gp_sleep is set back to true before returning.
2065  */
2066 static void nocb_gp_wait(struct rcu_data *my_rdp)
2067 {
2068 	bool bypass = false;
2069 	long bypass_ncbs;
2070 	int __maybe_unused cpu = my_rdp->cpu;
2071 	unsigned long cur_gp_seq;
2072 	unsigned long flags;
2073 	bool gotcbs = false;
2074 	unsigned long j = jiffies;
2075 	bool needwait_gp = false; // This prevents actual uninitialized use.
2076 	bool needwake;
2077 	bool needwake_gp;
2078 	struct rcu_data *rdp;
2079 	struct rcu_node *rnp;
2080 	unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
2081 	bool wasempty = false;
2082 
2083 	/*
2084 	 * Each pass through the following loop checks for CBs and for the
2085 	 * nearest grace period (if any) to wait for next.  The CB kthreads
2086 	 * and the global grace-period kthread are awakened if needed.
2087 	 */
2088 	WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
2089 	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
2090 		bool needwake_state = false;
2091 
		// Lockless check; see nocb_gp_enabled_cb() for the ordering.
2092 		if (!nocb_gp_enabled_cb(rdp))
2093 			continue;
2094 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
2095 		rcu_nocb_lock_irqsave(rdp, flags);
		// A true return means this rdp is being de-offloaded: skip it.
2096 		if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
2097 			rcu_nocb_unlock_irqrestore(rdp, flags);
2098 			if (needwake_state)
2099 				swake_up_one(&rdp->nocb_state_wq);
2100 			continue;
2101 		}
2102 		bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
2103 		if (bypass_ncbs &&
2104 		    (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
2105 		     bypass_ncbs > 2 * qhimark)) {
2106 			// Bypass full or old, so flush it.
2107 			(void)rcu_nocb_try_flush_bypass(rdp, j);
			// The flush is only attempted (trylock), so re-read.
2108 			bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
2109 		} else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
2110 			rcu_nocb_unlock_irqrestore(rdp, flags);
2111 			if (needwake_state)
2112 				swake_up_one(&rdp->nocb_state_wq);
2113 			continue; /* No callbacks here, try next. */
2114 		}
2115 		if (bypass_ncbs) {
2116 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
2117 					    TPS("Bypass"));
2118 			bypass = true;
2119 		}
2120 		rnp = rdp->mynode;
2121 		if (bypass) {  // Avoid race with first bypass CB.
2122 			WRITE_ONCE(my_rdp->nocb_defer_wakeup,
2123 				   RCU_NOCB_WAKE_NOT);
2124 			del_timer(&my_rdp->nocb_timer);
2125 		}
2126 		// Advance callbacks if helpful and low contention.
2127 		needwake_gp = false;
2128 		if (!rcu_segcblist_restempty(&rdp->cblist,
2129 					     RCU_NEXT_READY_TAIL) ||
2130 		    (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
2131 		     rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
2132 			raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
2133 			needwake_gp = rcu_advance_cbs(rnp, rdp);
2134 			wasempty = rcu_segcblist_restempty(&rdp->cblist,
2135 							   RCU_NEXT_READY_TAIL);
2136 			raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
2137 		}
2138 		// Need to wait on some grace period?
2139 		WARN_ON_ONCE(wasempty &&
2140 			     !rcu_segcblist_restempty(&rdp->cblist,
2141 						      RCU_NEXT_READY_TAIL));
2142 		if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
			// Track the earliest grace period needed by any rdp.
2143 			if (!needwait_gp ||
2144 			    ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
2145 				wait_gp_seq = cur_gp_seq;
2146 			needwait_gp = true;
2147 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
2148 					    TPS("NeedWaitGP"));
2149 		}
2150 		if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
			// Wake the CB kthread only if it was marked asleep.
2151 			needwake = rdp->nocb_cb_sleep;
2152 			WRITE_ONCE(rdp->nocb_cb_sleep, false);
2153 			smp_mb(); /* CB invocation -after- GP end. */
2154 		} else {
2155 			needwake = false;
2156 		}
2157 		rcu_nocb_unlock_irqrestore(rdp, flags);
		// All wakeups are done after ->nocb_lock has been dropped.
2158 		if (needwake) {
2159 			swake_up_one(&rdp->nocb_cb_wq);
2160 			gotcbs = true;
2161 		}
2162 		if (needwake_gp)
2163 			rcu_gp_kthread_wake();
2164 		if (needwake_state)
2165 			swake_up_one(&rdp->nocb_state_wq);
2166 	}
2167 
	// Record what this scan found in the GP kthread's own rdp.
2168 	my_rdp->nocb_gp_bypass = bypass;
2169 	my_rdp->nocb_gp_gp = needwait_gp;
2170 	my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
2171 	if (bypass && !rcu_nocb_poll) {
2172 		// At least one child with non-empty ->nocb_bypass, so set
2173 		// timer in order to avoid stranding its callbacks.
2174 		raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
2175 		mod_timer(&my_rdp->nocb_bypass_timer, j + 2);
2176 		raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
2177 	}
2178 	if (rcu_nocb_poll) {
2179 		/* Polling, so trace if first poll in the series. */
2180 		if (gotcbs)
2181 			trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
2182 		schedule_timeout_idle(1);
2183 	} else if (!needwait_gp) {
2184 		/* Wait for callbacks to appear. */
2185 		trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
2186 		swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
2187 				!READ_ONCE(my_rdp->nocb_gp_sleep));
2188 		trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
2189 	} else {
		/* Wait for the grace period, or for an explicit wakeup. */
2190 		rnp = my_rdp->mynode;
2191 		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
2192 		swait_event_interruptible_exclusive(
2193 			rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
2194 			rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
2195 			!READ_ONCE(my_rdp->nocb_gp_sleep));
2196 		trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
2197 	}
2198 	if (!rcu_nocb_poll) {
		/* Back to "asleep" state; the bypass timer is no longer needed. */
2199 		raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
2200 		if (bypass)
2201 			del_timer(&my_rdp->nocb_bypass_timer);
2202 		WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
2203 		raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
2204 	}
2205 	my_rdp->nocb_gp_seq = -1;
2206 	WARN_ON(signal_pending(current));
2207 }
2208 
2209 /*
2210  * No-CBs grace-period-wait kthread.  There is one of these per group
2211  * of CPUs, but only once at least one CPU in that group has come online
2212  * at least once since boot.  This kthread checks for newly posted
2213  * callbacks from any of the CPUs it is responsible for, waits for a
2214  * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
2215  * that then have callback-invocation work to do.
2216  */
2217 static int rcu_nocb_gp_kthread(void *arg)
2218 {
2219 	struct rcu_data *rdp = arg;
2220 
2221 	for (;;) {
2222 		WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
2223 		nocb_gp_wait(rdp);
2224 		cond_resched_tasks_rcu_qs();
2225 	}
2226 	return 0;
2227 }
2228 
2229 static inline bool nocb_cb_can_run(struct rcu_data *rdp)
2230 {
2231 	u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
2232 	return rcu_segcblist_test_flags(&rdp->cblist, flags);
2233 }
2234 
2235 static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
2236 {
2237 	return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
2238 }
2239 
2240 /*
2241  * Invoke any ready callbacks from the corresponding no-CBs CPU,
2242  * then, if there are no more, wait for more to appear.
 *
 * This is also the CB-kthread side of the (de-)offloading handshake:
 * SEGCBLIST_KTHREAD_CB is set or cleared here depending on
 * SEGCBLIST_OFFLOADED, and ->nocb_state_wq is woken when the
 * SEGCBLIST_KTHREAD_GP flag is found in the matching state.
2243  */
2244 static void nocb_cb_wait(struct rcu_data *rdp)
2245 {
2246 	struct rcu_segcblist *cblist = &rdp->cblist;
2247 	unsigned long cur_gp_seq;
2248 	unsigned long flags;
2249 	bool needwake_state = false;
2250 	bool needwake_gp = false;
2251 	bool can_sleep = true;
2252 	struct rcu_node *rnp = rdp->mynode;
2253 
2254 	local_irq_save(flags);
2255 	rcu_momentary_dyntick_idle();
2256 	local_irq_restore(flags);
2257 	/*
2258 	 * Disable BH to provide the expected environment.  Also, when
2259 	 * transitioning to/from NOCB mode, a self-requeuing callback might
2260 	 * be invoked from softirq.  A short grace period could cause both
2261 	 * instances of this callback to execute concurrently.
2262 	 */
2263 	local_bh_disable();
2264 	rcu_do_batch(rdp);
2265 	local_bh_enable();
2266 	lockdep_assert_irqs_enabled();
2267 	rcu_nocb_lock_irqsave(rdp, flags);
	/* Advance callbacks only if the rcu_node lock is free (trylock). */
2268 	if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
2269 	    rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
2270 	    raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
2271 		needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
2272 		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
2273 	}
2274 
2275 	if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
		/* Offloaded: set our flag if it is not set yet. */
2276 		if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
2277 			rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
2278 			if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
2279 				needwake_state = true;
2280 		}
		/* Callbacks are ready, so do not mark ourselves asleep. */
2281 		if (rcu_segcblist_ready_cbs(cblist))
2282 			can_sleep = false;
2283 	} else {
2284 		/*
2285 		 * De-offloading. Clear our flag and notify the de-offload worker.
2286 		 * We won't touch the callbacks and keep sleeping until we ever
2287 		 * get re-offloaded.
2288 		 */
2289 		WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
2290 		rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
2291 		if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
2292 			needwake_state = true;
2293 	}
2294 
2295 	WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
2296 
2297 	if (rdp->nocb_cb_sleep)
2298 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
2299 
2300 	rcu_nocb_unlock_irqrestore(rdp, flags);
	/* Wakeups are issued only after ->nocb_lock has been dropped. */
2301 	if (needwake_gp)
2302 		rcu_gp_kthread_wake();
2303 
2304 	if (needwake_state)
2305 		swake_up_one(&rdp->nocb_state_wq);
2306 
	/* Keep waiting for as long as nocb_cb_can_run() says no. */
2307 	do {
2308 		swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
2309 						    nocb_cb_wait_cond(rdp));
2310 
2311 		// VVV Ensure CB invocation follows _sleep test.
2312 		if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
2313 			WARN_ON(signal_pending(current));
2314 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
2315 		}
2316 	} while (!nocb_cb_can_run(rdp));
2317 }
2318 
/*
 * Body of the per-rcu_data callback kthread, which exists only for
 * no-CBs CPUs.  All the real work is delegated to nocb_cb_wait(); the
 * loop never terminates.
 */
static int rcu_nocb_cb_kthread(void *arg)
{
	struct rcu_data *rdp = arg;

	/*
	 * One iteration invokes one batch of callbacks and then, if no
	 * more callbacks are ready, waits for some to show up.
	 */
	while (true) {
		nocb_cb_wait(rdp);
		cond_resched_tasks_rcu_qs();
	}
	return 0;
}
2335 
2336 /* Is a deferred wakeup of rcu_nocb_kthread() required? */
2337 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2338 {
2339 	return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT;
2340 }
2341 
2342 /* Do a deferred wakeup of rcu_nocb_kthread(). */
2343 static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
2344 {
2345 	unsigned long flags;
2346 	int ndw;
2347 	int ret;
2348 
2349 	rcu_nocb_lock_irqsave(rdp, flags);
2350 	if (!rcu_nocb_need_deferred_wakeup(rdp)) {
2351 		rcu_nocb_unlock_irqrestore(rdp, flags);
2352 		return false;
2353 	}
2354 	ndw = READ_ONCE(rdp->nocb_defer_wakeup);
2355 	ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
2356 	trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
2357 
2358 	return ret;
2359 }
2360 
2361 /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
2362 static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
2363 {
2364 	struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
2365 
2366 	do_nocb_deferred_wakeup_common(rdp);
2367 }
2368 
/*
 * Fastpath variant of the deferred wakeup of the no-CBs GP kthread.
 * The first check is done without any locking and is therefore inexact;
 * if it misses, ->nocb_timer will eventually clean things up.
 *
 * Returns the result of do_nocb_deferred_wakeup_common() if a wakeup
 * looked pending, false otherwise.
 */
static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
	return rcu_nocb_need_deferred_wakeup(rdp) &&
	       do_nocb_deferred_wakeup_common(rdp);
}
2380 
2381 void rcu_nocb_flush_deferred_wakeup(void)
2382 {
2383 	do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
2384 }
2385 EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
2386 
2387 static int rdp_offload_toggle(struct rcu_data *rdp,
2388 			       bool offload, unsigned long flags)
2389 	__releases(rdp->nocb_lock)
2390 {
2391 	struct rcu_segcblist *cblist = &rdp->cblist;
2392 	struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
2393 	bool wake_gp = false;
2394 
2395 	rcu_segcblist_offload(cblist, offload);
2396 
2397 	if (rdp->nocb_cb_sleep)
2398 		rdp->nocb_cb_sleep = false;
2399 	rcu_nocb_unlock_irqrestore(rdp, flags);
2400 
2401 	/*
2402 	 * Ignore former value of nocb_cb_sleep and force wake up as it could
2403 	 * have been spuriously set to false already.
2404 	 */
2405 	swake_up_one(&rdp->nocb_cb_wq);
2406 
2407 	raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
2408 	if (rdp_gp->nocb_gp_sleep) {
2409 		rdp_gp->nocb_gp_sleep = false;
2410 		wake_gp = true;
2411 	}
2412 	raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
2413 
2414 	if (wake_gp)
2415 		wake_up_process(rdp_gp->nocb_gp_kthread);
2416 
2417 	return 0;
2418 }
2419 
/*
 * De-offload the callbacks of one rdp.  Invoked through work_on_cpu()
 * by rcu_nocb_cpu_deoffload(), with @arg being the rcu_data to
 * de-offload; it is expected to run on that rdp's own CPU.
 *
 * Returns the result of rdp_offload_toggle().
 */
2420 static long rcu_nocb_rdp_deoffload(void *arg)
2421 {
2422 	struct rcu_data *rdp = arg;
2423 	struct rcu_segcblist *cblist = &rdp->cblist;
2424 	unsigned long flags;
2425 	int ret;
2426 
2427 	WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
2428 
2429 	pr_info("De-offloading %d\n", rdp->cpu);
2430 
2431 	rcu_nocb_lock_irqsave(rdp, flags);
2432 	/*
2433 	 * Flush once and for all now. This suffices because we are
2434 	 * running on the target CPU holding ->nocb_lock (thus having
2435 	 * interrupts disabled), and because rdp_offload_toggle()
2436 	 * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
2437 	 * Thus future calls to rcu_segcblist_completely_offloaded() will
2438 	 * return false, which means that future calls to rcu_nocb_try_bypass()
2439 	 * will refuse to put anything into the bypass.
2440 	 */
2441 	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
	/* rdp_offload_toggle() releases ->nocb_lock and restores flags. */
2442 	ret = rdp_offload_toggle(rdp, false, flags);
	/* Wait for both nocb kthreads to have cleared their flag. */
2443 	swait_event_exclusive(rdp->nocb_state_wq,
2444 			      !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
2445 							SEGCBLIST_KTHREAD_GP));
2446 	rcu_nocb_lock_irqsave(rdp, flags);
2447 	/* Make sure nocb timer won't stay around */
2448 	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF);
2449 	rcu_nocb_unlock_irqrestore(rdp, flags);
2450 	del_timer_sync(&rdp->nocb_timer);
2451 
2452 	/*
2453 	 * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY with CB unlocked
2454 	 * and IRQs disabled but let's be paranoid.
2455 	 */
2456 	rcu_nocb_lock_irqsave(rdp, flags);
2457 	rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
2458 	/*
2459 	 * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
2460 	 * rcu_nocb_unlock_irqrestore() anymore.
2461 	 */
2462 	raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
2463 
2464 	/* Sanity check */
2465 	WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
2466 
2467 
2468 	return ret;
2469 }
2470 
2471 int rcu_nocb_cpu_deoffload(int cpu)
2472 {
2473 	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
2474 	int ret = 0;
2475 
2476 	if (rdp == rdp->nocb_gp_rdp) {
2477 		pr_info("Can't deoffload an rdp GP leader (yet)\n");
2478 		return -EINVAL;
2479 	}
2480 	mutex_lock(&rcu_state.barrier_mutex);
2481 	cpus_read_lock();
2482 	if (rcu_rdp_is_offloaded(rdp)) {
2483 		if (cpu_online(cpu)) {
2484 			ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
2485 			if (!ret)
2486 				cpumask_clear_cpu(cpu, rcu_nocb_mask);
2487 		} else {
2488 			pr_info("NOCB: Can't CB-deoffload an offline CPU\n");
2489 			ret = -EINVAL;
2490 		}
2491 	}
2492 	cpus_read_unlock();
2493 	mutex_unlock(&rcu_state.barrier_mutex);
2494 
2495 	return ret;
2496 }
2497 EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
2498 
/*
 * (Re-)offload the callbacks of one rdp.  Invoked through work_on_cpu()
 * by rcu_nocb_cpu_offload(), with @arg being the rcu_data to offload; it
 * is expected to run on that rdp's own CPU.
 *
 * Returns -EINVAL if the rdp has no ->nocb_gp_rdp, otherwise the result
 * of rdp_offload_toggle().
 */
2499 static long rcu_nocb_rdp_offload(void *arg)
2500 {
2501 	struct rcu_data *rdp = arg;
2502 	struct rcu_segcblist *cblist = &rdp->cblist;
2503 	unsigned long flags;
2504 	int ret;
2505 
2506 	WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
2507 	/*
2508 	 * For now we only support re-offload, ie: the rdp must have been
2509 	 * offloaded on boot first.
2510 	 */
2511 	if (!rdp->nocb_gp_rdp)
2512 		return -EINVAL;
2513 
2514 	pr_info("Offloading %d\n", rdp->cpu);
2515 	/*
2516 	 * Can't use rcu_nocb_lock_irqsave() while we are in
2517 	 * SEGCBLIST_SOFTIRQ_ONLY mode.
2518 	 */
2519 	raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
2520 	/* Re-enable nocb timer */
2521 	WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
2522 	/*
2523 	 * We didn't take the nocb lock while working on the
2524 	 * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
2525 	 * Every modification that has been done previously on
2526 	 * rdp->cblist must be visible remotely by the nocb kthreads
2527 	 * upon wake up after reading the cblist flags.
2528 	 *
2529 	 * The layout against nocb_lock enforces that ordering:
2530 	 *
2531 	 *  rcu_nocb_rdp_offload()     nocb_cb_wait()/nocb_gp_wait()
2532 	 * -------------------------   ----------------------------
2533 	 *      WRITE callbacks           rcu_nocb_lock()
2534 	 *      rcu_nocb_lock()           READ flags
2535 	 *      WRITE flags               READ callbacks
2536 	 *      rcu_nocb_unlock()         rcu_nocb_unlock()
2537 	 */
	/* rdp_offload_toggle() releases ->nocb_lock and restores flags. */
2538 	ret = rdp_offload_toggle(rdp, true, flags);
	/* Wait for both nocb kthreads to have set their flag. */
2539 	swait_event_exclusive(rdp->nocb_state_wq,
2540 			      rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
2541 			      rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
2542 
2543 	return ret;
2544 }
2545 
2546 int rcu_nocb_cpu_offload(int cpu)
2547 {
2548 	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
2549 	int ret = 0;
2550 
2551 	mutex_lock(&rcu_state.barrier_mutex);
2552 	cpus_read_lock();
2553 	if (!rcu_rdp_is_offloaded(rdp)) {
2554 		if (cpu_online(cpu)) {
2555 			ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
2556 			if (!ret)
2557 				cpumask_set_cpu(cpu, rcu_nocb_mask);
2558 		} else {
2559 			pr_info("NOCB: Can't CB-offload an offline CPU\n");
2560 			ret = -EINVAL;
2561 		}
2562 	}
2563 	cpus_read_unlock();
2564 	mutex_unlock(&rcu_state.barrier_mutex);
2565 
2566 	return ret;
2567 }
2568 EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
2569 
2570 void __init rcu_init_nohz(void)
2571 {
2572 	int cpu;
2573 	bool need_rcu_nocb_mask = false;
2574 	struct rcu_data *rdp;
2575 
2576 #if defined(CONFIG_NO_HZ_FULL)
2577 	if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
2578 		need_rcu_nocb_mask = true;
2579 #endif /* #if defined(CONFIG_NO_HZ_FULL) */
2580 
2581 	if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
2582 		if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
2583 			pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
2584 			return;
2585 		}
2586 	}
2587 	if (!cpumask_available(rcu_nocb_mask))
2588 		return;
2589 
2590 #if defined(CONFIG_NO_HZ_FULL)
2591 	if (tick_nohz_full_running)
2592 		cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
2593 #endif /* #if defined(CONFIG_NO_HZ_FULL) */
2594 
2595 	if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
2596 		pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
2597 		cpumask_and(rcu_nocb_mask, cpu_possible_mask,
2598 			    rcu_nocb_mask);
2599 	}
2600 	if (cpumask_empty(rcu_nocb_mask))
2601 		pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
2602 	else
2603 		pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
2604 			cpumask_pr_args(rcu_nocb_mask));
2605 	if (rcu_nocb_poll)
2606 		pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2607 
2608 	for_each_cpu(cpu, rcu_nocb_mask) {
2609 		rdp = per_cpu_ptr(&rcu_data, cpu);
2610 		if (rcu_segcblist_empty(&rdp->cblist))
2611 			rcu_segcblist_init(&rdp->cblist);
2612 		rcu_segcblist_offload(&rdp->cblist, true);
2613 		rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
2614 		rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
2615 	}
2616 	rcu_organize_nocb_kthreads();
2617 }
2618 
2619 /* Initialize per-rcu_data variables for no-CBs CPUs. */
2620 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2621 {
2622 	init_swait_queue_head(&rdp->nocb_cb_wq);
2623 	init_swait_queue_head(&rdp->nocb_gp_wq);
2624 	init_swait_queue_head(&rdp->nocb_state_wq);
2625 	raw_spin_lock_init(&rdp->nocb_lock);
2626 	raw_spin_lock_init(&rdp->nocb_bypass_lock);
2627 	raw_spin_lock_init(&rdp->nocb_gp_lock);
2628 	timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
2629 	timer_setup(&rdp->nocb_bypass_timer, do_nocb_bypass_wakeup_timer, 0);
2630 	rcu_cblist_init(&rdp->nocb_bypass);
2631 }
2632 
2633 /*
2634  * If the specified CPU is a no-CBs CPU that does not already have its
2635  * rcuo CB kthread, spawn it.  Additionally, if the rcuo GP kthread
2636  * for this CPU's group has not yet been created, spawn it as well.
2637  */
2638 static void rcu_spawn_one_nocb_kthread(int cpu)
2639 {
2640 	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
2641 	struct rcu_data *rdp_gp;
2642 	struct task_struct *t;
2643 
2644 	/*
2645 	 * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
2646 	 * then nothing to do.
2647 	 */
2648 	if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
2649 		return;
2650 
2651 	/* If we didn't spawn the GP kthread first, reorganize! */
2652 	rdp_gp = rdp->nocb_gp_rdp;
2653 	if (!rdp_gp->nocb_gp_kthread) {
2654 		t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
2655 				"rcuog/%d", rdp_gp->cpu);
2656 		if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
2657 			return;
2658 		WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
2659 	}
2660 
2661 	/* Spawn the kthread for this CPU. */
2662 	t = kthread_run(rcu_nocb_cb_kthread, rdp,
2663 			"rcuo%c/%d", rcu_state.abbr, cpu);
2664 	if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
2665 		return;
2666 	WRITE_ONCE(rdp->nocb_cb_kthread, t);
2667 	WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
2668 }
2669 
2670 /*
2671  * If the specified CPU is a no-CBs CPU that does not already have its
2672  * rcuo kthread, spawn it.
2673  */
2674 static void rcu_spawn_cpu_nocb_kthread(int cpu)
2675 {
2676 	if (rcu_scheduler_fully_active)
2677 		rcu_spawn_one_nocb_kthread(cpu);
2678 }
2679 
2680 /*
2681  * Once the scheduler is running, spawn rcuo kthreads for all online
2682  * no-CBs CPUs.  This assumes that the early_initcall()s happen before
2683  * non-boot CPUs come online -- if this changes, we will need to add
2684  * some mutual exclusion.
2685  */
2686 static void __init rcu_spawn_nocb_kthreads(void)
2687 {
2688 	int cpu;
2689 
2690 	for_each_online_cpu(cpu)
2691 		rcu_spawn_cpu_nocb_kthread(cpu);
2692 }
2693 
/*
 * How many CB CPU IDs per GP kthread?  The default of -1 asks
 * rcu_organize_nocb_kthreads() to compute the stride itself as
 * nr_cpu_ids / int_sqrt(nr_cpu_ids), i.e. roughly sqrt(nr_cpu_ids).
 * Exposed as a module parameter with mode 0444.
 */
static int rcu_nocb_gp_stride = -1;
module_param(rcu_nocb_gp_stride, int, 0444);
2697 
2698 /*
2699  * Initialize GP-CB relationships for all no-CBs CPU.
2700  */
2701 static void __init rcu_organize_nocb_kthreads(void)
2702 {
2703 	int cpu;
2704 	bool firsttime = true;
2705 	bool gotnocbs = false;
2706 	bool gotnocbscbs = true;
2707 	int ls = rcu_nocb_gp_stride;
2708 	int nl = 0;  /* Next GP kthread. */
2709 	struct rcu_data *rdp;
2710 	struct rcu_data *rdp_gp = NULL;  /* Suppress misguided gcc warn. */
2711 	struct rcu_data *rdp_prev = NULL;
2712 
2713 	if (!cpumask_available(rcu_nocb_mask))
2714 		return;
2715 	if (ls == -1) {
2716 		ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
2717 		rcu_nocb_gp_stride = ls;
2718 	}
2719 
2720 	/*
2721 	 * Each pass through this loop sets up one rcu_data structure.
2722 	 * Should the corresponding CPU come online in the future, then
2723 	 * we will spawn the needed set of rcu_nocb_kthread() kthreads.
2724 	 */
2725 	for_each_cpu(cpu, rcu_nocb_mask) {
2726 		rdp = per_cpu_ptr(&rcu_data, cpu);
2727 		if (rdp->cpu >= nl) {
2728 			/* New GP kthread, set up for CBs & next GP. */
2729 			gotnocbs = true;
2730 			nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
2731 			rdp->nocb_gp_rdp = rdp;
2732 			rdp_gp = rdp;
2733 			if (dump_tree) {
2734 				if (!firsttime)
2735 					pr_cont("%s\n", gotnocbscbs
2736 							? "" : " (self only)");
2737 				gotnocbscbs = false;
2738 				firsttime = false;
2739 				pr_alert("%s: No-CB GP kthread CPU %d:",
2740 					 __func__, cpu);
2741 			}
2742 		} else {
2743 			/* Another CB kthread, link to previous GP kthread. */
2744 			gotnocbscbs = true;
2745 			rdp->nocb_gp_rdp = rdp_gp;
2746 			rdp_prev->nocb_next_cb_rdp = rdp;
2747 			if (dump_tree)
2748 				pr_cont(" %d", cpu);
2749 		}
2750 		rdp_prev = rdp;
2751 	}
2752 	if (gotnocbs && dump_tree)
2753 		pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
2754 }
2755 
2756 /*
2757  * Bind the current task to the offloaded CPUs.  If there are no offloaded
2758  * CPUs, leave the task unbound.  Splat if the bind attempt fails.
2759  */
2760 void rcu_bind_current_to_nocb(void)
2761 {
2762 	if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
2763 		WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
2764 }
2765 EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
2766 
/*
 * Return "!" for a non-NULL task whose state is TASK_RUNNING but which
 * is not currently on a CPU, and "" in every other case.  The ->on_cpu
 * field is available only in CONFIG_SMP=y kernels, so other builds
 * always get "".
 */
static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
{
#ifdef CONFIG_SMP
	if (tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu)
		return "!";
#endif // #ifdef CONFIG_SMP
	return "";
}
2779 
2780 /*
2781  * Dump out nocb grace-period kthread state for the specified rcu_data
2782  * structure.
2783  */
2784 static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
2785 {
2786 	struct rcu_node *rnp = rdp->mynode;
2787 
2788 	pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
2789 		rdp->cpu,
2790 		"kK"[!!rdp->nocb_gp_kthread],
2791 		"lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
2792 		"dD"[!!rdp->nocb_defer_wakeup],
2793 		"tT"[timer_pending(&rdp->nocb_timer)],
2794 		"bB"[timer_pending(&rdp->nocb_bypass_timer)],
2795 		"sS"[!!rdp->nocb_gp_sleep],
2796 		".W"[swait_active(&rdp->nocb_gp_wq)],
2797 		".W"[swait_active(&rnp->nocb_gp_wq[0])],
2798 		".W"[swait_active(&rnp->nocb_gp_wq[1])],
2799 		".B"[!!rdp->nocb_gp_bypass],
2800 		".G"[!!rdp->nocb_gp_gp],
2801 		(long)rdp->nocb_gp_seq,
2802 		rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
2803 		rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
2804 		rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
2805 		show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
2806 }
2807 
2808 /* Dump out nocb kthread state for the specified rcu_data structure. */
2809 static void show_rcu_nocb_state(struct rcu_data *rdp)
2810 {
2811 	char bufw[20];
2812 	char bufr[20];
2813 	struct rcu_segcblist *rsclp = &rdp->cblist;
2814 	bool waslocked;
2815 	bool wastimer;
2816 	bool wassleep;
2817 
2818 	if (rdp->nocb_gp_rdp == rdp)
2819 		show_rcu_nocb_gp_state(rdp);
2820 
2821 	sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
2822 	sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
2823 	pr_info("   CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
2824 		rdp->cpu, rdp->nocb_gp_rdp->cpu,
2825 		rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
2826 		"kK"[!!rdp->nocb_cb_kthread],
2827 		"bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
2828 		"cC"[!!atomic_read(&rdp->nocb_lock_contended)],
2829 		"lL"[raw_spin_is_locked(&rdp->nocb_lock)],
2830 		"sS"[!!rdp->nocb_cb_sleep],
2831 		".W"[swait_active(&rdp->nocb_cb_wq)],
2832 		jiffies - rdp->nocb_bypass_first,
2833 		jiffies - rdp->nocb_nobypass_last,
2834 		rdp->nocb_nobypass_count,
2835 		".D"[rcu_segcblist_ready_cbs(rsclp)],
2836 		".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
2837 		rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
2838 		".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
2839 		rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
2840 		".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
2841 		".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
2842 		rcu_segcblist_n_cbs(&rdp->cblist),
2843 		rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
2844 		rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
2845 		show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
2846 
2847 	/* It is OK for GP kthreads to have GP state. */
2848 	if (rdp->nocb_gp_rdp == rdp)
2849 		return;
2850 
2851 	waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
2852 	wastimer = timer_pending(&rdp->nocb_bypass_timer);
2853 	wassleep = swait_active(&rdp->nocb_gp_wq);
2854 	if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep)
2855 		return;  /* Nothing untowards. */
2856 
2857 	pr_info("   nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n",
2858 		"lL"[waslocked],
2859 		"dD"[!!rdp->nocb_defer_wakeup],
2860 		"tT"[wastimer],
2861 		"sS"[!!rdp->nocb_gp_sleep],
2862 		".W"[wassleep]);
2863 }
2864 
2865 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
2866 
/*
 * !CONFIG_RCU_NOCB_CPU stub: there is no ->nocb_lock to acquire, so this
 * does nothing.
 */
static void rcu_nocb_lock(struct rcu_data *rdp)
{
}
2871 
/*
 * !CONFIG_RCU_NOCB_CPU stub: there is no ->nocb_lock to release, so this
 * does nothing.
 */
static void rcu_nocb_unlock(struct rcu_data *rdp)
{
}
2876 
/*
 * !CONFIG_RCU_NOCB_CPU stub: there is no ->nocb_lock to release, so the
 * only work left is to restore the interrupt state saved in @flags.
 */
static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
				       unsigned long flags)
{
	local_irq_restore(flags);
}
2883 
/*
 * Lockdep check that ->cblist may be safely accessed.  In this
 * !CONFIG_RCU_NOCB_CPU variant the only thing asserted is that
 * interrupts are disabled.
 */
static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
{
	lockdep_assert_irqs_disabled();
}
2889 
/* !CONFIG_RCU_NOCB_CPU stub: nothing to clean up for @sq. */
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
}
2893 
/* !CONFIG_RCU_NOCB_CPU stub: no wait queue is handed out, always NULL. */
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
{
	return NULL;
}
2898 
/* !CONFIG_RCU_NOCB_CPU stub: no no-CBs state to set up in @rnp. */
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}
2902 
/*
 * !CONFIG_RCU_NOCB_CPU stub: does nothing with its arguments and
 * unconditionally reports true.
 */
static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
				  unsigned long j)
{
	return true;
}
2908 
/*
 * !CONFIG_RCU_NOCB_CPU stub: never takes the bypass path.  Always
 * returns false and leaves @was_alldone untouched.
 */
static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
				bool *was_alldone, unsigned long flags)
{
	return false;
}
2914 
/*
 * !CONFIG_RCU_NOCB_CPU stub: this is not expected to be reached in such
 * kernels, so complain (once) if it ever is.
 */
static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
				 unsigned long flags)
{
	WARN_ON_ONCE(1);  /* Should be dead code! */
}
2920 
/* !CONFIG_RCU_NOCB_CPU stub: no no-CBs fields in @rdp to initialize. */
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
}
2924 
/* !CONFIG_RCU_NOCB_CPU stub: a deferred wakeup is never needed. */
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
{
	return false;
}
2929 
/* !CONFIG_RCU_NOCB_CPU stub: no wakeup is performed, always false. */
static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
{
	return false;
}
2934 
/* !CONFIG_RCU_NOCB_CPU stub: no rcuo kthreads to spawn for @cpu. */
static void rcu_spawn_cpu_nocb_kthread(int cpu)
{
}
2938 
/* !CONFIG_RCU_NOCB_CPU stub: no rcuo kthreads to spawn at boot. */
static void __init rcu_spawn_nocb_kthreads(void)
{
}
2942 
/* !CONFIG_RCU_NOCB_CPU stub: no no-CBs state to dump for @rdp. */
static void show_rcu_nocb_state(struct rcu_data *rdp)
{
}
2946 
2947 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
2948 
/*
 * Should the current CPU, as a NO_HZ_FULL CPU, ignore RCU and leave it
 * to the grace-period kthread to do force_quiescent_state() processing?
 * The point is to avoid waking up RCU core processing on such a CPU
 * unless the grace period has gone on for too long.
 *
 * Returns true only for a nohz_full CPU when either no grace period is
 * in progress or the current one started less than HZ jiffies ago.
 * Always false without CONFIG_NO_HZ_FULL.
 *
 * This relies on every NO_HZ_FULL CPU also being a CONFIG_RCU_NOCB_CPU
 * CPU.
 */
static bool rcu_nohz_full_cpu(void)
{
#ifdef CONFIG_NO_HZ_FULL
	/* Only nohz_full CPUs are candidates. */
	if (!tick_nohz_full_cpu(smp_processor_id()))
		return false;

	/* No grace period to help along. */
	if (!rcu_gp_in_progress())
		return true;

	/* Stay out of the way while the grace period is still young. */
	return time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ);
#else /* #ifdef CONFIG_NO_HZ_FULL */
	return false;
#endif /* #else #ifdef CONFIG_NO_HZ_FULL */
}
2968 
2969 /*
2970  * Bind the RCU grace-period kthreads to the housekeeping CPU.
2971  */
2972 static void rcu_bind_gp_kthread(void)
2973 {
2974 	if (!tick_nohz_full_enabled())
2975 		return;
2976 	housekeeping_affine(current, HK_FLAG_RCU);
2977 }
2978 
/*
 * On dyntick-idle entry, record in the current task's
 * ->rcu_tasks_idle_cpu the CPU it is running on.  Compiled in only when
 * both CONFIG_TASKS_RCU and CONFIG_NO_HZ_FULL are set; otherwise empty.
 */
static void noinstr rcu_dynticks_task_enter(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
	WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
2986 
/*
 * On dyntick-idle exit, reset the current task's ->rcu_tasks_idle_cpu
 * to -1, i.e. "no CPU".  Compiled in only when both CONFIG_TASKS_RCU
 * and CONFIG_NO_HZ_FULL are set; otherwise empty.
 */
static void noinstr rcu_dynticks_task_exit(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
	WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
2994 
/*
 * Turn on heavyweight RCU tasks trace readers on idle/user entry by
 * setting ->trc_reader_special.b.need_mb for the current task.  This is
 * done only in CONFIG_TASKS_TRACE_RCU kernels that also have
 * CONFIG_TASKS_TRACE_RCU_READ_MB enabled; otherwise this is a no-op.
 *
 * The guard must name CONFIG_TASKS_TRACE_RCU, the option that matches
 * CONFIG_TASKS_TRACE_RCU_READ_MB below; with a misspelled symbol the
 * body would be compiled out.
 */
static void rcu_dynticks_task_trace_enter(void)
{
#ifdef CONFIG_TASKS_TRACE_RCU
	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
		current->trc_reader_special.b.need_mb = true;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
3003 
/*
 * Turn off heavyweight RCU tasks trace readers on idle/user exit by
 * clearing ->trc_reader_special.b.need_mb for the current task.  This is
 * done only in CONFIG_TASKS_TRACE_RCU kernels that also have
 * CONFIG_TASKS_TRACE_RCU_READ_MB enabled; otherwise this is a no-op.
 *
 * The guard must name CONFIG_TASKS_TRACE_RCU, the option that matches
 * CONFIG_TASKS_TRACE_RCU_READ_MB below; with a misspelled symbol the
 * body would be compiled out.
 */
static void rcu_dynticks_task_trace_exit(void)
{
#ifdef CONFIG_TASKS_TRACE_RCU
	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
		current->trc_reader_special.b.need_mb = false;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
3012