/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
 * Internal non-public definitions that provide either classic
 * or preemptible semantics.
 *
 * Copyright Red Hat, 2009
 * Copyright IBM Corporation, 2009
 *
 * Author: Ingo Molnar <mingo@elte.hu>
 *	   Paul E. McKenney <paulmck@linux.ibm.com>
 */

#include <linux/delay.h>
#include <linux/gfp.h>
#include <linux/oom.h>
#include <linux/sched/debug.h>
#include <linux/smpboot.h>
#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include "../time/tick-internal.h"

#ifdef CONFIG_RCU_BOOST
#include "../locking/rtmutex_common.h"
#else /* #ifdef CONFIG_RCU_BOOST */

/*
 * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
 * all uses are in dead code.  Provide a definition to keep the compiler
 * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
 * This probably needs to be excluded from -rt builds.
 */
#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)

#endif /* #else #ifdef CONFIG_RCU_BOOST */

#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool __read_mostly rcu_nocb_poll;    /* Offload kthreads are to poll. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */

/*
 * Check the RCU kernel configuration parameters and print informative
 * messages about anything out of the ordinary.
 */
static void __init rcu_bootup_announce_oddness(void)
{
	if (IS_ENABLED(CONFIG_RCU_TRACE))
		pr_info("\tRCU event tracing is enabled.\n");
	if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
	    (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
		pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n",
			RCU_FANOUT);
	if (rcu_fanout_exact)
		pr_info("\tHierarchical RCU autobalancing is disabled.\n");
	if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
		pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
	if (IS_ENABLED(CONFIG_PROVE_RCU))
		pr_info("\tRCU lockdep checking is enabled.\n");
	if (RCU_NUM_LVLS >= 4)
		pr_info("\tFour(or more)-level hierarchy is enabled.\n");
	if (RCU_FANOUT_LEAF != 16)
		pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
			RCU_FANOUT_LEAF);
	if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
		pr_info("\tBoot-time adjustment of leaf fanout to %d.\n",
			rcu_fanout_leaf);
	if (nr_cpu_ids != NR_CPUS)
		pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_BOOST
	pr_info("\tRCU priority boosting: priority %d delay %d ms.\n",
		kthread_prio, CONFIG_RCU_BOOST_DELAY);
#endif
	if (blimit != DEFAULT_RCU_BLIMIT)
		pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
	if (qhimark != DEFAULT_RCU_QHIMARK)
		pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
	if (qlowmark != DEFAULT_RCU_QLOMARK)
		pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
	if (jiffies_till_first_fqs != ULONG_MAX)
		pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
	if (jiffies_till_next_fqs != ULONG_MAX)
		pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
	if (jiffies_till_sched_qs != ULONG_MAX)
		pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n", jiffies_till_sched_qs);
	if (rcu_kick_kthreads)
		pr_info("\tKick kthreads if too-long grace period.\n");
	if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
		pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
	if (gp_preinit_delay)
		pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
	if (gp_init_delay)
		pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
	if (gp_cleanup_delay)
		pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
		pr_info("\tRCU debug extended QS entry/exit.\n");
	rcupdate_announce_bootup_oddness();
}

#ifdef CONFIG_PREEMPT_RCU

static void rcu_report_exp_rnp(struct rcu_node *rnp, bool wake);
static void rcu_read_unlock_special(struct task_struct *t);

/*
 * Tell them what RCU they are running.
 */
static void __init rcu_bootup_announce(void)
{
	pr_info("Preemptible hierarchical RCU implementation.\n");
	rcu_bootup_announce_oddness();
}

/* Flags for rcu_preempt_ctxt_queue() decision table. */
#define RCU_GP_TASKS	0x8
#define RCU_EXP_TASKS	0x4
#define RCU_GP_BLKD	0x2
#define RCU_EXP_BLKD	0x1

/*
 * Queues a task preempted within an RCU-preempt read-side critical
 * section into the appropriate location within the ->blkd_tasks list,
 * depending on the states of any ongoing normal and expedited grace
 * periods.  The ->gp_tasks pointer indicates which element the normal
 * grace period is waiting on (NULL if none), and the ->exp_tasks pointer
 * indicates which element the expedited grace period is waiting on (again,
 * NULL if none).  If a grace period is waiting on a given element in the
 * ->blkd_tasks list, it also waits on all subsequent elements.  Thus,
 * adding a task to the tail of the list blocks any grace period that is
 * already waiting on one of the elements.  In contrast, adding a task
 * to the head of the list won't block any grace period that is already
 * waiting on one of the elements.
 *
 * This queuing is imprecise, and can sometimes make an ongoing grace
 * period wait for a task that is not strictly speaking blocking it.
 * Given the choice, we needlessly block a normal grace period rather than
 * blocking an expedited grace period.
 *
 * Note that an endless sequence of expedited grace periods still cannot
 * indefinitely postpone a normal grace period.  Eventually, all of the
 * fixed number of preempted tasks blocking the normal grace period that are
 * not also blocking the expedited grace period will resume and complete
 * their RCU read-side critical sections.  At that point, the ->gp_tasks
 * pointer will equal the ->exp_tasks pointer, at which point the end of
 * the corresponding expedited grace period will also be the end of the
 * normal grace period.
 */
static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
	__releases(rnp->lock) /* But leaves rrupts disabled. */
{
	int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
			 (rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
			 (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) +
			 (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
	struct task_struct *t = current;

	raw_lockdep_assert_held_rcu_node(rnp);
	WARN_ON_ONCE(rdp->mynode != rnp);
	WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
	/* RCU better not be waiting on newly onlined CPUs! */
	WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask &
		     rdp->grpmask);

	/*
	 * Decide where to queue the newly blocked task.  In theory,
	 * this could be an if-statement.  In practice, when I tried
	 * that, it was quite messy.
	 */
	switch (blkd_state) {
	case 0:
	case RCU_EXP_TASKS:
	case RCU_EXP_TASKS + RCU_GP_BLKD:
	case RCU_GP_TASKS:
	case RCU_GP_TASKS + RCU_EXP_TASKS:

		/*
		 * Blocking neither GP, or first task blocking the normal
		 * GP but not blocking the already-waiting expedited GP.
		 * Queue at the head of the list to avoid unnecessarily
		 * blocking the already-waiting GPs.
		 */
		list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
		break;

	case RCU_EXP_BLKD:
	case RCU_GP_BLKD:
	case RCU_GP_BLKD + RCU_EXP_BLKD:
	case RCU_GP_TASKS + RCU_EXP_BLKD:
	case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
	case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:

		/*
		 * First task arriving that blocks either GP, or first task
		 * arriving that blocks the expedited GP (with the normal
		 * GP already waiting), or a task arriving that blocks
		 * both GPs with both GPs already waiting.  Queue at the
		 * tail of the list to avoid any GP waiting on any of the
		 * already queued tasks that are not blocking it.
		 */
		list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
		break;

	case RCU_EXP_TASKS + RCU_EXP_BLKD:
	case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
	case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD:

		/*
		 * Second or subsequent task blocking the expedited GP.
		 * The task either does not block the normal GP, or is the
		 * first task blocking the normal GP.  Queue just after
		 * the first task blocking the expedited GP.
		 */
		list_add(&t->rcu_node_entry, rnp->exp_tasks);
		break;

	case RCU_GP_TASKS + RCU_GP_BLKD:
	case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD:

		/*
		 * Second or subsequent task blocking the normal GP.
		 * The task does not block the expedited GP.  Queue just
		 * after the first task blocking the normal GP.
		 */
		list_add(&t->rcu_node_entry, rnp->gp_tasks);
		break;

	default:

		/* Yet another exercise in excessive paranoia. */
		WARN_ON_ONCE(1);
		break;
	}

	/*
	 * We have now queued the task.  If it was the first one to
	 * block either grace period, update the ->gp_tasks and/or
	 * ->exp_tasks pointers, respectively, to reference the newly
	 * blocked tasks.
	 */
	if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
		rnp->gp_tasks = &t->rcu_node_entry;
		WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
	}
	if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
		rnp->exp_tasks = &t->rcu_node_entry;
	WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
		     !(rnp->qsmask & rdp->grpmask));
	WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
		     !(rnp->expmask & rdp->grpmask));
	raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */

	/*
	 * Report the quiescent state for the expedited GP.  This expedited
	 * GP should not be able to end until we report, so there should be
	 * no need to check for a subsequent expedited GP.  (Though we are
	 * still in a quiescent state in any case.)
	 */
	if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs)
		rcu_report_exp_rdp(rdp);
	else
		WARN_ON_ONCE(rdp->deferred_qs);
}
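
/*
 * Illustrative example (not part of the original source): with the flag
 * values above, a task preempted while rnp->gp_tasks is already set and
 * while its CPU's bit is still set in rnp->expmask sees
 * blkd_state == RCU_GP_TASKS + RCU_EXP_BLKD (0x9).  It therefore lands in
 * the list_add_tail() group of the switch statement and, because
 * ->exp_tasks was NULL and RCU_EXP_BLKD is set, becomes the element that
 * ->exp_tasks then points to.
 */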

/*
 * Record a preemptible-RCU quiescent state for the specified CPU.
 * Note that this does not necessarily mean that the task currently running
 * on the CPU is in a quiescent state:  Instead, it means that the current
 * grace period need not wait on any RCU read-side critical section that
 * starts later on this CPU.  It also means that if the current task is
 * in an RCU read-side critical section, it has already added itself to
 * some leaf rcu_node structure's ->blkd_tasks list.  In addition to the
 * current task, there might be any number of other tasks blocked while
 * in an RCU read-side critical section.
 *
 * Callers to this function must disable preemption.
 */
static void rcu_qs(void)
{
	RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n");
	if (__this_cpu_read(rcu_data.cpu_no_qs.s)) {
		trace_rcu_grace_period(TPS("rcu_preempt"),
				       __this_cpu_read(rcu_data.gp_seq),
				       TPS("cpuqs"));
		__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
		barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
		WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);
	}
}

/*
 * We have entered the scheduler, and the current task might soon be
 * context-switched away from.  If this task is in an RCU read-side
 * critical section, we will no longer be able to rely on the CPU to
 * record that fact, so we enqueue the task on the blkd_tasks list.
 * The task will dequeue itself when it exits the outermost enclosing
 * RCU read-side critical section.  Therefore, the current grace period
 * cannot be permitted to complete until the blkd_tasks list entries
 * predating the current grace period drain, in other words, until
 * rnp->gp_tasks becomes NULL.
 *
 * Caller must disable interrupts.
 */
void rcu_note_context_switch(bool preempt)
{
	struct task_struct *t = current;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	struct rcu_node *rnp;

	barrier(); /* Avoid RCU read-side critical sections leaking down. */
	trace_rcu_utilization(TPS("Start context switch"));
	lockdep_assert_irqs_disabled();
	WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
	if (t->rcu_read_lock_nesting > 0 &&
	    !t->rcu_read_unlock_special.b.blocked) {

		/* Possibly blocking in an RCU read-side critical section. */
		rnp = rdp->mynode;
		raw_spin_lock_rcu_node(rnp);
		t->rcu_read_unlock_special.b.blocked = true;
		t->rcu_blocked_node = rnp;

		/*
		 * Verify the CPU's sanity, trace the preemption, and
		 * then queue the task as required based on the states
		 * of any ongoing and expedited grace periods.
		 */
		WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
		WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
		trace_rcu_preempt_task(rcu_state.name,
				       t->pid,
				       (rnp->qsmask & rdp->grpmask)
				       ? rnp->gp_seq
				       : rcu_seq_snap(&rnp->gp_seq));
		rcu_preempt_ctxt_queue(rnp, rdp);
	} else if (t->rcu_read_lock_nesting < 0 &&
		   t->rcu_read_unlock_special.s) {

		/*
		 * Complete exit from RCU read-side critical section on
		 * behalf of preempted instance of __rcu_read_unlock().
		 */
		rcu_read_unlock_special(t);
		rcu_preempt_deferred_qs(t);
	} else {
		rcu_preempt_deferred_qs(t);
	}

	/*
	 * Either we were not in an RCU read-side critical section to
	 * begin with, or we have now recorded that critical section
	 * globally.  Either way, we can now note a quiescent state
	 * for this CPU.  Again, if we were in an RCU read-side critical
	 * section, and if that critical section was blocking the current
	 * grace period, then the fact that the task has been enqueued
	 * means that we continue to block the current grace period.
	 */
	rcu_qs();
	if (rdp->deferred_qs)
		rcu_report_exp_rdp(rdp);
	trace_rcu_utilization(TPS("End context switch"));
	barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);

/*
 * Check for preempted RCU readers blocking the current grace period
 * for the specified rcu_node structure.  If the caller needs a reliable
 * answer, it must hold the rcu_node's ->lock.
 */
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
	return rnp->gp_tasks != NULL;
}

/* Bias and limit values for ->rcu_read_lock_nesting. */
#define RCU_NEST_BIAS INT_MAX
#define RCU_NEST_NMAX (-INT_MAX / 2)
#define RCU_NEST_PMAX (INT_MAX / 2)

/*
 * Preemptible RCU implementation for rcu_read_lock().
 * Just increment ->rcu_read_lock_nesting, shared state will be updated
 * if we block.
 */
void __rcu_read_lock(void)
{
	current->rcu_read_lock_nesting++;
	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
		WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX);
	barrier();  /* critical section after entry code. */
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);

/*
 * Preemptible RCU implementation for rcu_read_unlock().
 * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
 * invoke rcu_read_unlock_special() to clean up after a context switch
 * in an RCU read-side critical section and other special cases.
 */
void __rcu_read_unlock(void)
{
	struct task_struct *t = current;

	if (t->rcu_read_lock_nesting != 1) {
		--t->rcu_read_lock_nesting;
	} else {
		barrier();  /* critical section before exit code. */
		t->rcu_read_lock_nesting = -RCU_NEST_BIAS;
		barrier();  /* assign before ->rcu_read_unlock_special load */
		if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
			rcu_read_unlock_special(t);
		barrier();  /* ->rcu_read_unlock_special load before assign */
		t->rcu_read_lock_nesting = 0;
	}
	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		int rrln = t->rcu_read_lock_nesting;

		WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX);
	}
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
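
/*
 * Illustrative usage (not part of the original file): a preemptible-RCU
 * reader nests purely by bumping ->rcu_read_lock_nesting, for example:
 *
 *	rcu_read_lock();		// nesting 0 -> 1
 *	p = rcu_dereference(gp);	// 'gp' is a hypothetical pointer
 *	rcu_read_lock();		// nesting 1 -> 2
 *	rcu_read_unlock();		// nesting 2 -> 1
 *	rcu_read_unlock();		// outermost unlock: may invoke
 *					// rcu_read_unlock_special()
 */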

/*
 * Advance a ->blkd_tasks-list pointer to the next entry, instead
 * returning NULL if at the end of the list.
 */
static struct list_head *rcu_next_node_entry(struct task_struct *t,
					     struct rcu_node *rnp)
{
	struct list_head *np;

	np = t->rcu_node_entry.next;
	if (np == &rnp->blkd_tasks)
		np = NULL;
	return np;
}

/*
 * Return true if the specified rcu_node structure has tasks that were
 * preempted within an RCU read-side critical section.
 */
static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
{
	return !list_empty(&rnp->blkd_tasks);
}

/*
 * Report deferred quiescent states.  The deferral time can
 * be quite short, for example, in the case of the call from
 * rcu_read_unlock_special().
 */
static void
rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
{
	bool empty_exp;
	bool empty_norm;
	bool empty_exp_now;
	struct list_head *np;
	bool drop_boost_mutex = false;
	struct rcu_data *rdp;
	struct rcu_node *rnp;
	union rcu_special special;

	/*
	 * If RCU core is waiting for this CPU to exit its critical section,
	 * report the fact that it has exited.  Because irqs are disabled,
	 * t->rcu_read_unlock_special cannot change.
	 */
	special = t->rcu_read_unlock_special;
	rdp = this_cpu_ptr(&rcu_data);
	if (!special.s && !rdp->deferred_qs) {
		local_irq_restore(flags);
		return;
	}
	if (special.b.need_qs) {
		rcu_qs();
		t->rcu_read_unlock_special.b.need_qs = false;
		if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) {
			local_irq_restore(flags);
			return;
		}
	}

	/*
	 * Respond to a request by an expedited grace period for a
	 * quiescent state from this CPU.  Note that requests from
	 * tasks are handled when removing the task from the
	 * blocked-tasks list below.
	 */
	if (rdp->deferred_qs) {
		rcu_report_exp_rdp(rdp);
		if (!t->rcu_read_unlock_special.s) {
			local_irq_restore(flags);
			return;
		}
	}

	/* Clean up if blocked during RCU read-side critical section. */
	if (special.b.blocked) {
		t->rcu_read_unlock_special.b.blocked = false;

		/*
		 * Remove this task from the list it blocked on.  The task
		 * now remains queued on the rcu_node corresponding to the
		 * CPU it first blocked on, so there is no longer any need
		 * to loop.  Retain a WARN_ON_ONCE() out of sheer paranoia.
		 */
		rnp = t->rcu_blocked_node;
		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
		WARN_ON_ONCE(rnp != t->rcu_blocked_node);
		WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
		WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
			     (!empty_norm || rnp->qsmask));
		empty_exp = sync_rcu_preempt_exp_done(rnp);
		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
		np = rcu_next_node_entry(t, rnp);
		list_del_init(&t->rcu_node_entry);
		t->rcu_blocked_node = NULL;
		trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
						rnp->gp_seq, t->pid);
		if (&t->rcu_node_entry == rnp->gp_tasks)
			rnp->gp_tasks = np;
		if (&t->rcu_node_entry == rnp->exp_tasks)
			rnp->exp_tasks = np;
		if (IS_ENABLED(CONFIG_RCU_BOOST)) {
			/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
			drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
			if (&t->rcu_node_entry == rnp->boost_tasks)
				rnp->boost_tasks = np;
		}

		/*
		 * If this was the last task on the current list, and if
		 * we aren't waiting on any CPUs, report the quiescent state.
		 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
		 * so we must take a snapshot of the expedited state.
		 */
		empty_exp_now = sync_rcu_preempt_exp_done(rnp);
		if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
			trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
							 rnp->gp_seq,
							 0, rnp->qsmask,
							 rnp->level,
							 rnp->grplo,
							 rnp->grphi,
							 !!rnp->gp_tasks);
			rcu_report_unblock_qs_rnp(rnp, flags);
		} else {
			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		}

		/* Unboost if we were boosted. */
		if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
			rt_mutex_futex_unlock(&rnp->boost_mtx);

		/*
		 * If this was the last task on the expedited lists,
		 * then we need to report up the rcu_node hierarchy.
		 */
		if (!empty_exp && empty_exp_now)
			rcu_report_exp_rnp(rnp, true);
	} else {
		local_irq_restore(flags);
	}
}

/*
 * Is a deferred quiescent-state pending, and are we also not in
 * an RCU read-side critical section?  It is the caller's responsibility
 * to ensure it is otherwise safe to report any deferred quiescent
 * states.  The reason for this is that it is safe to report a
 * quiescent state during context switch even though preemption
 * is disabled.  This function cannot be expected to understand these
 * nuances, so the caller must handle them.
 */
static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
	return (__this_cpu_read(rcu_data.deferred_qs) ||
		READ_ONCE(t->rcu_read_unlock_special.s)) &&
	       t->rcu_read_lock_nesting <= 0;
}

/*
 * Report a deferred quiescent state if needed and safe to do so.
 * As with rcu_preempt_need_deferred_qs(), "safe" involves only
 * not being in an RCU read-side critical section.  The caller must
 * evaluate safety in terms of interrupt, softirq, and preemption
 * disabling.
 */
static void rcu_preempt_deferred_qs(struct task_struct *t)
{
	unsigned long flags;
	bool couldrecurse = t->rcu_read_lock_nesting >= 0;

	if (!rcu_preempt_need_deferred_qs(t))
		return;
	if (couldrecurse)
		t->rcu_read_lock_nesting -= RCU_NEST_BIAS;
	local_irq_save(flags);
	rcu_preempt_deferred_qs_irqrestore(t, flags);
	if (couldrecurse)
		t->rcu_read_lock_nesting += RCU_NEST_BIAS;
}

/*
 * Handle special cases during rcu_read_unlock(), such as needing to
 * notify RCU core processing or task having blocked during the RCU
 * read-side critical section.
 */
static void rcu_read_unlock_special(struct task_struct *t)
{
	unsigned long flags;
	bool preempt_bh_were_disabled =
			!!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
	bool irqs_were_disabled;

	/* NMI handlers cannot block and cannot safely manipulate state. */
	if (in_nmi())
		return;

	local_irq_save(flags);
	irqs_were_disabled = irqs_disabled_flags(flags);
	if (preempt_bh_were_disabled || irqs_were_disabled) {
		WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
		/* Need to defer quiescent state until everything is enabled. */
		if (irqs_were_disabled) {
			/* Enabling irqs does not reschedule, so... */
			raise_softirq_irqoff(RCU_SOFTIRQ);
		} else {
			/* Enabling BH or preempt does reschedule, so... */
			set_tsk_need_resched(current);
			set_preempt_need_resched();
		}
		local_irq_restore(flags);
		return;
	}
	WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
	rcu_preempt_deferred_qs_irqrestore(t, flags);
}

/*
 * Check that the list of blocked tasks for the newly completed grace
 * period is in fact empty.  It is a serious bug to complete a grace
 * period that still has RCU readers blocked!  This function must be
 * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock
 * must be held by the caller.
 *
 * Also, if there are blocked tasks on the list, they automatically
 * block the newly created grace period, so set up ->gp_tasks accordingly.
 */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
	struct task_struct *t;

	RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
		dump_blkd_tasks(rnp, 10);
	if (rcu_preempt_has_tasks(rnp) &&
	    (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
		rnp->gp_tasks = rnp->blkd_tasks.next;
		t = container_of(rnp->gp_tasks, struct task_struct,
				 rcu_node_entry);
		trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
						rnp->gp_seq, t->pid);
	}
	WARN_ON_ONCE(rnp->qsmask);
}

/*
 * Check for a quiescent state from the current CPU, including voluntary
 * context switches for Tasks RCU.  When a task blocks, the task is
 * recorded in the corresponding CPU's rcu_node structure, which is checked
 * elsewhere, hence this function need only check for quiescent states
 * related to the current CPU, not to those related to tasks.
 */
static void rcu_flavor_sched_clock_irq(int user)
{
	struct task_struct *t = current;

	if (user || rcu_is_cpu_rrupt_from_idle()) {
		rcu_note_voluntary_context_switch(current);
	}
	if (t->rcu_read_lock_nesting > 0 ||
	    (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
		/* No QS, force context switch if deferred. */
		if (rcu_preempt_need_deferred_qs(t)) {
			set_tsk_need_resched(t);
			set_preempt_need_resched();
		}
	} else if (rcu_preempt_need_deferred_qs(t)) {
		rcu_preempt_deferred_qs(t); /* Report deferred QS. */
		return;
	} else if (!t->rcu_read_lock_nesting) {
		rcu_qs(); /* Report immediate QS. */
		return;
	}

	/* If GP is oldish, ask for help from rcu_read_unlock_special(). */
	if (t->rcu_read_lock_nesting > 0 &&
	    __this_cpu_read(rcu_data.core_needs_qs) &&
	    __this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
	    !t->rcu_read_unlock_special.b.need_qs &&
	    time_after(jiffies, rcu_state.gp_start + HZ))
		t->rcu_read_unlock_special.b.need_qs = true;
}

/*
 * Check for a task exiting while in a preemptible-RCU read-side
 * critical section, clean up if so.  No need to issue warnings, as
 * debug_check_no_locks_held() already does this if lockdep is enabled.
 * Besides, if this function does anything other than just immediately
 * return, there was a bug of some sort.  Spewing warnings from this
 * function is like as not to simply obscure important prior warnings.
 */
void exit_rcu(void)
{
	struct task_struct *t = current;

	if (unlikely(!list_empty(&current->rcu_node_entry))) {
		t->rcu_read_lock_nesting = 1;
		barrier();
		WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
	} else if (unlikely(t->rcu_read_lock_nesting)) {
		t->rcu_read_lock_nesting = 1;
	} else {
		return;
	}
	__rcu_read_unlock();
	rcu_preempt_deferred_qs(current);
}

/*
 * Dump the blocked-tasks state, but limit the list dump to the
 * specified number of elements.
 */
static void
dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
{
	int cpu;
	int i;
	struct list_head *lhp;
	bool onl;
	struct rcu_data *rdp;
	struct rcu_node *rnp1;

	raw_lockdep_assert_held_rcu_node(rnp);
	pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
		__func__, rnp->grplo, rnp->grphi, rnp->level,
		(long)rnp->gp_seq, (long)rnp->completedqs);
	for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
		pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
			__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
	pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
		__func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks);
	pr_info("%s: ->blkd_tasks", __func__);
	i = 0;
	list_for_each(lhp, &rnp->blkd_tasks) {
		pr_cont(" %p", lhp);
		if (++i >= 10)
			break;
	}
	pr_cont("\n");
	for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
		rdp = per_cpu_ptr(&rcu_data, cpu);
		onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
		pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
			cpu, ".o"[onl],
			(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
			(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
	}
}

#else /* #ifdef CONFIG_PREEMPT_RCU */

/*
 * Tell them what RCU they are running.
 */
static void __init rcu_bootup_announce(void)
{
	pr_info("Hierarchical RCU implementation.\n");
	rcu_bootup_announce_oddness();
}

/*
 * Note a quiescent state for PREEMPT=n.  Because we do not need to know
 * how many quiescent states passed, just if there was at least one since
 * the start of the grace period, this just sets a flag.  The caller must
 * have disabled preemption.
 */
static void rcu_qs(void)
{
	RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!");
	if (!__this_cpu_read(rcu_data.cpu_no_qs.s))
		return;
	trace_rcu_grace_period(TPS("rcu_sched"),
			       __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs"));
	__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
	if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
		return;
	__this_cpu_write(rcu_data.cpu_no_qs.b.exp, false);
	rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
}

/*
 * Register an urgently needed quiescent state.  If there is an
 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
 * dyntick-idle quiescent state visible to other CPUs, which will in
 * some cases serve for expedited as well as normal grace periods.
 * Either way, register a lightweight quiescent state.
 *
 * The barrier() calls are redundant in the common case when this is
 * called externally, but just in case this is called from within this
 * file.
 *
 */
void rcu_all_qs(void)
{
	unsigned long flags;

	if (!raw_cpu_read(rcu_data.rcu_urgent_qs))
		return;
	preempt_disable();
	/* Load rcu_urgent_qs before other flags. */
	if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
		preempt_enable();
		return;
	}
	this_cpu_write(rcu_data.rcu_urgent_qs, false);
	barrier(); /* Avoid RCU read-side critical sections leaking down. */
	if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
		local_irq_save(flags);
		rcu_momentary_dyntick_idle();
		local_irq_restore(flags);
	}
	rcu_qs();
	barrier(); /* Avoid RCU read-side critical sections leaking up. */
	preempt_enable();
}
EXPORT_SYMBOL_GPL(rcu_all_qs);

/*
 * Note a PREEMPT=n context switch.  The caller must have disabled interrupts.
 */
void rcu_note_context_switch(bool preempt)
{
	barrier(); /* Avoid RCU read-side critical sections leaking down. */
	trace_rcu_utilization(TPS("Start context switch"));
	rcu_qs();
	/* Load rcu_urgent_qs before other flags. */
	if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs)))
		goto out;
	this_cpu_write(rcu_data.rcu_urgent_qs, false);
	if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
		rcu_momentary_dyntick_idle();
	if (!preempt)
		rcu_tasks_qs(current);
out:
	trace_rcu_utilization(TPS("End context switch"));
	barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);

/*
 * Because preemptible RCU does not exist, there are never any preempted
 * RCU readers.
 */
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
	return 0;
}

/*
 * Because there is no preemptible RCU, there can be no readers blocked.
 */
static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
{
	return false;
}

/*
 * Because there is no preemptible RCU, there can be no deferred quiescent
 * states.
 */
static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
	return false;
}
static void rcu_preempt_deferred_qs(struct task_struct *t) { }

/*
 * Because there is no preemptible RCU, there can be no readers blocked,
 * so there is no need to check for blocked tasks.  So check only for
 * bogus qsmask values.
 */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
	WARN_ON_ONCE(rnp->qsmask);
}

/*
 * Check to see if this CPU is in a non-context-switch quiescent state,
 * namely user mode and idle loop.
 */
static void rcu_flavor_sched_clock_irq(int user)
{
	if (user || rcu_is_cpu_rrupt_from_idle()) {

		/*
		 * Get here if this CPU took its interrupt from user
		 * mode or from the idle loop, and if this is not a
		 * nested interrupt.  In this case, the CPU is in
		 * a quiescent state, so note it.
		 *
		 * No memory barrier is required here because rcu_qs()
		 * references only CPU-local variables that other CPUs
		 * neither access nor modify, at least not while the
		 * corresponding CPU is online.
		 */

		rcu_qs();
	}
}

/*
 * Because preemptible RCU does not exist, tasks cannot possibly exit
 * while in preemptible RCU read-side critical sections.
 */
void exit_rcu(void)
{
}

/*
 * Dump the guaranteed-empty blocked-tasks state.  Trust but verify.
 */
static void
dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
{
	WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
}

#endif /* #else #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_RCU_BOOST

static void rcu_wake_cond(struct task_struct *t, int status)
{
	/*
	 * If the thread is yielding, only wake it when this
	 * is invoked from idle
	 */
	if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
		wake_up_process(t);
}

/*
 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
 * or ->boost_tasks, advancing the pointer to the next task in the
 * ->blkd_tasks list.
 *
 * Note that irqs must be enabled: boosting the task can block.
 * Returns 1 if there are more tasks needing to be boosted.
 */
static int rcu_boost(struct rcu_node *rnp)
{
	unsigned long flags;
	struct task_struct *t;
	struct list_head *tb;

	if (READ_ONCE(rnp->exp_tasks) == NULL &&
	    READ_ONCE(rnp->boost_tasks) == NULL)
		return 0;  /* Nothing left to boost. */

	raw_spin_lock_irqsave_rcu_node(rnp, flags);

	/*
	 * Recheck under the lock: all tasks in need of boosting
	 * might exit their RCU read-side critical sections on their own.
	 */
	if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		return 0;
	}

	/*
	 * Preferentially boost tasks blocking expedited grace periods.
	 * This cannot starve the normal grace periods because a second
	 * expedited grace period must boost all blocked tasks, including
	 * those blocking the pre-existing normal grace period.
	 */
	if (rnp->exp_tasks != NULL)
		tb = rnp->exp_tasks;
	else
		tb = rnp->boost_tasks;

	/*
	 * We boost task t by manufacturing an rt_mutex that appears to
	 * be held by task t.  We leave a pointer to that rt_mutex where
	 * task t can find it, and task t will release the mutex when it
	 * exits its outermost RCU read-side critical section.  Then
	 * simply acquiring this artificial rt_mutex will boost task
	 * t's priority.  (Thanks to tglx for suggesting this approach!)
	 *
	 * Note that task t must acquire rnp->lock to remove itself from
	 * the ->blkd_tasks list, which it will do from exit() if from
	 * nowhere else.  We therefore are guaranteed that task t will
	 * stay around at least until we drop rnp->lock.  Note that
	 * rnp->lock also resolves races between our priority boosting
	 * and task t's exiting its outermost RCU read-side critical
	 * section.
	 */
	t = container_of(tb, struct task_struct, rcu_node_entry);
	rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	/* Lock only for side effect: boosts task t's priority. */
	rt_mutex_lock(&rnp->boost_mtx);
	rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */

	return READ_ONCE(rnp->exp_tasks) != NULL ||
	       READ_ONCE(rnp->boost_tasks) != NULL;
}
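
/*
 * Illustrative timeline (not part of the original file) for the proxy-lock
 * trick above: rcu_boost() marks rnp->boost_mtx as held by the preempted
 * reader via rt_mutex_init_proxy_locked(), then blocks in rt_mutex_lock(),
 * whose priority inheritance lends the boost kthread's priority to that
 * reader.  When the reader finally leaves its outermost read-side critical
 * section, rcu_preempt_deferred_qs_irqrestore() calls
 * rt_mutex_futex_unlock() on its behalf, which releases the boost kthread
 * and drops the boost.
 */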

/*
 * Priority-boosting kthread, one per leaf rcu_node.
 */
static int rcu_boost_kthread(void *arg)
{
	struct rcu_node *rnp = (struct rcu_node *)arg;
	int spincnt = 0;
	int more2boost;

	trace_rcu_utilization(TPS("Start boost kthread@init"));
	for (;;) {
		rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
		trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
		rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
		trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
		rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
		more2boost = rcu_boost(rnp);
		if (more2boost)
			spincnt++;
		else
			spincnt = 0;
		if (spincnt > 10) {
			rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
			trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
			schedule_timeout_interruptible(2);
			trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
			spincnt = 0;
		}
	}
	/* NOTREACHED */
	trace_rcu_utilization(TPS("End boost kthread@notreached"));
	return 0;
}

/*
 * Check to see if it is time to start boosting RCU readers that are
 * blocking the current grace period, and, if so, tell the per-rcu_node
 * kthread to start boosting them.  If there is an expedited grace
 * period in progress, it is always time to boost.
 *
 * The caller must hold rnp->lock, which this function releases.
 * The ->boost_kthread_task is immortal, so we don't need to worry
 * about it going away.
 */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
	__releases(rnp->lock)
{
	raw_lockdep_assert_held_rcu_node(rnp);
	if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		return;
	}
	if (rnp->exp_tasks != NULL ||
	    (rnp->gp_tasks != NULL &&
	     rnp->boost_tasks == NULL &&
	     rnp->qsmask == 0 &&
	     ULONG_CMP_GE(jiffies, rnp->boost_time))) {
		if (rnp->exp_tasks == NULL)
			rnp->boost_tasks = rnp->gp_tasks;
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		rcu_wake_cond(rnp->boost_kthread_task,
			      rnp->boost_kthread_status);
	} else {
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	}
}

/*
 * Wake up the per-CPU kthread to invoke RCU callbacks.
 */
static void invoke_rcu_callbacks_kthread(void)
{
	unsigned long flags;

	local_irq_save(flags);
	__this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
	if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL &&
	    current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) {
		rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task),
			      __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
	}
	local_irq_restore(flags);
}

/*
 * Is the current CPU running the RCU-callbacks kthread?
 * Caller must have preemption disabled.
 */
static bool rcu_is_callbacks_kthread(void)
{
	return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current;
}

#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)

/*
 * Do priority-boost accounting for the start of a new grace period.
 */
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
	rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
}
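
/*
 * Worked example (illustrative only): with CONFIG_RCU_BOOST_DELAY=500
 * and HZ=250, RCU_BOOST_DELAY_JIFFIES is DIV_ROUND_UP(500 * 250, 1000),
 * that is, 125 jiffies, so boosting is not considered until roughly half
 * a second after the grace period begins.
 */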

/*
 * Create an RCU-boost kthread for the specified node if one does not
 * already exist.  We only create this kthread for preemptible RCU.
 * Returns zero if all is well, a negated errno otherwise.
 */
static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
	int rnp_index = rnp - rcu_get_root();
	unsigned long flags;
	struct sched_param sp;
	struct task_struct *t;

	if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
		return 0;

	if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
		return 0;

	rcu_state.boost = 1;
	if (rnp->boost_kthread_task != NULL)
		return 0;
	t = kthread_create(rcu_boost_kthread, (void *)rnp,
			   "rcub/%d", rnp_index);
	if (IS_ERR(t))
		return PTR_ERR(t);
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	rnp->boost_kthread_task = t;
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	sp.sched_priority = kthread_prio;
	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
	return 0;
}

static void rcu_cpu_kthread_setup(unsigned int cpu)
{
	struct sched_param sp;

	sp.sched_priority = kthread_prio;
	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
}

static void rcu_cpu_kthread_park(unsigned int cpu)
{
	per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
}

static int rcu_cpu_kthread_should_run(unsigned int cpu)
{
	return __this_cpu_read(rcu_data.rcu_cpu_has_work);
}

/*
 * Per-CPU kernel thread that invokes RCU callbacks.  This replaces
 * the RCU softirq used in configurations of RCU that do not support RCU
 * priority boosting.
 */
static void rcu_cpu_kthread(unsigned int cpu)
{
	unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
	char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
	int spincnt;

	for (spincnt = 0; spincnt < 10; spincnt++) {
		trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
		local_bh_disable();
		*statusp = RCU_KTHREAD_RUNNING;
		local_irq_disable();
		work = *workp;
		*workp = 0;
		local_irq_enable();
		if (work)
			rcu_do_batch(this_cpu_ptr(&rcu_data));
		local_bh_enable();
		if (*workp == 0) {
			trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
			*statusp = RCU_KTHREAD_WAITING;
			return;
		}
	}
	*statusp = RCU_KTHREAD_YIELDING;
	trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
	schedule_timeout_interruptible(2);
	trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
	*statusp = RCU_KTHREAD_WAITING;
}

/*
 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
 * served by the rcu_node in question.  The CPU hotplug lock is still
 * held, so the value of rnp->qsmaskinit will be stable.
 *
 * We don't include outgoingcpu in the affinity set, use -1 if there is
 * no outgoing CPU.  If there are no CPUs left in the affinity set,
 * this function allows the kthread to execute on any CPU.
 */
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
	struct task_struct *t = rnp->boost_kthread_task;
	unsigned long mask = rcu_rnp_online_cpus(rnp);
	cpumask_var_t cm;
	int cpu;

	if (!t)
		return;
	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
		return;
	for_each_leaf_node_possible_cpu(rnp, cpu)
		if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
		    cpu != outgoingcpu)
			cpumask_set_cpu(cpu, cm);
	if (cpumask_weight(cm) == 0)
		cpumask_setall(cm);
	set_cpus_allowed_ptr(t, cm);
	free_cpumask_var(cm);
}

static struct smp_hotplug_thread rcu_cpu_thread_spec = {
	.store			= &rcu_data.rcu_cpu_kthread_task,
	.thread_should_run	= rcu_cpu_kthread_should_run,
	.thread_fn		= rcu_cpu_kthread,
	.thread_comm		= "rcuc/%u",
	.setup			= rcu_cpu_kthread_setup,
	.park			= rcu_cpu_kthread_park,
};

/*
 * Spawn boost kthreads -- called as soon as the scheduler is running.
 */
static void __init rcu_spawn_boost_kthreads(void)
{
	struct rcu_node *rnp;
	int cpu;

	for_each_possible_cpu(cpu)
		per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
	if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__))
		return;
	rcu_for_each_leaf_node(rnp)
		(void)rcu_spawn_one_boost_kthread(rnp);
}

static void rcu_prepare_kthreads(int cpu)
{
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
	struct rcu_node *rnp = rdp->mynode;

	/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
	if (rcu_scheduler_fully_active)
		(void)rcu_spawn_one_boost_kthread(rnp);
}

#else /* #ifdef CONFIG_RCU_BOOST */

static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
	__releases(rnp->lock)
{
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}

static void invoke_rcu_callbacks_kthread(void)
{
	WARN_ON_ONCE(1);
}

static bool rcu_is_callbacks_kthread(void)
{
	return false;
}

static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}

static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}

static void __init rcu_spawn_boost_kthreads(void)
{
}

static void rcu_prepare_kthreads(int cpu)
{
}

#endif /* #else #ifdef CONFIG_RCU_BOOST */

#if !defined(CONFIG_RCU_FAST_NO_HZ)

/*
 * Check to see if any future RCU-related work will need to be done
 * by the current CPU, even if none need be done immediately, returning
 * 1 if so.  This function is part of the RCU implementation; it is -not-
 * an exported member of the RCU API.
 *
 * Because we do not have RCU_FAST_NO_HZ, just check whether or not this
 * CPU has RCU callbacks queued.
 */
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
	*nextevt = KTIME_MAX;
	return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist);
}

/*
 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
 * after it.
 */
static void rcu_cleanup_after_idle(void)
{
}

/*
 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
 * is nothing.
 */
static void rcu_prepare_for_idle(void)
{
}

#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */

/*
 * This code is invoked when a CPU goes idle, at which point we want
 * to have the CPU do everything required for RCU so that it can enter
 * the energy-efficient dyntick-idle mode.  This is handled by a
 * state machine implemented by rcu_prepare_for_idle() below.
 *
 * The following two preprocessor symbols control this state machine:
 *
 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
 *	to sleep in dyntick-idle mode with RCU callbacks pending.  This
 *	is sized to be roughly one RCU grace period.  Those energy-efficiency
 *	benchmarkers who might otherwise be tempted to set this to a large
 *	number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
 *	system.  And if you are -that- concerned about energy efficiency,
 *	just power the system down and be done with it!
 * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
 *	permitted to sleep in dyntick-idle mode with only lazy RCU
 *	callbacks pending.  Setting this too high can OOM your system.
 *
 * The values below work well in practice.  If future workloads require
 * adjustment, they can be converted into kernel config parameters, though
 * making the state machine smarter might be a better option.
 */
#define RCU_IDLE_GP_DELAY 4		/* Roughly one grace period. */
#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)	/* Roughly six seconds. */

static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
module_param(rcu_idle_gp_delay, int, 0644);
static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
module_param(rcu_idle_lazy_gp_delay, int, 0644);

/*
 * Try to advance callbacks on the current CPU, but only if it has been
 * awhile since the last time we did so.  Afterwards, if there are any
 * callbacks ready for immediate invocation, return true.
 */
static bool __maybe_unused rcu_try_advance_all_cbs(void)
{
	bool cbs_ready = false;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	struct rcu_node *rnp;

	/* Exit early if we advanced recently. */
	if (jiffies == rdp->last_advance_all)
		return false;
	rdp->last_advance_all = jiffies;

	rnp = rdp->mynode;

	/*
	 * Don't bother checking unless a grace period has
	 * completed since we last checked and there are
	 * callbacks not yet ready to invoke.
	 */
	if ((rcu_seq_completed_gp(rdp->gp_seq,
				  rcu_seq_current(&rnp->gp_seq)) ||
	     unlikely(READ_ONCE(rdp->gpwrap))) &&
	    rcu_segcblist_pend_cbs(&rdp->cblist))
		note_gp_changes(rdp);

	if (rcu_segcblist_ready_cbs(&rdp->cblist))
		cbs_ready = true;
	return cbs_ready;
}

/*
 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
 * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
 * caller to set the timeout based on whether or not there are non-lazy
 * callbacks.
 *
 * The caller must have disabled interrupts.
 */
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	unsigned long dj;

	lockdep_assert_irqs_disabled();

	/* If no callbacks, RCU doesn't need the CPU. */
	if (rcu_segcblist_empty(&rdp->cblist)) {
		*nextevt = KTIME_MAX;
		return 0;
	}

	/* Attempt to advance callbacks. */
	if (rcu_try_advance_all_cbs()) {
		/* Some ready to invoke, so initiate later invocation. */
		invoke_rcu_core();
		return 1;
	}
	rdp->last_accelerate = jiffies;

	/* Request timer delay depending on laziness, and round. */
	rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist);
	if (rdp->all_lazy) {
		dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
	} else {
		dj = round_up(rcu_idle_gp_delay + jiffies,
			      rcu_idle_gp_delay) - jiffies;
	}
	*nextevt = basemono + dj * TICK_NSEC;
	return 0;
}
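
/*
 * Worked example (illustrative only): with the default rcu_idle_gp_delay
 * of 4 and jiffies == 1002, the non-lazy case above computes
 * dj = round_up(1006, 4) - 1002 = 1008 - 1002 = 6 jiffies, so the wakeup
 * lands on a multiple of 4 jiffies and wakeups tend to batch on common
 * boundaries.
 */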

/*
 * Prepare a CPU for idle from an RCU perspective.  The first major task
 * is to sense whether nohz mode has been enabled or disabled via sysfs.
 * The second major task is to check to see if a non-lazy callback has
 * arrived at a CPU that previously had only lazy callbacks.  The third
 * major task is to accelerate (that is, assign grace-period numbers to)
 * any recently arrived callbacks.
 *
 * The caller must have disabled interrupts.
 */
static void rcu_prepare_for_idle(void)
{
	bool needwake;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	struct rcu_node *rnp;
	int tne;

	lockdep_assert_irqs_disabled();
	if (rcu_is_nocb_cpu(smp_processor_id()))
		return;

	/* Handle nohz enablement switches conservatively. */
	tne = READ_ONCE(tick_nohz_active);
	if (tne != rdp->tick_nohz_enabled_snap) {
		if (!rcu_segcblist_empty(&rdp->cblist))
			invoke_rcu_core(); /* force nohz to see update. */
		rdp->tick_nohz_enabled_snap = tne;
		return;
	}
	if (!tne)
		return;

	/*
	 * If a non-lazy callback arrived at a CPU having only lazy
	 * callbacks, invoke RCU core for the side-effect of recalculating
	 * idle duration on re-entry to idle.
	 */
	if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) {
		rdp->all_lazy = false;
		invoke_rcu_core();
		return;
	}

	/*
	 * If we have not yet accelerated this jiffy, accelerate all
	 * callbacks on this CPU.
	 */
	if (rdp->last_accelerate == jiffies)
		return;
	rdp->last_accelerate = jiffies;
	if (rcu_segcblist_pend_cbs(&rdp->cblist)) {
		rnp = rdp->mynode;
		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
		needwake = rcu_accelerate_cbs(rnp, rdp);
		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
		if (needwake)
			rcu_gp_kthread_wake();
	}
}

/*
 * Clean up for exit from idle.  Attempt to advance callbacks based on
 * any grace periods that elapsed while the CPU was idle, and if any
 * callbacks are now ready to invoke, initiate invocation.
 */
static void rcu_cleanup_after_idle(void)
{
	lockdep_assert_irqs_disabled();
	if (rcu_is_nocb_cpu(smp_processor_id()))
		return;
	if (rcu_try_advance_all_cbs())
		invoke_rcu_core();
}

#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */

#ifdef CONFIG_RCU_NOCB_CPU

/*
 * Offload callback processing from the boot-time-specified set of CPUs
 * specified by rcu_nocb_mask.  For the CPUs in the set, there are kthreads
 * created that pull the callbacks from the corresponding CPU, wait for
 * a grace period to elapse, and invoke the callbacks.  These kthreads
 * are organized into leaders, which manage incoming callbacks, wait for
 * grace periods, and awaken followers, and the followers, which only
 * invoke callbacks.  Each leader is its own follower.  The no-CBs CPUs
 * do a wake_up() on their kthread when they insert a callback into any
 * empty list, unless the rcu_nocb_poll boot parameter has been specified,
 * in which case each kthread actively polls its CPU.  (Which isn't so great
 * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
 *
 * This is intended to be used in conjunction with Frederic Weisbecker's
 * adaptive-idle work, which would seriously reduce OS jitter on CPUs
 * running CPU-bound user-mode computations.
 *
 * Offloading of callbacks can also be used as an energy-efficiency
 * measure because CPUs with no RCU callbacks queued are more aggressive
 * about entering dyntick-idle mode.
 */


/*
 * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
 * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a
 * comma-separated list of CPUs and/or CPU ranges.  If an invalid list is
 * given, a warning is emitted and all CPUs are offloaded.
 */
static int __init rcu_nocb_setup(char *str)
{
	alloc_bootmem_cpumask_var(&rcu_nocb_mask);
	if (!strcasecmp(str, "all"))
		cpumask_setall(rcu_nocb_mask);
	else
		if (cpulist_parse(str, rcu_nocb_mask)) {
			pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
			cpumask_setall(rcu_nocb_mask);
		}
	return 1;
}
__setup("rcu_nocbs=", rcu_nocb_setup);

static int __init parse_rcu_nocb_poll(char *arg)
{
	rcu_nocb_poll = true;
	return 0;
}
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
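
/*
 * Example (illustrative only): booting with
 *
 *	rcu_nocbs=1-3,8 rcu_nocb_poll
 *
 * offloads callbacks from CPUs 1, 2, 3, and 8 and makes the corresponding
 * rcuo kthreads poll rather than wait for per-callback wakeups.
 */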

/*
 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
 * grace period.
 */
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
	swake_up_all(sq);
}

static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
{
	return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
}

static void rcu_init_one_nocb(struct rcu_node *rnp)
{
	init_swait_queue_head(&rnp->nocb_gp_wq[0]);
	init_swait_queue_head(&rnp->nocb_gp_wq[1]);
}

/* Is the specified CPU a no-CBs CPU? */
bool rcu_is_nocb_cpu(int cpu)
{
	if (cpumask_available(rcu_nocb_mask))
		return cpumask_test_cpu(cpu, rcu_nocb_mask);
	return false;
}

/*
 * Kick the leader kthread for this NOCB group.  Caller holds ->nocb_lock
 * and this function releases it.
 */
static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
			       unsigned long flags)
	__releases(rdp->nocb_lock)
{
	struct rcu_data *rdp_leader = rdp->nocb_leader;

	lockdep_assert_held(&rdp->nocb_lock);
	if (!READ_ONCE(rdp_leader->nocb_kthread)) {
		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
		return;
	}
	if (rdp_leader->nocb_leader_sleep || force) {
		/* Prior smp_mb__after_atomic() orders against prior enqueue. */
		WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
		del_timer(&rdp->nocb_timer);
		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
		smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */
*/ 1635 swake_up_one(&rdp_leader->nocb_wq); 1636 } else { 1637 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1638 } 1639 } 1640 1641 /* 1642 * Kick the leader kthread for this NOCB group, but caller has not 1643 * acquired locks. 1644 */ 1645 static void wake_nocb_leader(struct rcu_data *rdp, bool force) 1646 { 1647 unsigned long flags; 1648 1649 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 1650 __wake_nocb_leader(rdp, force, flags); 1651 } 1652 1653 /* 1654 * Arrange to wake the leader kthread for this NOCB group at some 1655 * future time when it is safe to do so. 1656 */ 1657 static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, 1658 const char *reason) 1659 { 1660 unsigned long flags; 1661 1662 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 1663 if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) 1664 mod_timer(&rdp->nocb_timer, jiffies + 1); 1665 WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); 1666 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); 1667 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1668 } 1669 1670 /* Does rcu_barrier need to queue an RCU callback on the specified CPU? */ 1671 static bool rcu_nocb_cpu_needs_barrier(int cpu) 1672 { 1673 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 1674 unsigned long ret; 1675 #ifdef CONFIG_PROVE_RCU 1676 struct rcu_head *rhp; 1677 #endif /* #ifdef CONFIG_PROVE_RCU */ 1678 1679 /* 1680 * Check count of all no-CBs callbacks awaiting invocation. 1681 * There needs to be a barrier before this function is called, 1682 * but associated with a prior determination that no more 1683 * callbacks would be posted. In the worst case, the first 1684 * barrier in rcu_barrier() suffices (but the caller cannot 1685 * necessarily rely on this, not a substitute for the caller 1686 * getting the concurrency design right!). There must also be a 1687 * barrier between the following load and posting of a callback 1688 * (if a callback is in fact needed). This is associated with an 1689 * atomic_inc() in the caller. 1690 */ 1691 ret = rcu_get_n_cbs_nocb_cpu(rdp); 1692 1693 #ifdef CONFIG_PROVE_RCU 1694 rhp = READ_ONCE(rdp->nocb_head); 1695 if (!rhp) 1696 rhp = READ_ONCE(rdp->nocb_gp_head); 1697 if (!rhp) 1698 rhp = READ_ONCE(rdp->nocb_follower_head); 1699 1700 /* Having no rcuo kthread but CBs after scheduler starts is bad! */ 1701 if (!READ_ONCE(rdp->nocb_kthread) && rhp && 1702 rcu_scheduler_fully_active) { 1703 /* RCU callback enqueued before CPU first came online??? */ 1704 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", 1705 cpu, rhp->func); 1706 WARN_ON_ONCE(1); 1707 } 1708 #endif /* #ifdef CONFIG_PROVE_RCU */ 1709 1710 return !!ret; 1711 } 1712 1713 /* 1714 * Enqueue the specified string of rcu_head structures onto the specified 1715 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the 1716 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy 1717 * counts are supplied by rhcount and rhcount_lazy. 1718 * 1719 * If warranted, also wake up the kthread servicing this CPUs queues. 1720 */ 1721 static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, 1722 struct rcu_head *rhp, 1723 struct rcu_head **rhtp, 1724 int rhcount, int rhcount_lazy, 1725 unsigned long flags) 1726 { 1727 int len; 1728 struct rcu_head **old_rhpp; 1729 struct task_struct *t; 1730 1731 /* Enqueue the callback on the nocb list and update counts. */ 1732 atomic_long_add(rhcount, &rdp->nocb_q_count); 1733 /* rcu_barrier() relies on ->nocb_q_count add before xchg. 
*/ 1734 old_rhpp = xchg(&rdp->nocb_tail, rhtp); 1735 WRITE_ONCE(*old_rhpp, rhp); 1736 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); 1737 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */ 1738 1739 /* If we are not being polled and there is a kthread, awaken it ... */ 1740 t = READ_ONCE(rdp->nocb_kthread); 1741 if (rcu_nocb_poll || !t) { 1742 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 1743 TPS("WakeNotPoll")); 1744 return; 1745 } 1746 len = rcu_get_n_cbs_nocb_cpu(rdp); 1747 if (old_rhpp == &rdp->nocb_head) { 1748 if (!irqs_disabled_flags(flags)) { 1749 /* ... if queue was empty ... */ 1750 wake_nocb_leader(rdp, false); 1751 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 1752 TPS("WakeEmpty")); 1753 } else { 1754 wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, 1755 TPS("WakeEmptyIsDeferred")); 1756 } 1757 rdp->qlen_last_fqs_check = 0; 1758 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 1759 /* ... or if many callbacks queued. */ 1760 if (!irqs_disabled_flags(flags)) { 1761 wake_nocb_leader(rdp, true); 1762 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 1763 TPS("WakeOvf")); 1764 } else { 1765 wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, 1766 TPS("WakeOvfIsDeferred")); 1767 } 1768 rdp->qlen_last_fqs_check = LONG_MAX / 2; 1769 } else { 1770 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); 1771 } 1772 return; 1773 } 1774 1775 /* 1776 * This is a helper for __call_rcu(), which invokes this when the normal 1777 * callback queue is inoperable. If this is not a no-CBs CPU, this 1778 * function returns failure back to __call_rcu(), which can complain 1779 * appropriately. 1780 * 1781 * Otherwise, this function queues the callback where the corresponding 1782 * "rcuo" kthread can find it. 1783 */ 1784 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 1785 bool lazy, unsigned long flags) 1786 { 1787 1788 if (!rcu_is_nocb_cpu(rdp->cpu)) 1789 return false; 1790 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); 1791 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 1792 trace_rcu_kfree_callback(rcu_state.name, rhp, 1793 (unsigned long)rhp->func, 1794 -atomic_long_read(&rdp->nocb_q_count_lazy), 1795 -rcu_get_n_cbs_nocb_cpu(rdp)); 1796 else 1797 trace_rcu_callback(rcu_state.name, rhp, 1798 -atomic_long_read(&rdp->nocb_q_count_lazy), 1799 -rcu_get_n_cbs_nocb_cpu(rdp)); 1800 1801 /* 1802 * If called from an extended quiescent state with interrupts 1803 * disabled, invoke the RCU core in order to allow the idle-entry 1804 * deferred-wakeup check to function. 1805 */ 1806 if (irqs_disabled_flags(flags) && 1807 !rcu_is_watching() && 1808 cpu_online(smp_processor_id())) 1809 invoke_rcu_core(); 1810 1811 return true; 1812 } 1813 1814 /* 1815 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is 1816 * not a no-CBs CPU. 1817 */ 1818 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, 1819 struct rcu_data *rdp, 1820 unsigned long flags) 1821 { 1822 lockdep_assert_irqs_disabled(); 1823 if (!rcu_is_nocb_cpu(smp_processor_id())) 1824 return false; /* Not NOCBs CPU, caller must migrate CBs. 
*/ 1825 __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), 1826 rcu_segcblist_tail(&rdp->cblist), 1827 rcu_segcblist_n_cbs(&rdp->cblist), 1828 rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); 1829 rcu_segcblist_init(&rdp->cblist); 1830 rcu_segcblist_disable(&rdp->cblist); 1831 return true; 1832 } 1833 1834 /* 1835 * If necessary, kick off a new grace period, and either way wait 1836 * for a subsequent grace period to complete. 1837 */ 1838 static void rcu_nocb_wait_gp(struct rcu_data *rdp) 1839 { 1840 unsigned long c; 1841 bool d; 1842 unsigned long flags; 1843 bool needwake; 1844 struct rcu_node *rnp = rdp->mynode; 1845 1846 local_irq_save(flags); 1847 c = rcu_seq_snap(&rcu_state.gp_seq); 1848 if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { 1849 local_irq_restore(flags); 1850 } else { 1851 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 1852 needwake = rcu_start_this_gp(rnp, rdp, c); 1853 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1854 if (needwake) 1855 rcu_gp_kthread_wake(); 1856 } 1857 1858 /* 1859 * Wait for the grace period. Do so interruptibly to avoid messing 1860 * up the load average. 1861 */ 1862 trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); 1863 for (;;) { 1864 swait_event_interruptible_exclusive( 1865 rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], 1866 (d = rcu_seq_done(&rnp->gp_seq, c))); 1867 if (likely(d)) 1868 break; 1869 WARN_ON(signal_pending(current)); 1870 trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait")); 1871 } 1872 trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); 1873 smp_mb(); /* Ensure that CB invocation happens after GP end. */ 1874 } 1875 1876 /* 1877 * Leaders come here to wait for additional callbacks to show up. 1878 * This function does not return until callbacks appear. 1879 */ 1880 static void nocb_leader_wait(struct rcu_data *my_rdp) 1881 { 1882 bool firsttime = true; 1883 unsigned long flags; 1884 bool gotcbs; 1885 struct rcu_data *rdp; 1886 struct rcu_head **tail; 1887 1888 wait_again: 1889 1890 /* Wait for callbacks to appear. */ 1891 if (!rcu_nocb_poll) { 1892 trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep")); 1893 swait_event_interruptible_exclusive(my_rdp->nocb_wq, 1894 !READ_ONCE(my_rdp->nocb_leader_sleep)); 1895 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); 1896 my_rdp->nocb_leader_sleep = true; 1897 WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); 1898 del_timer(&my_rdp->nocb_timer); 1899 raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); 1900 } else if (firsttime) { 1901 firsttime = false; /* Don't drown trace log with "Poll"! */ 1902 trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Poll")); 1903 } 1904 1905 /* 1906 * Each pass through the following loop checks a follower for CBs. 1907 * We are our own first follower. Any CBs found are moved to 1908 * nocb_gp_head, where they await a grace period. 1909 */ 1910 gotcbs = false; 1911 smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ 1912 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { 1913 rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); 1914 if (!rdp->nocb_gp_head) 1915 continue; /* No CBs here, try next follower. */ 1916 1917 /* Move callbacks to wait-for-GP list, which is empty. */ 1918 WRITE_ONCE(rdp->nocb_head, NULL); 1919 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); 1920 gotcbs = true; 1921 } 1922 1923 /* No callbacks? Sleep a bit if polling, and go retry. 
*/ 1924 if (unlikely(!gotcbs)) { 1925 WARN_ON(signal_pending(current)); 1926 if (rcu_nocb_poll) { 1927 schedule_timeout_interruptible(1); 1928 } else { 1929 trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, 1930 TPS("WokeEmpty")); 1931 } 1932 goto wait_again; 1933 } 1934 1935 /* Wait for one grace period. */ 1936 rcu_nocb_wait_gp(my_rdp); 1937 1938 /* Each pass through the following loop wakes a follower, if needed. */ 1939 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { 1940 if (!rcu_nocb_poll && 1941 READ_ONCE(rdp->nocb_head) && 1942 READ_ONCE(my_rdp->nocb_leader_sleep)) { 1943 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); 1944 my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ 1945 raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); 1946 } 1947 if (!rdp->nocb_gp_head) 1948 continue; /* No CBs, so no need to wake follower. */ 1949 1950 /* Append callbacks to follower's "done" list. */ 1951 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 1952 tail = rdp->nocb_follower_tail; 1953 rdp->nocb_follower_tail = rdp->nocb_gp_tail; 1954 *tail = rdp->nocb_gp_head; 1955 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1956 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 1957 /* List was empty, so wake up the follower. */ 1958 swake_up_one(&rdp->nocb_wq); 1959 } 1960 } 1961 1962 /* If we (the leader) don't have CBs, go wait some more. */ 1963 if (!my_rdp->nocb_follower_head) 1964 goto wait_again; 1965 } 1966 1967 /* 1968 * Followers come here to wait for additional callbacks to show up. 1969 * This function does not return until callbacks appear. 1970 */ 1971 static void nocb_follower_wait(struct rcu_data *rdp) 1972 { 1973 for (;;) { 1974 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); 1975 swait_event_interruptible_exclusive(rdp->nocb_wq, 1976 READ_ONCE(rdp->nocb_follower_head)); 1977 if (smp_load_acquire(&rdp->nocb_follower_head)) { 1978 /* ^^^ Ensure CB invocation follows _head test. */ 1979 return; 1980 } 1981 WARN_ON(signal_pending(current)); 1982 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); 1983 } 1984 } 1985 1986 /* 1987 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes 1988 * callbacks queued by the corresponding no-CBs CPU, however, there is 1989 * an optional leader-follower relationship so that the grace-period 1990 * kthreads don't have to do quite so many wakeups. 1991 */ 1992 static int rcu_nocb_kthread(void *arg) 1993 { 1994 int c, cl; 1995 unsigned long flags; 1996 struct rcu_head *list; 1997 struct rcu_head *next; 1998 struct rcu_head **tail; 1999 struct rcu_data *rdp = arg; 2000 2001 /* Each pass through this loop invokes one batch of callbacks */ 2002 for (;;) { 2003 /* Wait for callbacks. */ 2004 if (rdp->nocb_leader == rdp) 2005 nocb_leader_wait(rdp); 2006 else 2007 nocb_follower_wait(rdp); 2008 2009 /* Pull the ready-to-invoke callbacks onto local list. */ 2010 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 2011 list = rdp->nocb_follower_head; 2012 rdp->nocb_follower_head = NULL; 2013 tail = rdp->nocb_follower_tail; 2014 rdp->nocb_follower_tail = &rdp->nocb_follower_head; 2015 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 2016 if (WARN_ON_ONCE(!list)) 2017 continue; 2018 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty")); 2019 2020 /* Each pass through the following loop invokes a callback. 
*/ 2021 trace_rcu_batch_start(rcu_state.name, 2022 atomic_long_read(&rdp->nocb_q_count_lazy), 2023 rcu_get_n_cbs_nocb_cpu(rdp), -1); 2024 c = cl = 0; 2025 while (list) { 2026 next = list->next; 2027 /* Wait for enqueuing to complete, if needed. */ 2028 while (next == NULL && &list->next != tail) { 2029 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 2030 TPS("WaitQueue")); 2031 schedule_timeout_interruptible(1); 2032 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 2033 TPS("WokeQueue")); 2034 next = list->next; 2035 } 2036 debug_rcu_head_unqueue(list); 2037 local_bh_disable(); 2038 if (__rcu_reclaim(rcu_state.name, list)) 2039 cl++; 2040 c++; 2041 local_bh_enable(); 2042 cond_resched_tasks_rcu_qs(); 2043 list = next; 2044 } 2045 trace_rcu_batch_end(rcu_state.name, c, !!list, 0, 0, 1); 2046 smp_mb__before_atomic(); /* _add after CB invocation. */ 2047 atomic_long_add(-c, &rdp->nocb_q_count); 2048 atomic_long_add(-cl, &rdp->nocb_q_count_lazy); 2049 } 2050 return 0; 2051 } 2052 2053 /* Is a deferred wakeup of rcu_nocb_kthread() required? */ 2054 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) 2055 { 2056 return READ_ONCE(rdp->nocb_defer_wakeup); 2057 } 2058 2059 /* Do a deferred wakeup of rcu_nocb_kthread(). */ 2060 static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) 2061 { 2062 unsigned long flags; 2063 int ndw; 2064 2065 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 2066 if (!rcu_nocb_need_deferred_wakeup(rdp)) { 2067 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 2068 return; 2069 } 2070 ndw = READ_ONCE(rdp->nocb_defer_wakeup); 2071 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); 2072 __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); 2073 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); 2074 } 2075 2076 /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ 2077 static void do_nocb_deferred_wakeup_timer(struct timer_list *t) 2078 { 2079 struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); 2080 2081 do_nocb_deferred_wakeup_common(rdp); 2082 } 2083 2084 /* 2085 * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. 2086 * This means we do an inexact common-case check. Note that if 2087 * we miss, ->nocb_timer will eventually clean things up. 
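 * The timer's handler, do_nocb_deferred_wakeup_timer() above, reaches the
 * same do_nocb_deferred_wakeup_common() slow path, which rechecks under
 * ->nocb_lock before doing the wakeup.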
2088 */ 2089 static void do_nocb_deferred_wakeup(struct rcu_data *rdp) 2090 { 2091 if (rcu_nocb_need_deferred_wakeup(rdp)) 2092 do_nocb_deferred_wakeup_common(rdp); 2093 } 2094 2095 void __init rcu_init_nohz(void) 2096 { 2097 int cpu; 2098 bool need_rcu_nocb_mask = false; 2099 2100 #if defined(CONFIG_NO_HZ_FULL) 2101 if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) 2102 need_rcu_nocb_mask = true; 2103 #endif /* #if defined(CONFIG_NO_HZ_FULL) */ 2104 2105 if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { 2106 if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { 2107 pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); 2108 return; 2109 } 2110 } 2111 if (!cpumask_available(rcu_nocb_mask)) 2112 return; 2113 2114 #if defined(CONFIG_NO_HZ_FULL) 2115 if (tick_nohz_full_running) 2116 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); 2117 #endif /* #if defined(CONFIG_NO_HZ_FULL) */ 2118 2119 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { 2120 pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); 2121 cpumask_and(rcu_nocb_mask, cpu_possible_mask, 2122 rcu_nocb_mask); 2123 } 2124 if (cpumask_empty(rcu_nocb_mask)) 2125 pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); 2126 else 2127 pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", 2128 cpumask_pr_args(rcu_nocb_mask)); 2129 if (rcu_nocb_poll) 2130 pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); 2131 2132 for_each_cpu(cpu, rcu_nocb_mask) 2133 init_nocb_callback_list(per_cpu_ptr(&rcu_data, cpu)); 2134 rcu_organize_nocb_kthreads(); 2135 } 2136 2137 /* Initialize per-rcu_data variables for no-CBs CPUs. */ 2138 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2139 { 2140 rdp->nocb_tail = &rdp->nocb_head; 2141 init_swait_queue_head(&rdp->nocb_wq); 2142 rdp->nocb_follower_tail = &rdp->nocb_follower_head; 2143 raw_spin_lock_init(&rdp->nocb_lock); 2144 timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); 2145 } 2146 2147 /* 2148 * If the specified CPU is a no-CBs CPU that does not already have its 2149 * rcuo kthread, spawn it. If the CPUs are brought online out of order, 2150 * this can require re-organizing the leader-follower relationships. 2151 */ 2152 static void rcu_spawn_one_nocb_kthread(int cpu) 2153 { 2154 struct rcu_data *rdp; 2155 struct rcu_data *rdp_last; 2156 struct rcu_data *rdp_old_leader; 2157 struct rcu_data *rdp_spawn = per_cpu_ptr(&rcu_data, cpu); 2158 struct task_struct *t; 2159 2160 /* 2161 * If this isn't a no-CBs CPU or if it already has an rcuo kthread, 2162 * then nothing to do. 2163 */ 2164 if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread) 2165 return; 2166 2167 /* If we didn't spawn the leader first, reorganize! */ 2168 rdp_old_leader = rdp_spawn->nocb_leader; 2169 if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) { 2170 rdp_last = NULL; 2171 rdp = rdp_old_leader; 2172 do { 2173 rdp->nocb_leader = rdp_spawn; 2174 if (rdp_last && rdp != rdp_spawn) 2175 rdp_last->nocb_next_follower = rdp; 2176 if (rdp == rdp_spawn) { 2177 rdp = rdp->nocb_next_follower; 2178 } else { 2179 rdp_last = rdp; 2180 rdp = rdp->nocb_next_follower; 2181 rdp_last->nocb_next_follower = NULL; 2182 } 2183 } while (rdp); 2184 rdp_spawn->nocb_next_follower = rdp_old_leader; 2185 } 2186 2187 /* Spawn the kthread for this CPU. 
*/ 2188 t = kthread_run(rcu_nocb_kthread, rdp_spawn, 2189 "rcuo%c/%d", rcu_state.abbr, cpu); 2190 if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__)) 2191 return; 2192 WRITE_ONCE(rdp_spawn->nocb_kthread, t); 2193 } 2194 2195 /* 2196 * If the specified CPU is a no-CBs CPU that does not already have its 2197 * rcuo kthread, spawn it. 2198 */ 2199 static void rcu_spawn_cpu_nocb_kthread(int cpu) 2200 { 2201 if (rcu_scheduler_fully_active) 2202 rcu_spawn_one_nocb_kthread(cpu); 2203 } 2204 2205 /* 2206 * Once the scheduler is running, spawn rcuo kthreads for all online 2207 * no-CBs CPUs. This assumes that the early_initcall()s happen before 2208 * non-boot CPUs come online -- if this changes, we will need to add 2209 * some mutual exclusion. 2210 */ 2211 static void __init rcu_spawn_nocb_kthreads(void) 2212 { 2213 int cpu; 2214 2215 for_each_online_cpu(cpu) 2216 rcu_spawn_cpu_nocb_kthread(cpu); 2217 } 2218 2219 /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ 2220 static int rcu_nocb_leader_stride = -1; 2221 module_param(rcu_nocb_leader_stride, int, 0444); 2222 2223 /* 2224 * Initialize leader-follower relationships for all no-CBs CPU. 2225 */ 2226 static void __init rcu_organize_nocb_kthreads(void) 2227 { 2228 int cpu; 2229 int ls = rcu_nocb_leader_stride; 2230 int nl = 0; /* Next leader. */ 2231 struct rcu_data *rdp; 2232 struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ 2233 struct rcu_data *rdp_prev = NULL; 2234 2235 if (!cpumask_available(rcu_nocb_mask)) 2236 return; 2237 if (ls == -1) { 2238 ls = int_sqrt(nr_cpu_ids); 2239 rcu_nocb_leader_stride = ls; 2240 } 2241 2242 /* 2243 * Each pass through this loop sets up one rcu_data structure. 2244 * Should the corresponding CPU come online in the future, then 2245 * we will spawn the needed set of rcu_nocb_kthread() kthreads. 2246 */ 2247 for_each_cpu(cpu, rcu_nocb_mask) { 2248 rdp = per_cpu_ptr(&rcu_data, cpu); 2249 if (rdp->cpu >= nl) { 2250 /* New leader, set up for followers & next leader. */ 2251 nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; 2252 rdp->nocb_leader = rdp; 2253 rdp_leader = rdp; 2254 } else { 2255 /* Another follower, link to previous leader. */ 2256 rdp->nocb_leader = rdp_leader; 2257 rdp_prev->nocb_next_follower = rdp; 2258 } 2259 rdp_prev = rdp; 2260 } 2261 } 2262 2263 /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ 2264 static bool init_nocb_callback_list(struct rcu_data *rdp) 2265 { 2266 if (!rcu_is_nocb_cpu(rdp->cpu)) 2267 return false; 2268 2269 /* If there are early-boot callbacks, move them to nocb lists. */ 2270 if (!rcu_segcblist_empty(&rdp->cblist)) { 2271 rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); 2272 rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); 2273 atomic_long_set(&rdp->nocb_q_count, 2274 rcu_segcblist_n_cbs(&rdp->cblist)); 2275 atomic_long_set(&rdp->nocb_q_count_lazy, 2276 rcu_segcblist_n_lazy_cbs(&rdp->cblist)); 2277 rcu_segcblist_init(&rdp->cblist); 2278 } 2279 rcu_segcblist_disable(&rdp->cblist); 2280 return true; 2281 } 2282 2283 /* 2284 * Bind the current task to the offloaded CPUs. If there are no offloaded 2285 * CPUs, leave the task unbound. Splat if the bind attempt fails. 
2286 */ 2287 void rcu_bind_current_to_nocb(void) 2288 { 2289 if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) 2290 WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); 2291 } 2292 EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); 2293 2294 /* 2295 * Return the number of RCU callbacks still queued from the specified 2296 * CPU, which must be a nocbs CPU. 2297 */ 2298 static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) 2299 { 2300 return atomic_long_read(&rdp->nocb_q_count); 2301 } 2302 2303 #else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2304 2305 static bool rcu_nocb_cpu_needs_barrier(int cpu) 2306 { 2307 WARN_ON_ONCE(1); /* Should be dead code. */ 2308 return false; 2309 } 2310 2311 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) 2312 { 2313 } 2314 2315 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) 2316 { 2317 return NULL; 2318 } 2319 2320 static void rcu_init_one_nocb(struct rcu_node *rnp) 2321 { 2322 } 2323 2324 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2325 bool lazy, unsigned long flags) 2326 { 2327 return false; 2328 } 2329 2330 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, 2331 struct rcu_data *rdp, 2332 unsigned long flags) 2333 { 2334 return false; 2335 } 2336 2337 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2338 { 2339 } 2340 2341 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) 2342 { 2343 return false; 2344 } 2345 2346 static void do_nocb_deferred_wakeup(struct rcu_data *rdp) 2347 { 2348 } 2349 2350 static void rcu_spawn_cpu_nocb_kthread(int cpu) 2351 { 2352 } 2353 2354 static void __init rcu_spawn_nocb_kthreads(void) 2355 { 2356 } 2357 2358 static bool init_nocb_callback_list(struct rcu_data *rdp) 2359 { 2360 return false; 2361 } 2362 2363 static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) 2364 { 2365 return 0; 2366 } 2367 2368 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 2369 2370 /* 2371 * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the 2372 * grace-period kthread will do force_quiescent_state() processing? 2373 * The idea is to avoid waking up RCU core processing on such a 2374 * CPU unless the grace period has extended for too long. 2375 * 2376 * This code relies on the fact that all NO_HZ_FULL CPUs are also 2377 * CONFIG_RCU_NOCB_CPU CPUs. 2378 */ 2379 static bool rcu_nohz_full_cpu(void) 2380 { 2381 #ifdef CONFIG_NO_HZ_FULL 2382 if (tick_nohz_full_cpu(smp_processor_id()) && 2383 (!rcu_gp_in_progress() || 2384 ULONG_CMP_LT(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) 2385 return true; 2386 #endif /* #ifdef CONFIG_NO_HZ_FULL */ 2387 return false; 2388 } 2389 2390 /* 2391 * Bind the RCU grace-period kthreads to the housekeeping CPU. 2392 */ 2393 static void rcu_bind_gp_kthread(void) 2394 { 2395 if (!tick_nohz_full_enabled()) 2396 return; 2397 housekeeping_affine(current, HK_FLAG_RCU); 2398 } 2399 2400 /* Record the current task on dyntick-idle entry. */ 2401 static void rcu_dynticks_task_enter(void) 2402 { 2403 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) 2404 WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); 2405 #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ 2406 } 2407 2408 /* Record no current task on dyntick-idle exit. 
*/ 2409 static void rcu_dynticks_task_exit(void) 2410 { 2411 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) 2412 WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); 2413 #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ 2414 } 2415
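/*
 * Illustrative sketch (user-space C, not part of the kernel build) of the
 * leader/follower grouping arithmetic used by rcu_organize_nocb_kthreads()
 * above.  It assumes a hypothetical machine on which all CPUs 0..nr_cpus-1
 * are offloaded, and it uses sqrt() in place of the kernel's int_sqrt() to
 * model the default -1 stride; the output shows which CPUs become leaders
 * and which leader each follower is attached to.
 */
#if 0	/* Example only; build as ordinary user-space C with -lm. */
#include <math.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpus = 8;		/* Hypothetical count of offloaded CPUs. */
	int ls = (int)sqrt(nr_cpus);	/* Leader stride, as with the -1 default. */
	int nl = 0;			/* Threshold CPU number for the next leader. */
	int leader = -1;
	int cpu;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (cpu >= nl) {
			/* New leader: advance the next-leader threshold. */
			nl = DIV_ROUND_UP(cpu + 1, ls) * ls;
			leader = cpu;
			printf("CPU %d: leads a new group\n", cpu);
		} else {
			printf("CPU %d: follows CPU %d\n", cpu, leader);
		}
	}
	return 0;
}
#endif
/*
 * With the square-root default, both the number of leaders and the number
 * of followers per leader scale as roughly the square root of the CPU
 * count, which helps bound the wakeup fan-out at each level: the
 * grace-period machinery wakes only leaders, and each leader wakes only
 * its own followers.
 */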