/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
 * Internal non-public definitions that provide either classic
 * or preemptible semantics.
 *
 * Copyright Red Hat, 2009
 * Copyright IBM Corporation, 2009
 *
 * Author: Ingo Molnar <mingo@elte.hu>
 *	   Paul E. McKenney <paulmck@linux.ibm.com>
 */

#include <linux/delay.h>
#include <linux/gfp.h>
#include <linux/oom.h>
#include <linux/sched/debug.h>
#include <linux/smpboot.h>
#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include "../time/tick-internal.h"

#ifdef CONFIG_RCU_BOOST
#include "../locking/rtmutex_common.h"
#else /* #ifdef CONFIG_RCU_BOOST */

/*
 * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
 * all uses are in dead code.  Provide a definition to keep the compiler
 * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
 * This probably needs to be excluded from -rt builds.
 */
#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)

#endif /* #else #ifdef CONFIG_RCU_BOOST */

#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */

/*
 * Check the RCU kernel configuration parameters and print informative
 * messages about anything out of the ordinary.
 */
static void __init rcu_bootup_announce_oddness(void)
{
	if (IS_ENABLED(CONFIG_RCU_TRACE))
		pr_info("\tRCU event tracing is enabled.\n");
	if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
	    (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
		pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n",
			RCU_FANOUT);
	if (rcu_fanout_exact)
		pr_info("\tHierarchical RCU autobalancing is disabled.\n");
	if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
		pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
	if (IS_ENABLED(CONFIG_PROVE_RCU))
		pr_info("\tRCU lockdep checking is enabled.\n");
	if (RCU_NUM_LVLS >= 4)
		pr_info("\tFour(or more)-level hierarchy is enabled.\n");
	if (RCU_FANOUT_LEAF != 16)
		pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
			RCU_FANOUT_LEAF);
	if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
		pr_info("\tBoot-time adjustment of leaf fanout to %d.\n",
			rcu_fanout_leaf);
	if (nr_cpu_ids != NR_CPUS)
		pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_BOOST
	pr_info("\tRCU priority boosting: priority %d delay %d ms.\n",
		kthread_prio, CONFIG_RCU_BOOST_DELAY);
#endif
	if (blimit != DEFAULT_RCU_BLIMIT)
		pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
	if (qhimark != DEFAULT_RCU_QHIMARK)
		pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
	if (qlowmark != DEFAULT_RCU_QLOMARK)
		pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
	if (jiffies_till_first_fqs != ULONG_MAX)
		pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
	if (jiffies_till_next_fqs != ULONG_MAX)
		pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
	if (jiffies_till_sched_qs != ULONG_MAX)
		pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n", jiffies_till_sched_qs);
	if (rcu_kick_kthreads)
		pr_info("\tKick kthreads if too-long grace period.\n");
	if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
		pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
	if (gp_preinit_delay)
		pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
	if (gp_init_delay)
		pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
	if (gp_cleanup_delay)
		pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
		pr_info("\tRCU debug extended QS entry/exit.\n");
	rcupdate_announce_bootup_oddness();
}

#ifdef CONFIG_PREEMPT_RCU

static void rcu_report_exp_rnp(struct rcu_node *rnp, bool wake);
static void rcu_read_unlock_special(struct task_struct *t);

/*
 * Tell them what RCU they are running.
 */
static void __init rcu_bootup_announce(void)
{
	pr_info("Preemptible hierarchical RCU implementation.\n");
	rcu_bootup_announce_oddness();
}

/* Flags for rcu_preempt_ctxt_queue() decision table. */
#define RCU_GP_TASKS	0x8
#define RCU_EXP_TASKS	0x4
#define RCU_GP_BLKD	0x2
#define RCU_EXP_BLKD	0x1

/*
 * Queues a task preempted within an RCU-preempt read-side critical
 * section into the appropriate location within the ->blkd_tasks list,
 * depending on the states of any ongoing normal and expedited grace
 * periods.  The ->gp_tasks pointer indicates which element the normal
 * grace period is waiting on (NULL if none), and the ->exp_tasks pointer
 * indicates which element the expedited grace period is waiting on (again,
 * NULL if none).  If a grace period is waiting on a given element in the
 * ->blkd_tasks list, it also waits on all subsequent elements.  Thus,
 * adding a task to the tail of the list blocks any grace period that is
 * already waiting on one of the elements.  In contrast, adding a task
 * to the head of the list won't block any grace period that is already
 * waiting on one of the elements.
 *
 * This queuing is imprecise, and can sometimes make an ongoing grace
 * period wait for a task that is not strictly speaking blocking it.
 * Given the choice, we needlessly block a normal grace period rather than
 * blocking an expedited grace period.
 *
 * Note that an endless sequence of expedited grace periods still cannot
 * indefinitely postpone a normal grace period.  Eventually, all of the
 * fixed number of preempted tasks blocking the normal grace period that are
 * not also blocking the expedited grace period will resume and complete
 * their RCU read-side critical sections.  At that point, the ->gp_tasks
 * pointer will equal the ->exp_tasks pointer, at which point the end of
 * the corresponding expedited grace period will also be the end of the
 * normal grace period.
 */
static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
	__releases(rnp->lock) /* But leaves rrupts disabled. */
{
	int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
			 (rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
			 (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) +
			 (rnp->expmask & rdp->grpmask ?
RCU_EXP_BLKD : 0); 157 struct task_struct *t = current; 158 159 raw_lockdep_assert_held_rcu_node(rnp); 160 WARN_ON_ONCE(rdp->mynode != rnp); 161 WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); 162 /* RCU better not be waiting on newly onlined CPUs! */ 163 WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask & 164 rdp->grpmask); 165 166 /* 167 * Decide where to queue the newly blocked task. In theory, 168 * this could be an if-statement. In practice, when I tried 169 * that, it was quite messy. 170 */ 171 switch (blkd_state) { 172 case 0: 173 case RCU_EXP_TASKS: 174 case RCU_EXP_TASKS + RCU_GP_BLKD: 175 case RCU_GP_TASKS: 176 case RCU_GP_TASKS + RCU_EXP_TASKS: 177 178 /* 179 * Blocking neither GP, or first task blocking the normal 180 * GP but not blocking the already-waiting expedited GP. 181 * Queue at the head of the list to avoid unnecessarily 182 * blocking the already-waiting GPs. 183 */ 184 list_add(&t->rcu_node_entry, &rnp->blkd_tasks); 185 break; 186 187 case RCU_EXP_BLKD: 188 case RCU_GP_BLKD: 189 case RCU_GP_BLKD + RCU_EXP_BLKD: 190 case RCU_GP_TASKS + RCU_EXP_BLKD: 191 case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: 192 case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: 193 194 /* 195 * First task arriving that blocks either GP, or first task 196 * arriving that blocks the expedited GP (with the normal 197 * GP already waiting), or a task arriving that blocks 198 * both GPs with both GPs already waiting. Queue at the 199 * tail of the list to avoid any GP waiting on any of the 200 * already queued tasks that are not blocking it. 201 */ 202 list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks); 203 break; 204 205 case RCU_EXP_TASKS + RCU_EXP_BLKD: 206 case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: 207 case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: 208 209 /* 210 * Second or subsequent task blocking the expedited GP. 211 * The task either does not block the normal GP, or is the 212 * first task blocking the normal GP. Queue just after 213 * the first task blocking the expedited GP. 214 */ 215 list_add(&t->rcu_node_entry, rnp->exp_tasks); 216 break; 217 218 case RCU_GP_TASKS + RCU_GP_BLKD: 219 case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: 220 221 /* 222 * Second or subsequent task blocking the normal GP. 223 * The task does not block the expedited GP. Queue just 224 * after the first task blocking the normal GP. 225 */ 226 list_add(&t->rcu_node_entry, rnp->gp_tasks); 227 break; 228 229 default: 230 231 /* Yet another exercise in excessive paranoia. */ 232 WARN_ON_ONCE(1); 233 break; 234 } 235 236 /* 237 * We have now queued the task. If it was the first one to 238 * block either grace period, update the ->gp_tasks and/or 239 * ->exp_tasks pointers, respectively, to reference the newly 240 * blocked tasks. 241 */ 242 if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { 243 rnp->gp_tasks = &t->rcu_node_entry; 244 WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); 245 } 246 if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) 247 rnp->exp_tasks = &t->rcu_node_entry; 248 WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != 249 !(rnp->qsmask & rdp->grpmask)); 250 WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != 251 !(rnp->expmask & rdp->grpmask)); 252 raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ 253 254 /* 255 * Report the quiescent state for the expedited GP. This expedited 256 * GP should not be able to end until we report, so there should be 257 * no need to check for a subsequent expedited GP. (Though we are 258 * still in a quiescent state in any case.) 
259 */ 260 if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs) 261 rcu_report_exp_rdp(rdp); 262 else 263 WARN_ON_ONCE(rdp->deferred_qs); 264 } 265 266 /* 267 * Record a preemptible-RCU quiescent state for the specified CPU. 268 * Note that this does not necessarily mean that the task currently running 269 * on the CPU is in a quiescent state: Instead, it means that the current 270 * grace period need not wait on any RCU read-side critical section that 271 * starts later on this CPU. It also means that if the current task is 272 * in an RCU read-side critical section, it has already added itself to 273 * some leaf rcu_node structure's ->blkd_tasks list. In addition to the 274 * current task, there might be any number of other tasks blocked while 275 * in an RCU read-side critical section. 276 * 277 * Callers to this function must disable preemption. 278 */ 279 static void rcu_qs(void) 280 { 281 RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n"); 282 if (__this_cpu_read(rcu_data.cpu_no_qs.s)) { 283 trace_rcu_grace_period(TPS("rcu_preempt"), 284 __this_cpu_read(rcu_data.gp_seq), 285 TPS("cpuqs")); 286 __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); 287 barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ 288 current->rcu_read_unlock_special.b.need_qs = false; 289 } 290 } 291 292 /* 293 * We have entered the scheduler, and the current task might soon be 294 * context-switched away from. If this task is in an RCU read-side 295 * critical section, we will no longer be able to rely on the CPU to 296 * record that fact, so we enqueue the task on the blkd_tasks list. 297 * The task will dequeue itself when it exits the outermost enclosing 298 * RCU read-side critical section. Therefore, the current grace period 299 * cannot be permitted to complete until the blkd_tasks list entries 300 * predating the current grace period drain, in other words, until 301 * rnp->gp_tasks becomes NULL. 302 * 303 * Caller must disable interrupts. 304 */ 305 void rcu_note_context_switch(bool preempt) 306 { 307 struct task_struct *t = current; 308 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 309 struct rcu_node *rnp; 310 311 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 312 trace_rcu_utilization(TPS("Start context switch")); 313 lockdep_assert_irqs_disabled(); 314 WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); 315 if (t->rcu_read_lock_nesting > 0 && 316 !t->rcu_read_unlock_special.b.blocked) { 317 318 /* Possibly blocking in an RCU read-side critical section. */ 319 rnp = rdp->mynode; 320 raw_spin_lock_rcu_node(rnp); 321 t->rcu_read_unlock_special.b.blocked = true; 322 t->rcu_blocked_node = rnp; 323 324 /* 325 * Verify the CPU's sanity, trace the preemption, and 326 * then queue the task as required based on the states 327 * of any ongoing and expedited grace periods. 328 */ 329 WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); 330 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 331 trace_rcu_preempt_task(rcu_state.name, 332 t->pid, 333 (rnp->qsmask & rdp->grpmask) 334 ? rnp->gp_seq 335 : rcu_seq_snap(&rnp->gp_seq)); 336 rcu_preempt_ctxt_queue(rnp, rdp); 337 } else if (t->rcu_read_lock_nesting < 0 && 338 t->rcu_read_unlock_special.s) { 339 340 /* 341 * Complete exit from RCU read-side critical section on 342 * behalf of preempted instance of __rcu_read_unlock(). 
343 */ 344 rcu_read_unlock_special(t); 345 rcu_preempt_deferred_qs(t); 346 } else { 347 rcu_preempt_deferred_qs(t); 348 } 349 350 /* 351 * Either we were not in an RCU read-side critical section to 352 * begin with, or we have now recorded that critical section 353 * globally. Either way, we can now note a quiescent state 354 * for this CPU. Again, if we were in an RCU read-side critical 355 * section, and if that critical section was blocking the current 356 * grace period, then the fact that the task has been enqueued 357 * means that we continue to block the current grace period. 358 */ 359 rcu_qs(); 360 if (rdp->deferred_qs) 361 rcu_report_exp_rdp(rdp); 362 trace_rcu_utilization(TPS("End context switch")); 363 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 364 } 365 EXPORT_SYMBOL_GPL(rcu_note_context_switch); 366 367 /* 368 * Check for preempted RCU readers blocking the current grace period 369 * for the specified rcu_node structure. If the caller needs a reliable 370 * answer, it must hold the rcu_node's ->lock. 371 */ 372 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) 373 { 374 return rnp->gp_tasks != NULL; 375 } 376 377 /* Bias and limit values for ->rcu_read_lock_nesting. */ 378 #define RCU_NEST_BIAS INT_MAX 379 #define RCU_NEST_NMAX (-INT_MAX / 2) 380 #define RCU_NEST_PMAX (INT_MAX / 2) 381 382 /* 383 * Preemptible RCU implementation for rcu_read_lock(). 384 * Just increment ->rcu_read_lock_nesting, shared state will be updated 385 * if we block. 386 */ 387 void __rcu_read_lock(void) 388 { 389 current->rcu_read_lock_nesting++; 390 if (IS_ENABLED(CONFIG_PROVE_LOCKING)) 391 WARN_ON_ONCE(current->rcu_read_lock_nesting > RCU_NEST_PMAX); 392 barrier(); /* critical section after entry code. */ 393 } 394 EXPORT_SYMBOL_GPL(__rcu_read_lock); 395 396 /* 397 * Preemptible RCU implementation for rcu_read_unlock(). 398 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost 399 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then 400 * invoke rcu_read_unlock_special() to clean up after a context switch 401 * in an RCU read-side critical section and other special cases. 402 */ 403 void __rcu_read_unlock(void) 404 { 405 struct task_struct *t = current; 406 407 if (t->rcu_read_lock_nesting != 1) { 408 --t->rcu_read_lock_nesting; 409 } else { 410 barrier(); /* critical section before exit code. */ 411 t->rcu_read_lock_nesting = -RCU_NEST_BIAS; 412 barrier(); /* assign before ->rcu_read_unlock_special load */ 413 if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s))) 414 rcu_read_unlock_special(t); 415 barrier(); /* ->rcu_read_unlock_special load before assign */ 416 t->rcu_read_lock_nesting = 0; 417 } 418 if (IS_ENABLED(CONFIG_PROVE_LOCKING)) { 419 int rrln = t->rcu_read_lock_nesting; 420 421 WARN_ON_ONCE(rrln < 0 && rrln > RCU_NEST_NMAX); 422 } 423 } 424 EXPORT_SYMBOL_GPL(__rcu_read_unlock); 425 426 /* 427 * Advance a ->blkd_tasks-list pointer to the next entry, instead 428 * returning NULL if at the end of the list. 429 */ 430 static struct list_head *rcu_next_node_entry(struct task_struct *t, 431 struct rcu_node *rnp) 432 { 433 struct list_head *np; 434 435 np = t->rcu_node_entry.next; 436 if (np == &rnp->blkd_tasks) 437 np = NULL; 438 return np; 439 } 440 441 /* 442 * Return true if the specified rcu_node structure has tasks that were 443 * preempted within an RCU read-side critical section. 
444 */ 445 static bool rcu_preempt_has_tasks(struct rcu_node *rnp) 446 { 447 return !list_empty(&rnp->blkd_tasks); 448 } 449 450 /* 451 * Report deferred quiescent states. The deferral time can 452 * be quite short, for example, in the case of the call from 453 * rcu_read_unlock_special(). 454 */ 455 static void 456 rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) 457 { 458 bool empty_exp; 459 bool empty_norm; 460 bool empty_exp_now; 461 struct list_head *np; 462 bool drop_boost_mutex = false; 463 struct rcu_data *rdp; 464 struct rcu_node *rnp; 465 union rcu_special special; 466 467 /* 468 * If RCU core is waiting for this CPU to exit its critical section, 469 * report the fact that it has exited. Because irqs are disabled, 470 * t->rcu_read_unlock_special cannot change. 471 */ 472 special = t->rcu_read_unlock_special; 473 rdp = this_cpu_ptr(&rcu_data); 474 if (!special.s && !rdp->deferred_qs) { 475 local_irq_restore(flags); 476 return; 477 } 478 if (special.b.need_qs) { 479 rcu_qs(); 480 t->rcu_read_unlock_special.b.need_qs = false; 481 if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) { 482 local_irq_restore(flags); 483 return; 484 } 485 } 486 487 /* 488 * Respond to a request by an expedited grace period for a 489 * quiescent state from this CPU. Note that requests from 490 * tasks are handled when removing the task from the 491 * blocked-tasks list below. 492 */ 493 if (rdp->deferred_qs) { 494 rcu_report_exp_rdp(rdp); 495 if (!t->rcu_read_unlock_special.s) { 496 local_irq_restore(flags); 497 return; 498 } 499 } 500 501 /* Clean up if blocked during RCU read-side critical section. */ 502 if (special.b.blocked) { 503 t->rcu_read_unlock_special.b.blocked = false; 504 505 /* 506 * Remove this task from the list it blocked on. The task 507 * now remains queued on the rcu_node corresponding to the 508 * CPU it first blocked on, so there is no longer any need 509 * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia. 510 */ 511 rnp = t->rcu_blocked_node; 512 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 513 WARN_ON_ONCE(rnp != t->rcu_blocked_node); 514 WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); 515 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); 516 WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && 517 (!empty_norm || rnp->qsmask)); 518 empty_exp = sync_rcu_preempt_exp_done(rnp); 519 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 520 np = rcu_next_node_entry(t, rnp); 521 list_del_init(&t->rcu_node_entry); 522 t->rcu_blocked_node = NULL; 523 trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), 524 rnp->gp_seq, t->pid); 525 if (&t->rcu_node_entry == rnp->gp_tasks) 526 rnp->gp_tasks = np; 527 if (&t->rcu_node_entry == rnp->exp_tasks) 528 rnp->exp_tasks = np; 529 if (IS_ENABLED(CONFIG_RCU_BOOST)) { 530 /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ 531 drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; 532 if (&t->rcu_node_entry == rnp->boost_tasks) 533 rnp->boost_tasks = np; 534 } 535 536 /* 537 * If this was the last task on the current list, and if 538 * we aren't waiting on any CPUs, report the quiescent state. 539 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 540 * so we must take a snapshot of the expedited state. 
541 */ 542 empty_exp_now = sync_rcu_preempt_exp_done(rnp); 543 if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { 544 trace_rcu_quiescent_state_report(TPS("preempt_rcu"), 545 rnp->gp_seq, 546 0, rnp->qsmask, 547 rnp->level, 548 rnp->grplo, 549 rnp->grphi, 550 !!rnp->gp_tasks); 551 rcu_report_unblock_qs_rnp(rnp, flags); 552 } else { 553 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 554 } 555 556 /* Unboost if we were boosted. */ 557 if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) 558 rt_mutex_futex_unlock(&rnp->boost_mtx); 559 560 /* 561 * If this was the last task on the expedited lists, 562 * then we need to report up the rcu_node hierarchy. 563 */ 564 if (!empty_exp && empty_exp_now) 565 rcu_report_exp_rnp(rnp, true); 566 } else { 567 local_irq_restore(flags); 568 } 569 } 570 571 /* 572 * Is a deferred quiescent-state pending, and are we also not in 573 * an RCU read-side critical section? It is the caller's responsibility 574 * to ensure it is otherwise safe to report any deferred quiescent 575 * states. The reason for this is that it is safe to report a 576 * quiescent state during context switch even though preemption 577 * is disabled. This function cannot be expected to understand these 578 * nuances, so the caller must handle them. 579 */ 580 static bool rcu_preempt_need_deferred_qs(struct task_struct *t) 581 { 582 return (__this_cpu_read(rcu_data.deferred_qs) || 583 READ_ONCE(t->rcu_read_unlock_special.s)) && 584 t->rcu_read_lock_nesting <= 0; 585 } 586 587 /* 588 * Report a deferred quiescent state if needed and safe to do so. 589 * As with rcu_preempt_need_deferred_qs(), "safe" involves only 590 * not being in an RCU read-side critical section. The caller must 591 * evaluate safety in terms of interrupt, softirq, and preemption 592 * disabling. 593 */ 594 static void rcu_preempt_deferred_qs(struct task_struct *t) 595 { 596 unsigned long flags; 597 bool couldrecurse = t->rcu_read_lock_nesting >= 0; 598 599 if (!rcu_preempt_need_deferred_qs(t)) 600 return; 601 if (couldrecurse) 602 t->rcu_read_lock_nesting -= RCU_NEST_BIAS; 603 local_irq_save(flags); 604 rcu_preempt_deferred_qs_irqrestore(t, flags); 605 if (couldrecurse) 606 t->rcu_read_lock_nesting += RCU_NEST_BIAS; 607 } 608 609 /* 610 * Handle special cases during rcu_read_unlock(), such as needing to 611 * notify RCU core processing or task having blocked during the RCU 612 * read-side critical section. 613 */ 614 static void rcu_read_unlock_special(struct task_struct *t) 615 { 616 unsigned long flags; 617 bool preempt_bh_were_disabled = 618 !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); 619 bool irqs_were_disabled; 620 621 /* NMI handlers cannot block and cannot safely manipulate state. */ 622 if (in_nmi()) 623 return; 624 625 local_irq_save(flags); 626 irqs_were_disabled = irqs_disabled_flags(flags); 627 if (preempt_bh_were_disabled || irqs_were_disabled) { 628 WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); 629 /* Need to defer quiescent state until everything is enabled. */ 630 if (irqs_were_disabled) { 631 /* Enabling irqs does not reschedule, so... */ 632 raise_softirq_irqoff(RCU_SOFTIRQ); 633 } else { 634 /* Enabling BH or preempt does reschedule, so... 
*/ 635 set_tsk_need_resched(current); 636 set_preempt_need_resched(); 637 } 638 local_irq_restore(flags); 639 return; 640 } 641 WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); 642 rcu_preempt_deferred_qs_irqrestore(t, flags); 643 } 644 645 /* 646 * Dump detailed information for all tasks blocking the current RCU 647 * grace period on the specified rcu_node structure. 648 */ 649 static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) 650 { 651 unsigned long flags; 652 struct task_struct *t; 653 654 raw_spin_lock_irqsave_rcu_node(rnp, flags); 655 if (!rcu_preempt_blocked_readers_cgp(rnp)) { 656 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 657 return; 658 } 659 t = list_entry(rnp->gp_tasks->prev, 660 struct task_struct, rcu_node_entry); 661 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 662 /* 663 * We could be printing a lot while holding a spinlock. 664 * Avoid triggering hard lockup. 665 */ 666 touch_nmi_watchdog(); 667 sched_show_task(t); 668 } 669 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 670 } 671 672 /* 673 * Dump detailed information for all tasks blocking the current RCU 674 * grace period. 675 */ 676 static void rcu_print_detail_task_stall(void) 677 { 678 struct rcu_node *rnp = rcu_get_root(); 679 680 rcu_print_detail_task_stall_rnp(rnp); 681 rcu_for_each_leaf_node(rnp) 682 rcu_print_detail_task_stall_rnp(rnp); 683 } 684 685 static void rcu_print_task_stall_begin(struct rcu_node *rnp) 686 { 687 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", 688 rnp->level, rnp->grplo, rnp->grphi); 689 } 690 691 static void rcu_print_task_stall_end(void) 692 { 693 pr_cont("\n"); 694 } 695 696 /* 697 * Scan the current list of tasks blocked within RCU read-side critical 698 * sections, printing out the tid of each. 699 */ 700 static int rcu_print_task_stall(struct rcu_node *rnp) 701 { 702 struct task_struct *t; 703 int ndetected = 0; 704 705 if (!rcu_preempt_blocked_readers_cgp(rnp)) 706 return 0; 707 rcu_print_task_stall_begin(rnp); 708 t = list_entry(rnp->gp_tasks->prev, 709 struct task_struct, rcu_node_entry); 710 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 711 pr_cont(" P%d", t->pid); 712 ndetected++; 713 } 714 rcu_print_task_stall_end(); 715 return ndetected; 716 } 717 718 /* 719 * Scan the current list of tasks blocked within RCU read-side critical 720 * sections, printing out the tid of each that is blocking the current 721 * expedited grace period. 722 */ 723 static int rcu_print_task_exp_stall(struct rcu_node *rnp) 724 { 725 struct task_struct *t; 726 int ndetected = 0; 727 728 if (!rnp->exp_tasks) 729 return 0; 730 t = list_entry(rnp->exp_tasks->prev, 731 struct task_struct, rcu_node_entry); 732 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 733 pr_cont(" P%d", t->pid); 734 ndetected++; 735 } 736 return ndetected; 737 } 738 739 /* 740 * Check that the list of blocked tasks for the newly completed grace 741 * period is in fact empty. It is a serious bug to complete a grace 742 * period that still has RCU readers blocked! This function must be 743 * invoked -before- updating this rnp's ->gp_seq, and the rnp's ->lock 744 * must be held by the caller. 745 * 746 * Also, if there are blocked tasks on the list, they automatically 747 * block the newly created grace period, so set up ->gp_tasks accordingly. 
 */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
	struct task_struct *t;

	RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
		dump_blkd_tasks(rnp, 10);
	if (rcu_preempt_has_tasks(rnp) &&
	    (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
		rnp->gp_tasks = rnp->blkd_tasks.next;
		t = container_of(rnp->gp_tasks, struct task_struct,
				 rcu_node_entry);
		trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
						rnp->gp_seq, t->pid);
	}
	WARN_ON_ONCE(rnp->qsmask);
}

/*
 * Check for a quiescent state from the current CPU, including voluntary
 * context switches for Tasks RCU.  When a task blocks, the task is
 * recorded in the corresponding CPU's rcu_node structure, which is checked
 * elsewhere, hence this function need only check for quiescent states
 * related to the current CPU, not to those related to tasks.
 */
static void rcu_flavor_sched_clock_irq(int user)
{
	struct task_struct *t = current;

	if (user || rcu_is_cpu_rrupt_from_idle()) {
		rcu_note_voluntary_context_switch(current);
	}
	if (t->rcu_read_lock_nesting > 0 ||
	    (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
		/* No QS, force context switch if deferred. */
		if (rcu_preempt_need_deferred_qs(t)) {
			set_tsk_need_resched(t);
			set_preempt_need_resched();
		}
	} else if (rcu_preempt_need_deferred_qs(t)) {
		rcu_preempt_deferred_qs(t); /* Report deferred QS. */
		return;
	} else if (!t->rcu_read_lock_nesting) {
		rcu_qs(); /* Report immediate QS. */
		return;
	}

	/* If GP is oldish, ask for help from rcu_read_unlock_special(). */
	if (t->rcu_read_lock_nesting > 0 &&
	    __this_cpu_read(rcu_data.core_needs_qs) &&
	    __this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
	    !t->rcu_read_unlock_special.b.need_qs &&
	    time_after(jiffies, rcu_state.gp_start + HZ))
		t->rcu_read_unlock_special.b.need_qs = true;
}

/*
 * Check for a task exiting while in a preemptible-RCU read-side
 * critical section, clean up if so.  No need to issue warnings,
 * as debug_check_no_locks_held() already does this if lockdep
 * is enabled.
 */
void exit_rcu(void)
{
	struct task_struct *t = current;

	if (likely(list_empty(&current->rcu_node_entry)))
		return;
	t->rcu_read_lock_nesting = 1;
	barrier();
	t->rcu_read_unlock_special.b.blocked = true;
	__rcu_read_unlock();
	rcu_preempt_deferred_qs(current);
}

/*
 * Dump the blocked-tasks state, but limit the list dump to the
 * specified number of elements.
827 */ 828 static void 829 dump_blkd_tasks(struct rcu_node *rnp, int ncheck) 830 { 831 int cpu; 832 int i; 833 struct list_head *lhp; 834 bool onl; 835 struct rcu_data *rdp; 836 struct rcu_node *rnp1; 837 838 raw_lockdep_assert_held_rcu_node(rnp); 839 pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", 840 __func__, rnp->grplo, rnp->grphi, rnp->level, 841 (long)rnp->gp_seq, (long)rnp->completedqs); 842 for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) 843 pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", 844 __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); 845 pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", 846 __func__, rnp->gp_tasks, rnp->boost_tasks, rnp->exp_tasks); 847 pr_info("%s: ->blkd_tasks", __func__); 848 i = 0; 849 list_for_each(lhp, &rnp->blkd_tasks) { 850 pr_cont(" %p", lhp); 851 if (++i >= 10) 852 break; 853 } 854 pr_cont("\n"); 855 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { 856 rdp = per_cpu_ptr(&rcu_data, cpu); 857 onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); 858 pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", 859 cpu, ".o"[onl], 860 (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, 861 (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); 862 } 863 } 864 865 #else /* #ifdef CONFIG_PREEMPT_RCU */ 866 867 /* 868 * Tell them what RCU they are running. 869 */ 870 static void __init rcu_bootup_announce(void) 871 { 872 pr_info("Hierarchical RCU implementation.\n"); 873 rcu_bootup_announce_oddness(); 874 } 875 876 /* 877 * Note a quiescent state for PREEMPT=n. Because we do not need to know 878 * how many quiescent states passed, just if there was at least one since 879 * the start of the grace period, this just sets a flag. The caller must 880 * have disabled preemption. 881 */ 882 static void rcu_qs(void) 883 { 884 RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!"); 885 if (!__this_cpu_read(rcu_data.cpu_no_qs.s)) 886 return; 887 trace_rcu_grace_period(TPS("rcu_sched"), 888 __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); 889 __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); 890 if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) 891 return; 892 __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false); 893 rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); 894 } 895 896 /* 897 * Register an urgently needed quiescent state. If there is an 898 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight 899 * dyntick-idle quiescent state visible to other CPUs, which will in 900 * some cases serve for expedited as well as normal grace periods. 901 * Either way, register a lightweight quiescent state. 902 * 903 * The barrier() calls are redundant in the common case when this is 904 * called externally, but just in case this is called from within this 905 * file. 906 * 907 */ 908 void rcu_all_qs(void) 909 { 910 unsigned long flags; 911 912 if (!raw_cpu_read(rcu_data.rcu_urgent_qs)) 913 return; 914 preempt_disable(); 915 /* Load rcu_urgent_qs before other flags. */ 916 if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { 917 preempt_enable(); 918 return; 919 } 920 this_cpu_write(rcu_data.rcu_urgent_qs, false); 921 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 922 if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) { 923 local_irq_save(flags); 924 rcu_momentary_dyntick_idle(); 925 local_irq_restore(flags); 926 } 927 rcu_qs(); 928 barrier(); /* Avoid RCU read-side critical sections leaking up. 
*/ 929 preempt_enable(); 930 } 931 EXPORT_SYMBOL_GPL(rcu_all_qs); 932 933 /* 934 * Note a PREEMPT=n context switch. The caller must have disabled interrupts. 935 */ 936 void rcu_note_context_switch(bool preempt) 937 { 938 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 939 trace_rcu_utilization(TPS("Start context switch")); 940 rcu_qs(); 941 /* Load rcu_urgent_qs before other flags. */ 942 if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) 943 goto out; 944 this_cpu_write(rcu_data.rcu_urgent_qs, false); 945 if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) 946 rcu_momentary_dyntick_idle(); 947 if (!preempt) 948 rcu_tasks_qs(current); 949 out: 950 trace_rcu_utilization(TPS("End context switch")); 951 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 952 } 953 EXPORT_SYMBOL_GPL(rcu_note_context_switch); 954 955 /* 956 * Because preemptible RCU does not exist, there are never any preempted 957 * RCU readers. 958 */ 959 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) 960 { 961 return 0; 962 } 963 964 /* 965 * Because there is no preemptible RCU, there can be no readers blocked. 966 */ 967 static bool rcu_preempt_has_tasks(struct rcu_node *rnp) 968 { 969 return false; 970 } 971 972 /* 973 * Because there is no preemptible RCU, there can be no deferred quiescent 974 * states. 975 */ 976 static bool rcu_preempt_need_deferred_qs(struct task_struct *t) 977 { 978 return false; 979 } 980 static void rcu_preempt_deferred_qs(struct task_struct *t) { } 981 982 /* 983 * Because preemptible RCU does not exist, we never have to check for 984 * tasks blocked within RCU read-side critical sections. 985 */ 986 static void rcu_print_detail_task_stall(void) 987 { 988 } 989 990 /* 991 * Because preemptible RCU does not exist, we never have to check for 992 * tasks blocked within RCU read-side critical sections. 993 */ 994 static int rcu_print_task_stall(struct rcu_node *rnp) 995 { 996 return 0; 997 } 998 999 /* 1000 * Because preemptible RCU does not exist, we never have to check for 1001 * tasks blocked within RCU read-side critical sections that are 1002 * blocking the current expedited grace period. 1003 */ 1004 static int rcu_print_task_exp_stall(struct rcu_node *rnp) 1005 { 1006 return 0; 1007 } 1008 1009 /* 1010 * Because there is no preemptible RCU, there can be no readers blocked, 1011 * so there is no need to check for blocked tasks. So check only for 1012 * bogus qsmask values. 1013 */ 1014 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 1015 { 1016 WARN_ON_ONCE(rnp->qsmask); 1017 } 1018 1019 /* 1020 * Check to see if this CPU is in a non-context-switch quiescent state, 1021 * namely user mode and idle loop. 1022 */ 1023 static void rcu_flavor_sched_clock_irq(int user) 1024 { 1025 if (user || rcu_is_cpu_rrupt_from_idle()) { 1026 1027 /* 1028 * Get here if this CPU took its interrupt from user 1029 * mode or from the idle loop, and if this is not a 1030 * nested interrupt. In this case, the CPU is in 1031 * a quiescent state, so note it. 1032 * 1033 * No memory barrier is required here because rcu_qs() 1034 * references only CPU-local variables that other CPUs 1035 * neither access nor modify, at least not while the 1036 * corresponding CPU is online. 1037 */ 1038 1039 rcu_qs(); 1040 } 1041 } 1042 1043 /* 1044 * Because preemptible RCU does not exist, tasks cannot possibly exit 1045 * while in preemptible RCU read-side critical sections. 
1046 */ 1047 void exit_rcu(void) 1048 { 1049 } 1050 1051 /* 1052 * Dump the guaranteed-empty blocked-tasks state. Trust but verify. 1053 */ 1054 static void 1055 dump_blkd_tasks(struct rcu_node *rnp, int ncheck) 1056 { 1057 WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); 1058 } 1059 1060 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ 1061 1062 #ifdef CONFIG_RCU_BOOST 1063 1064 static void rcu_wake_cond(struct task_struct *t, int status) 1065 { 1066 /* 1067 * If the thread is yielding, only wake it when this 1068 * is invoked from idle 1069 */ 1070 if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) 1071 wake_up_process(t); 1072 } 1073 1074 /* 1075 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1076 * or ->boost_tasks, advancing the pointer to the next task in the 1077 * ->blkd_tasks list. 1078 * 1079 * Note that irqs must be enabled: boosting the task can block. 1080 * Returns 1 if there are more tasks needing to be boosted. 1081 */ 1082 static int rcu_boost(struct rcu_node *rnp) 1083 { 1084 unsigned long flags; 1085 struct task_struct *t; 1086 struct list_head *tb; 1087 1088 if (READ_ONCE(rnp->exp_tasks) == NULL && 1089 READ_ONCE(rnp->boost_tasks) == NULL) 1090 return 0; /* Nothing left to boost. */ 1091 1092 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1093 1094 /* 1095 * Recheck under the lock: all tasks in need of boosting 1096 * might exit their RCU read-side critical sections on their own. 1097 */ 1098 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { 1099 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1100 return 0; 1101 } 1102 1103 /* 1104 * Preferentially boost tasks blocking expedited grace periods. 1105 * This cannot starve the normal grace periods because a second 1106 * expedited grace period must boost all blocked tasks, including 1107 * those blocking the pre-existing normal grace period. 1108 */ 1109 if (rnp->exp_tasks != NULL) 1110 tb = rnp->exp_tasks; 1111 else 1112 tb = rnp->boost_tasks; 1113 1114 /* 1115 * We boost task t by manufacturing an rt_mutex that appears to 1116 * be held by task t. We leave a pointer to that rt_mutex where 1117 * task t can find it, and task t will release the mutex when it 1118 * exits its outermost RCU read-side critical section. Then 1119 * simply acquiring this artificial rt_mutex will boost task 1120 * t's priority. (Thanks to tglx for suggesting this approach!) 1121 * 1122 * Note that task t must acquire rnp->lock to remove itself from 1123 * the ->blkd_tasks list, which it will do from exit() if from 1124 * nowhere else. We therefore are guaranteed that task t will 1125 * stay around at least until we drop rnp->lock. Note that 1126 * rnp->lock also resolves races between our priority boosting 1127 * and task t's exiting its outermost RCU read-side critical 1128 * section. 1129 */ 1130 t = container_of(tb, struct task_struct, rcu_node_entry); 1131 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); 1132 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1133 /* Lock only for side effect: boosts task t's priority. */ 1134 rt_mutex_lock(&rnp->boost_mtx); 1135 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ 1136 1137 return READ_ONCE(rnp->exp_tasks) != NULL || 1138 READ_ONCE(rnp->boost_tasks) != NULL; 1139 } 1140 1141 /* 1142 * Priority-boosting kthread, one per leaf rcu_node. 
1143 */ 1144 static int rcu_boost_kthread(void *arg) 1145 { 1146 struct rcu_node *rnp = (struct rcu_node *)arg; 1147 int spincnt = 0; 1148 int more2boost; 1149 1150 trace_rcu_utilization(TPS("Start boost kthread@init")); 1151 for (;;) { 1152 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1153 trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); 1154 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1155 trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); 1156 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1157 more2boost = rcu_boost(rnp); 1158 if (more2boost) 1159 spincnt++; 1160 else 1161 spincnt = 0; 1162 if (spincnt > 10) { 1163 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; 1164 trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); 1165 schedule_timeout_interruptible(2); 1166 trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); 1167 spincnt = 0; 1168 } 1169 } 1170 /* NOTREACHED */ 1171 trace_rcu_utilization(TPS("End boost kthread@notreached")); 1172 return 0; 1173 } 1174 1175 /* 1176 * Check to see if it is time to start boosting RCU readers that are 1177 * blocking the current grace period, and, if so, tell the per-rcu_node 1178 * kthread to start boosting them. If there is an expedited grace 1179 * period in progress, it is always time to boost. 1180 * 1181 * The caller must hold rnp->lock, which this function releases. 1182 * The ->boost_kthread_task is immortal, so we don't need to worry 1183 * about it going away. 1184 */ 1185 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1186 __releases(rnp->lock) 1187 { 1188 struct task_struct *t; 1189 1190 raw_lockdep_assert_held_rcu_node(rnp); 1191 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { 1192 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1193 return; 1194 } 1195 if (rnp->exp_tasks != NULL || 1196 (rnp->gp_tasks != NULL && 1197 rnp->boost_tasks == NULL && 1198 rnp->qsmask == 0 && 1199 ULONG_CMP_GE(jiffies, rnp->boost_time))) { 1200 if (rnp->exp_tasks == NULL) 1201 rnp->boost_tasks = rnp->gp_tasks; 1202 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1203 t = rnp->boost_kthread_task; 1204 if (t) 1205 rcu_wake_cond(t, rnp->boost_kthread_status); 1206 } else { 1207 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1208 } 1209 } 1210 1211 /* 1212 * Wake up the per-CPU kthread to invoke RCU callbacks. 1213 */ 1214 static void invoke_rcu_callbacks_kthread(void) 1215 { 1216 unsigned long flags; 1217 1218 local_irq_save(flags); 1219 __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); 1220 if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL && 1221 current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) { 1222 rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task), 1223 __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); 1224 } 1225 local_irq_restore(flags); 1226 } 1227 1228 /* 1229 * Is the current CPU running the RCU-callbacks kthread? 1230 * Caller must have preemption disabled. 1231 */ 1232 static bool rcu_is_callbacks_kthread(void) 1233 { 1234 return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current; 1235 } 1236 1237 #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1238 1239 /* 1240 * Do priority-boost accounting for the start of a new grace period. 1241 */ 1242 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1243 { 1244 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; 1245 } 1246 1247 /* 1248 * Create an RCU-boost kthread for the specified node if one does not 1249 * already exist. 
We only create this kthread for preemptible RCU. 1250 * Returns zero if all is well, a negated errno otherwise. 1251 */ 1252 static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) 1253 { 1254 int rnp_index = rnp - rcu_get_root(); 1255 unsigned long flags; 1256 struct sched_param sp; 1257 struct task_struct *t; 1258 1259 if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) 1260 return 0; 1261 1262 if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) 1263 return 0; 1264 1265 rcu_state.boost = 1; 1266 if (rnp->boost_kthread_task != NULL) 1267 return 0; 1268 t = kthread_create(rcu_boost_kthread, (void *)rnp, 1269 "rcub/%d", rnp_index); 1270 if (IS_ERR(t)) 1271 return PTR_ERR(t); 1272 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1273 rnp->boost_kthread_task = t; 1274 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1275 sp.sched_priority = kthread_prio; 1276 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1277 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1278 return 0; 1279 } 1280 1281 static void rcu_cpu_kthread_setup(unsigned int cpu) 1282 { 1283 struct sched_param sp; 1284 1285 sp.sched_priority = kthread_prio; 1286 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); 1287 } 1288 1289 static void rcu_cpu_kthread_park(unsigned int cpu) 1290 { 1291 per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; 1292 } 1293 1294 static int rcu_cpu_kthread_should_run(unsigned int cpu) 1295 { 1296 return __this_cpu_read(rcu_data.rcu_cpu_has_work); 1297 } 1298 1299 /* 1300 * Per-CPU kernel thread that invokes RCU callbacks. This replaces 1301 * the RCU softirq used in configurations of RCU that do not support RCU 1302 * priority boosting. 1303 */ 1304 static void rcu_cpu_kthread(unsigned int cpu) 1305 { 1306 unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); 1307 char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); 1308 int spincnt; 1309 1310 for (spincnt = 0; spincnt < 10; spincnt++) { 1311 trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); 1312 local_bh_disable(); 1313 *statusp = RCU_KTHREAD_RUNNING; 1314 local_irq_disable(); 1315 work = *workp; 1316 *workp = 0; 1317 local_irq_enable(); 1318 if (work) 1319 rcu_do_batch(this_cpu_ptr(&rcu_data)); 1320 local_bh_enable(); 1321 if (*workp == 0) { 1322 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); 1323 *statusp = RCU_KTHREAD_WAITING; 1324 return; 1325 } 1326 } 1327 *statusp = RCU_KTHREAD_YIELDING; 1328 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); 1329 schedule_timeout_interruptible(2); 1330 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); 1331 *statusp = RCU_KTHREAD_WAITING; 1332 } 1333 1334 /* 1335 * Set the per-rcu_node kthread's affinity to cover all CPUs that are 1336 * served by the rcu_node in question. The CPU hotplug lock is still 1337 * held, so the value of rnp->qsmaskinit will be stable. 1338 * 1339 * We don't include outgoingcpu in the affinity set, use -1 if there is 1340 * no outgoing CPU. If there are no CPUs left in the affinity set, 1341 * this function allows the kthread to execute on any CPU. 
 */
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
	struct task_struct *t = rnp->boost_kthread_task;
	unsigned long mask = rcu_rnp_online_cpus(rnp);
	cpumask_var_t cm;
	int cpu;

	if (!t)
		return;
	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
		return;
	for_each_leaf_node_possible_cpu(rnp, cpu)
		if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
		    cpu != outgoingcpu)
			cpumask_set_cpu(cpu, cm);
	if (cpumask_weight(cm) == 0)
		cpumask_setall(cm);
	set_cpus_allowed_ptr(t, cm);
	free_cpumask_var(cm);
}

static struct smp_hotplug_thread rcu_cpu_thread_spec = {
	.store			= &rcu_data.rcu_cpu_kthread_task,
	.thread_should_run	= rcu_cpu_kthread_should_run,
	.thread_fn		= rcu_cpu_kthread,
	.thread_comm		= "rcuc/%u",
	.setup			= rcu_cpu_kthread_setup,
	.park			= rcu_cpu_kthread_park,
};

/*
 * Spawn boost kthreads -- called as soon as the scheduler is running.
 */
static void __init rcu_spawn_boost_kthreads(void)
{
	struct rcu_node *rnp;
	int cpu;

	for_each_possible_cpu(cpu)
		per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
	if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__))
		return;
	rcu_for_each_leaf_node(rnp)
		(void)rcu_spawn_one_boost_kthread(rnp);
}

static void rcu_prepare_kthreads(int cpu)
{
	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
	struct rcu_node *rnp = rdp->mynode;

	/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
	if (rcu_scheduler_fully_active)
		(void)rcu_spawn_one_boost_kthread(rnp);
}

#else /* #ifdef CONFIG_RCU_BOOST */

static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
	__releases(rnp->lock)
{
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}

static void invoke_rcu_callbacks_kthread(void)
{
	WARN_ON_ONCE(1);
}

static bool rcu_is_callbacks_kthread(void)
{
	return false;
}

static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}

static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}

static void __init rcu_spawn_boost_kthreads(void)
{
}

static void rcu_prepare_kthreads(int cpu)
{
}

#endif /* #else #ifdef CONFIG_RCU_BOOST */

#if !defined(CONFIG_RCU_FAST_NO_HZ)

/*
 * Check to see if any future RCU-related work will need to be done
 * by the current CPU, even if none need be done immediately, returning
 * 1 if so.  This function is part of the RCU implementation; it is -not-
 * an exported member of the RCU API.
 *
 * Because we do not have RCU_FAST_NO_HZ, just check whether or not this
 * CPU has RCU callbacks queued.
 */
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
	*nextevt = KTIME_MAX;
	return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist);
}

/*
 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
 * after it.
 */
static void rcu_cleanup_after_idle(void)
{
}

/*
 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
 * is nothing.
 */
static void rcu_prepare_for_idle(void)
{
}

#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */

/*
 * This code is invoked when a CPU goes idle, at which point we want
 * to have the CPU do everything required for RCU so that it can enter
 * the energy-efficient dyntick-idle mode.  This is handled by a
 * state machine implemented by rcu_prepare_for_idle() below.
 *
 * The following two preprocessor symbols control this state machine:
 *
 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
 *	to sleep in dyntick-idle mode with RCU callbacks pending.  This
 *	is sized to be roughly one RCU grace period.  Those energy-efficiency
 *	benchmarkers who might otherwise be tempted to set this to a large
 *	number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
 *	system.  And if you are -that- concerned about energy efficiency,
 *	just power the system down and be done with it!
 * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
 *	permitted to sleep in dyntick-idle mode with only lazy RCU
 *	callbacks pending.  Setting this too high can OOM your system.
 *
 * The values below work well in practice.  If future workloads require
 * adjustment, they can be converted into kernel config parameters, though
 * making the state machine smarter might be a better option.
 */
#define RCU_IDLE_GP_DELAY 4		/* Roughly one grace period. */
#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)	/* Roughly six seconds. */

static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
module_param(rcu_idle_gp_delay, int, 0644);
static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
module_param(rcu_idle_lazy_gp_delay, int, 0644);

/*
 * Try to advance callbacks on the current CPU, but only if it has been
 * awhile since the last time we did so.  Afterwards, if there are any
 * callbacks ready for immediate invocation, return true.
 */
static bool __maybe_unused rcu_try_advance_all_cbs(void)
{
	bool cbs_ready = false;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	struct rcu_node *rnp;

	/* Exit early if we advanced recently. */
	if (jiffies == rdp->last_advance_all)
		return false;
	rdp->last_advance_all = jiffies;

	rnp = rdp->mynode;

	/*
	 * Don't bother checking unless a grace period has
	 * completed since we last checked and there are
	 * callbacks not yet ready to invoke.
	 */
	if ((rcu_seq_completed_gp(rdp->gp_seq,
				  rcu_seq_current(&rnp->gp_seq)) ||
	     unlikely(READ_ONCE(rdp->gpwrap))) &&
	    rcu_segcblist_pend_cbs(&rdp->cblist))
		note_gp_changes(rdp);

	if (rcu_segcblist_ready_cbs(&rdp->cblist))
		cbs_ready = true;
	return cbs_ready;
}

/*
 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
 * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
 * caller to set the timeout based on whether or not there are non-lazy
 * callbacks.
 *
 * The caller must have disabled interrupts.
 */
int rcu_needs_cpu(u64 basemono, u64 *nextevt)
{
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	unsigned long dj;

	lockdep_assert_irqs_disabled();

	/* If no callbacks, RCU doesn't need the CPU.
*/ 1551 if (rcu_segcblist_empty(&rdp->cblist)) { 1552 *nextevt = KTIME_MAX; 1553 return 0; 1554 } 1555 1556 /* Attempt to advance callbacks. */ 1557 if (rcu_try_advance_all_cbs()) { 1558 /* Some ready to invoke, so initiate later invocation. */ 1559 invoke_rcu_core(); 1560 return 1; 1561 } 1562 rdp->last_accelerate = jiffies; 1563 1564 /* Request timer delay depending on laziness, and round. */ 1565 rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist); 1566 if (rdp->all_lazy) { 1567 dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; 1568 } else { 1569 dj = round_up(rcu_idle_gp_delay + jiffies, 1570 rcu_idle_gp_delay) - jiffies; 1571 } 1572 *nextevt = basemono + dj * TICK_NSEC; 1573 return 0; 1574 } 1575 1576 /* 1577 * Prepare a CPU for idle from an RCU perspective. The first major task 1578 * is to sense whether nohz mode has been enabled or disabled via sysfs. 1579 * The second major task is to check to see if a non-lazy callback has 1580 * arrived at a CPU that previously had only lazy callbacks. The third 1581 * major task is to accelerate (that is, assign grace-period numbers to) 1582 * any recently arrived callbacks. 1583 * 1584 * The caller must have disabled interrupts. 1585 */ 1586 static void rcu_prepare_for_idle(void) 1587 { 1588 bool needwake; 1589 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 1590 struct rcu_node *rnp; 1591 int tne; 1592 1593 lockdep_assert_irqs_disabled(); 1594 if (rcu_is_nocb_cpu(smp_processor_id())) 1595 return; 1596 1597 /* Handle nohz enablement switches conservatively. */ 1598 tne = READ_ONCE(tick_nohz_active); 1599 if (tne != rdp->tick_nohz_enabled_snap) { 1600 if (!rcu_segcblist_empty(&rdp->cblist)) 1601 invoke_rcu_core(); /* force nohz to see update. */ 1602 rdp->tick_nohz_enabled_snap = tne; 1603 return; 1604 } 1605 if (!tne) 1606 return; 1607 1608 /* 1609 * If a non-lazy callback arrived at a CPU having only lazy 1610 * callbacks, invoke RCU core for the side-effect of recalculating 1611 * idle duration on re-entry to idle. 1612 */ 1613 if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) { 1614 rdp->all_lazy = false; 1615 invoke_rcu_core(); 1616 return; 1617 } 1618 1619 /* 1620 * If we have not yet accelerated this jiffy, accelerate all 1621 * callbacks on this CPU. 1622 */ 1623 if (rdp->last_accelerate == jiffies) 1624 return; 1625 rdp->last_accelerate = jiffies; 1626 if (rcu_segcblist_pend_cbs(&rdp->cblist)) { 1627 rnp = rdp->mynode; 1628 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 1629 needwake = rcu_accelerate_cbs(rnp, rdp); 1630 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ 1631 if (needwake) 1632 rcu_gp_kthread_wake(); 1633 } 1634 } 1635 1636 /* 1637 * Clean up for exit from idle. Attempt to advance callbacks based on 1638 * any grace periods that elapsed while the CPU was idle, and if any 1639 * callbacks are now ready to invoke, initiate invocation. 
1640 */ 1641 static void rcu_cleanup_after_idle(void) 1642 { 1643 lockdep_assert_irqs_disabled(); 1644 if (rcu_is_nocb_cpu(smp_processor_id())) 1645 return; 1646 if (rcu_try_advance_all_cbs()) 1647 invoke_rcu_core(); 1648 } 1649 1650 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1651 1652 #ifdef CONFIG_RCU_FAST_NO_HZ 1653 1654 static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 1655 { 1656 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 1657 1658 sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", 1659 rdp->last_accelerate & 0xffff, jiffies & 0xffff, 1660 ".l"[rdp->all_lazy], 1661 ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], 1662 ".D"[!rdp->tick_nohz_enabled_snap]); 1663 } 1664 1665 #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 1666 1667 static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 1668 { 1669 *cp = '\0'; 1670 } 1671 1672 #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ 1673 1674 /* Initiate the stall-info list. */ 1675 static void print_cpu_stall_info_begin(void) 1676 { 1677 pr_cont("\n"); 1678 } 1679 1680 /* 1681 * Print out diagnostic information for the specified stalled CPU. 1682 * 1683 * If the specified CPU is aware of the current RCU grace period, then 1684 * print the number of scheduling clock interrupts the CPU has taken 1685 * during the time that it has been aware. Otherwise, print the number 1686 * of RCU grace periods that this CPU is ignorant of, for example, "1" 1687 * if the CPU was aware of the previous grace period. 1688 * 1689 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. 1690 */ 1691 static void print_cpu_stall_info(int cpu) 1692 { 1693 unsigned long delta; 1694 char fast_no_hz[72]; 1695 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 1696 char *ticks_title; 1697 unsigned long ticks_value; 1698 1699 /* 1700 * We could be printing a lot while holding a spinlock. Avoid 1701 * triggering hard lockup. 1702 */ 1703 touch_nmi_watchdog(); 1704 1705 ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); 1706 if (ticks_value) { 1707 ticks_title = "GPs behind"; 1708 } else { 1709 ticks_title = "ticks this GP"; 1710 ticks_value = rdp->ticks_this_gp; 1711 } 1712 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1713 delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); 1714 pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", 1715 cpu, 1716 "O."[!!cpu_online(cpu)], 1717 "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], 1718 "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], 1719 !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : 1720 rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : 1721 "!."[!delta], 1722 ticks_value, ticks_title, 1723 rcu_dynticks_snap(rdp) & 0xfff, 1724 rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, 1725 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), 1726 READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, 1727 fast_no_hz); 1728 } 1729 1730 /* Terminate the stall-info list. */ 1731 static void print_cpu_stall_info_end(void) 1732 { 1733 pr_err("\t"); 1734 } 1735 1736 /* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ 1737 static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1738 { 1739 rdp->ticks_this_gp = 0; 1740 rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); 1741 WRITE_ONCE(rdp->last_fqs_resched, jiffies); 1742 } 1743 1744 #ifdef CONFIG_RCU_NOCB_CPU 1745 1746 /* 1747 * Offload callback processing from the boot-time-specified set of CPUs 1748 * specified by rcu_nocb_mask. 
For the CPUs in the set, there are kthreads 1749 * created that pull the callbacks from the corresponding CPU, wait for 1750 * a grace period to elapse, and invoke the callbacks. These kthreads 1751 * are organized into leaders, which manage incoming callbacks, wait for 1752 * grace periods, and awaken followers, and the followers, which only 1753 * invoke callbacks. Each leader is its own follower. The no-CBs CPUs 1754 * do a wake_up() on their kthread when they insert a callback into any 1755 * empty list, unless the rcu_nocb_poll boot parameter has been specified, 1756 * in which case each kthread actively polls its CPU. (Which isn't so great 1757 * for energy efficiency, but which does reduce RCU's overhead on that CPU.) 1758 * 1759 * This is intended to be used in conjunction with Frederic Weisbecker's 1760 * adaptive-idle work, which would seriously reduce OS jitter on CPUs 1761 * running CPU-bound user-mode computations. 1762 * 1763 * Offloading of callbacks can also be used as an energy-efficiency 1764 * measure because CPUs with no RCU callbacks queued are more aggressive 1765 * about entering dyntick-idle mode. 1766 */ 1767 1768 1769 /* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ 1770 static int __init rcu_nocb_setup(char *str) 1771 { 1772 alloc_bootmem_cpumask_var(&rcu_nocb_mask); 1773 cpulist_parse(str, rcu_nocb_mask); 1774 return 1; 1775 } 1776 __setup("rcu_nocbs=", rcu_nocb_setup); 1777 1778 static int __init parse_rcu_nocb_poll(char *arg) 1779 { 1780 rcu_nocb_poll = true; 1781 return 0; 1782 } 1783 early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 1784 1785 /* 1786 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended 1787 * grace period. 1788 */ 1789 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) 1790 { 1791 swake_up_all(sq); 1792 } 1793 1794 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) 1795 { 1796 return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; 1797 } 1798 1799 static void rcu_init_one_nocb(struct rcu_node *rnp) 1800 { 1801 init_swait_queue_head(&rnp->nocb_gp_wq[0]); 1802 init_swait_queue_head(&rnp->nocb_gp_wq[1]); 1803 } 1804 1805 /* Is the specified CPU a no-CBs CPU? */ 1806 bool rcu_is_nocb_cpu(int cpu) 1807 { 1808 if (cpumask_available(rcu_nocb_mask)) 1809 return cpumask_test_cpu(cpu, rcu_nocb_mask); 1810 return false; 1811 } 1812 1813 /* 1814 * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock 1815 * and this function releases it. 1816 */ 1817 static void __wake_nocb_leader(struct rcu_data *rdp, bool force, 1818 unsigned long flags) 1819 __releases(rdp->nocb_lock) 1820 { 1821 struct rcu_data *rdp_leader = rdp->nocb_leader; 1822 1823 lockdep_assert_held(&rdp->nocb_lock); 1824 if (!READ_ONCE(rdp_leader->nocb_kthread)) { 1825 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1826 return; 1827 } 1828 if (rdp_leader->nocb_leader_sleep || force) { 1829 /* Prior smp_mb__after_atomic() orders against prior enqueue. */ 1830 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); 1831 del_timer(&rdp->nocb_timer); 1832 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1833 smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */ 1834 swake_up_one(&rdp_leader->nocb_wq); 1835 } else { 1836 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1837 } 1838 } 1839 1840 /* 1841 * Kick the leader kthread for this NOCB group, but caller has not 1842 * acquired locks. 
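 * This is a thin wrapper: it acquires ->nocb_lock and then invokes
 * __wake_nocb_leader(), which drops that lock before returning.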
1843 */ 1844 static void wake_nocb_leader(struct rcu_data *rdp, bool force) 1845 { 1846 unsigned long flags; 1847 1848 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 1849 __wake_nocb_leader(rdp, force, flags); 1850 } 1851 1852 /* 1853 * Arrange to wake the leader kthread for this NOCB group at some 1854 * future time when it is safe to do so. 1855 */ 1856 static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, 1857 const char *reason) 1858 { 1859 unsigned long flags; 1860 1861 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 1862 if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) 1863 mod_timer(&rdp->nocb_timer, jiffies + 1); 1864 WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); 1865 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); 1866 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1867 } 1868 1869 /* Does rcu_barrier need to queue an RCU callback on the specified CPU? */ 1870 static bool rcu_nocb_cpu_needs_barrier(int cpu) 1871 { 1872 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 1873 unsigned long ret; 1874 #ifdef CONFIG_PROVE_RCU 1875 struct rcu_head *rhp; 1876 #endif /* #ifdef CONFIG_PROVE_RCU */ 1877 1878 /* 1879 * Check count of all no-CBs callbacks awaiting invocation. 1880 * There needs to be a barrier before this function is called, 1881 * but associated with a prior determination that no more 1882 * callbacks would be posted. In the worst case, the first 1883 * barrier in rcu_barrier() suffices (but the caller cannot 1884 * necessarily rely on this, not a substitute for the caller 1885 * getting the concurrency design right!). There must also be a 1886 * barrier between the following load and posting of a callback 1887 * (if a callback is in fact needed). This is associated with an 1888 * atomic_inc() in the caller. 1889 */ 1890 ret = rcu_get_n_cbs_nocb_cpu(rdp); 1891 1892 #ifdef CONFIG_PROVE_RCU 1893 rhp = READ_ONCE(rdp->nocb_head); 1894 if (!rhp) 1895 rhp = READ_ONCE(rdp->nocb_gp_head); 1896 if (!rhp) 1897 rhp = READ_ONCE(rdp->nocb_follower_head); 1898 1899 /* Having no rcuo kthread but CBs after scheduler starts is bad! */ 1900 if (!READ_ONCE(rdp->nocb_kthread) && rhp && 1901 rcu_scheduler_fully_active) { 1902 /* RCU callback enqueued before CPU first came online??? */ 1903 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", 1904 cpu, rhp->func); 1905 WARN_ON_ONCE(1); 1906 } 1907 #endif /* #ifdef CONFIG_PROVE_RCU */ 1908 1909 return !!ret; 1910 } 1911 1912 /* 1913 * Enqueue the specified string of rcu_head structures onto the specified 1914 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the 1915 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy 1916 * counts are supplied by rhcount and rhcount_lazy. 1917 * 1918 * If warranted, also wake up the kthread servicing this CPUs queues. 1919 */ 1920 static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, 1921 struct rcu_head *rhp, 1922 struct rcu_head **rhtp, 1923 int rhcount, int rhcount_lazy, 1924 unsigned long flags) 1925 { 1926 int len; 1927 struct rcu_head **old_rhpp; 1928 struct task_struct *t; 1929 1930 /* Enqueue the callback on the nocb list and update counts. */ 1931 atomic_long_add(rhcount, &rdp->nocb_q_count); 1932 /* rcu_barrier() relies on ->nocb_q_count add before xchg. */ 1933 old_rhpp = xchg(&rdp->nocb_tail, rhtp); 1934 WRITE_ONCE(*old_rhpp, rhp); 1935 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); 1936 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. 
*/ 1937 1938 /* If we are not being polled and there is a kthread, awaken it ... */ 1939 t = READ_ONCE(rdp->nocb_kthread); 1940 if (rcu_nocb_poll || !t) { 1941 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 1942 TPS("WakeNotPoll")); 1943 return; 1944 } 1945 len = rcu_get_n_cbs_nocb_cpu(rdp); 1946 if (old_rhpp == &rdp->nocb_head) { 1947 if (!irqs_disabled_flags(flags)) { 1948 /* ... if queue was empty ... */ 1949 wake_nocb_leader(rdp, false); 1950 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 1951 TPS("WakeEmpty")); 1952 } else { 1953 wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, 1954 TPS("WakeEmptyIsDeferred")); 1955 } 1956 rdp->qlen_last_fqs_check = 0; 1957 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 1958 /* ... or if many callbacks queued. */ 1959 if (!irqs_disabled_flags(flags)) { 1960 wake_nocb_leader(rdp, true); 1961 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 1962 TPS("WakeOvf")); 1963 } else { 1964 wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE, 1965 TPS("WakeOvfIsDeferred")); 1966 } 1967 rdp->qlen_last_fqs_check = LONG_MAX / 2; 1968 } else { 1969 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); 1970 } 1971 return; 1972 } 1973 1974 /* 1975 * This is a helper for __call_rcu(), which invokes this when the normal 1976 * callback queue is inoperable. If this is not a no-CBs CPU, this 1977 * function returns failure back to __call_rcu(), which can complain 1978 * appropriately. 1979 * 1980 * Otherwise, this function queues the callback where the corresponding 1981 * "rcuo" kthread can find it. 1982 */ 1983 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 1984 bool lazy, unsigned long flags) 1985 { 1986 1987 if (!rcu_is_nocb_cpu(rdp->cpu)) 1988 return false; 1989 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); 1990 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 1991 trace_rcu_kfree_callback(rcu_state.name, rhp, 1992 (unsigned long)rhp->func, 1993 -atomic_long_read(&rdp->nocb_q_count_lazy), 1994 -rcu_get_n_cbs_nocb_cpu(rdp)); 1995 else 1996 trace_rcu_callback(rcu_state.name, rhp, 1997 -atomic_long_read(&rdp->nocb_q_count_lazy), 1998 -rcu_get_n_cbs_nocb_cpu(rdp)); 1999 2000 /* 2001 * If called from an extended quiescent state with interrupts 2002 * disabled, invoke the RCU core in order to allow the idle-entry 2003 * deferred-wakeup check to function. 2004 */ 2005 if (irqs_disabled_flags(flags) && 2006 !rcu_is_watching() && 2007 cpu_online(smp_processor_id())) 2008 invoke_rcu_core(); 2009 2010 return true; 2011 } 2012 2013 /* 2014 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is 2015 * not a no-CBs CPU. 2016 */ 2017 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, 2018 struct rcu_data *rdp, 2019 unsigned long flags) 2020 { 2021 lockdep_assert_irqs_disabled(); 2022 if (!rcu_is_nocb_cpu(smp_processor_id())) 2023 return false; /* Not NOCBs CPU, caller must migrate CBs. */ 2024 __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), 2025 rcu_segcblist_tail(&rdp->cblist), 2026 rcu_segcblist_n_cbs(&rdp->cblist), 2027 rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); 2028 rcu_segcblist_init(&rdp->cblist); 2029 rcu_segcblist_disable(&rdp->cblist); 2030 return true; 2031 } 2032 2033 /* 2034 * If necessary, kick off a new grace period, and either way wait 2035 * for a subsequent grace period to complete. 
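 *
 * The wait follows the usual rcu_seq idiom, roughly (illustrative sketch
 * only):
 *
 *	c = rcu_seq_snap(&rcu_state.gp_seq);  (the GP covering new callbacks)
 *	request that grace period via rcu_start_this_gp() unless an earlier
 *	request already covers it, then sleep until
 *	rcu_seq_done(&rnp->gp_seq, c) becomes true.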
2036 */ 2037 static void rcu_nocb_wait_gp(struct rcu_data *rdp) 2038 { 2039 unsigned long c; 2040 bool d; 2041 unsigned long flags; 2042 bool needwake; 2043 struct rcu_node *rnp = rdp->mynode; 2044 2045 local_irq_save(flags); 2046 c = rcu_seq_snap(&rcu_state.gp_seq); 2047 if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { 2048 local_irq_restore(flags); 2049 } else { 2050 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 2051 needwake = rcu_start_this_gp(rnp, rdp, c); 2052 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2053 if (needwake) 2054 rcu_gp_kthread_wake(); 2055 } 2056 2057 /* 2058 * Wait for the grace period. Do so interruptibly to avoid messing 2059 * up the load average. 2060 */ 2061 trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); 2062 for (;;) { 2063 swait_event_interruptible_exclusive( 2064 rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], 2065 (d = rcu_seq_done(&rnp->gp_seq, c))); 2066 if (likely(d)) 2067 break; 2068 WARN_ON(signal_pending(current)); 2069 trace_rcu_this_gp(rnp, rdp, c, TPS("ResumeWait")); 2070 } 2071 trace_rcu_this_gp(rnp, rdp, c, TPS("EndWait")); 2072 smp_mb(); /* Ensure that CB invocation happens after GP end. */ 2073 } 2074 2075 /* 2076 * Leaders come here to wait for additional callbacks to show up. 2077 * This function does not return until callbacks appear. 2078 */ 2079 static void nocb_leader_wait(struct rcu_data *my_rdp) 2080 { 2081 bool firsttime = true; 2082 unsigned long flags; 2083 bool gotcbs; 2084 struct rcu_data *rdp; 2085 struct rcu_head **tail; 2086 2087 wait_again: 2088 2089 /* Wait for callbacks to appear. */ 2090 if (!rcu_nocb_poll) { 2091 trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Sleep")); 2092 swait_event_interruptible_exclusive(my_rdp->nocb_wq, 2093 !READ_ONCE(my_rdp->nocb_leader_sleep)); 2094 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); 2095 my_rdp->nocb_leader_sleep = true; 2096 WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); 2097 del_timer(&my_rdp->nocb_timer); 2098 raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); 2099 } else if (firsttime) { 2100 firsttime = false; /* Don't drown trace log with "Poll"! */ 2101 trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, TPS("Poll")); 2102 } 2103 2104 /* 2105 * Each pass through the following loop checks a follower for CBs. 2106 * We are our own first follower. Any CBs found are moved to 2107 * nocb_gp_head, where they await a grace period. 2108 */ 2109 gotcbs = false; 2110 smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ 2111 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { 2112 rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); 2113 if (!rdp->nocb_gp_head) 2114 continue; /* No CBs here, try next follower. */ 2115 2116 /* Move callbacks to wait-for-GP list, which is empty. */ 2117 WRITE_ONCE(rdp->nocb_head, NULL); 2118 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); 2119 gotcbs = true; 2120 } 2121 2122 /* No callbacks? Sleep a bit if polling, and go retry. */ 2123 if (unlikely(!gotcbs)) { 2124 WARN_ON(signal_pending(current)); 2125 if (rcu_nocb_poll) { 2126 schedule_timeout_interruptible(1); 2127 } else { 2128 trace_rcu_nocb_wake(rcu_state.name, my_rdp->cpu, 2129 TPS("WokeEmpty")); 2130 } 2131 goto wait_again; 2132 } 2133 2134 /* Wait for one grace period. */ 2135 rcu_nocb_wait_gp(my_rdp); 2136 2137 /* Each pass through the following loop wakes a follower, if needed. 
*/ 2138 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { 2139 if (!rcu_nocb_poll && 2140 READ_ONCE(rdp->nocb_head) && 2141 READ_ONCE(my_rdp->nocb_leader_sleep)) { 2142 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); 2143 my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ 2144 raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); 2145 } 2146 if (!rdp->nocb_gp_head) 2147 continue; /* No CBs, so no need to wake follower. */ 2148 2149 /* Append callbacks to follower's "done" list. */ 2150 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 2151 tail = rdp->nocb_follower_tail; 2152 rdp->nocb_follower_tail = rdp->nocb_gp_tail; 2153 *tail = rdp->nocb_gp_head; 2154 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 2155 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 2156 /* List was empty, so wake up the follower. */ 2157 swake_up_one(&rdp->nocb_wq); 2158 } 2159 } 2160 2161 /* If we (the leader) don't have CBs, go wait some more. */ 2162 if (!my_rdp->nocb_follower_head) 2163 goto wait_again; 2164 } 2165 2166 /* 2167 * Followers come here to wait for additional callbacks to show up. 2168 * This function does not return until callbacks appear. 2169 */ 2170 static void nocb_follower_wait(struct rcu_data *rdp) 2171 { 2172 for (;;) { 2173 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FollowerSleep")); 2174 swait_event_interruptible_exclusive(rdp->nocb_wq, 2175 READ_ONCE(rdp->nocb_follower_head)); 2176 if (smp_load_acquire(&rdp->nocb_follower_head)) { 2177 /* ^^^ Ensure CB invocation follows _head test. */ 2178 return; 2179 } 2180 WARN_ON(signal_pending(current)); 2181 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); 2182 } 2183 } 2184 2185 /* 2186 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes 2187 * callbacks queued by the corresponding no-CBs CPU, however, there is 2188 * an optional leader-follower relationship so that the grace-period 2189 * kthreads don't have to do quite so many wakeups. 2190 */ 2191 static int rcu_nocb_kthread(void *arg) 2192 { 2193 int c, cl; 2194 unsigned long flags; 2195 struct rcu_head *list; 2196 struct rcu_head *next; 2197 struct rcu_head **tail; 2198 struct rcu_data *rdp = arg; 2199 2200 /* Each pass through this loop invokes one batch of callbacks */ 2201 for (;;) { 2202 /* Wait for callbacks. */ 2203 if (rdp->nocb_leader == rdp) 2204 nocb_leader_wait(rdp); 2205 else 2206 nocb_follower_wait(rdp); 2207 2208 /* Pull the ready-to-invoke callbacks onto local list. */ 2209 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 2210 list = rdp->nocb_follower_head; 2211 rdp->nocb_follower_head = NULL; 2212 tail = rdp->nocb_follower_tail; 2213 rdp->nocb_follower_tail = &rdp->nocb_follower_head; 2214 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 2215 if (WARN_ON_ONCE(!list)) 2216 continue; 2217 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeNonEmpty")); 2218 2219 /* Each pass through the following loop invokes a callback. */ 2220 trace_rcu_batch_start(rcu_state.name, 2221 atomic_long_read(&rdp->nocb_q_count_lazy), 2222 rcu_get_n_cbs_nocb_cpu(rdp), -1); 2223 c = cl = 0; 2224 while (list) { 2225 next = list->next; 2226 /* Wait for enqueuing to complete, if needed. 
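 * (__call_rcu_nocb_enqueue() advances ->nocb_tail with xchg() before it
 * links the new callback in at the old tail location, so a list handed
 * to this kthread can briefly appear to end one element early.)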
*/ 2227 while (next == NULL && &list->next != tail) { 2228 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 2229 TPS("WaitQueue")); 2230 schedule_timeout_interruptible(1); 2231 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 2232 TPS("WokeQueue")); 2233 next = list->next; 2234 } 2235 debug_rcu_head_unqueue(list); 2236 local_bh_disable(); 2237 if (__rcu_reclaim(rcu_state.name, list)) 2238 cl++; 2239 c++; 2240 local_bh_enable(); 2241 cond_resched_tasks_rcu_qs(); 2242 list = next; 2243 } 2244 trace_rcu_batch_end(rcu_state.name, c, !!list, 0, 0, 1); 2245 smp_mb__before_atomic(); /* _add after CB invocation. */ 2246 atomic_long_add(-c, &rdp->nocb_q_count); 2247 atomic_long_add(-cl, &rdp->nocb_q_count_lazy); 2248 } 2249 return 0; 2250 } 2251 2252 /* Is a deferred wakeup of rcu_nocb_kthread() required? */ 2253 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) 2254 { 2255 return READ_ONCE(rdp->nocb_defer_wakeup); 2256 } 2257 2258 /* Do a deferred wakeup of rcu_nocb_kthread(). */ 2259 static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) 2260 { 2261 unsigned long flags; 2262 int ndw; 2263 2264 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 2265 if (!rcu_nocb_need_deferred_wakeup(rdp)) { 2266 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 2267 return; 2268 } 2269 ndw = READ_ONCE(rdp->nocb_defer_wakeup); 2270 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); 2271 __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); 2272 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); 2273 } 2274 2275 /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ 2276 static void do_nocb_deferred_wakeup_timer(struct timer_list *t) 2277 { 2278 struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); 2279 2280 do_nocb_deferred_wakeup_common(rdp); 2281 } 2282 2283 /* 2284 * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. 2285 * This means we do an inexact common-case check. Note that if 2286 * we miss, ->nocb_timer will eventually clean things up. 
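 * (In the common case ->nocb_defer_wakeup is RCU_NOCB_WAKE_NOT, so this
 * check amounts to a single READ_ONCE() and an untaken branch.)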
2287 */ 2288 static void do_nocb_deferred_wakeup(struct rcu_data *rdp) 2289 { 2290 if (rcu_nocb_need_deferred_wakeup(rdp)) 2291 do_nocb_deferred_wakeup_common(rdp); 2292 } 2293 2294 void __init rcu_init_nohz(void) 2295 { 2296 int cpu; 2297 bool need_rcu_nocb_mask = false; 2298 2299 #if defined(CONFIG_NO_HZ_FULL) 2300 if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) 2301 need_rcu_nocb_mask = true; 2302 #endif /* #if defined(CONFIG_NO_HZ_FULL) */ 2303 2304 if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { 2305 if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { 2306 pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); 2307 return; 2308 } 2309 } 2310 if (!cpumask_available(rcu_nocb_mask)) 2311 return; 2312 2313 #if defined(CONFIG_NO_HZ_FULL) 2314 if (tick_nohz_full_running) 2315 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); 2316 #endif /* #if defined(CONFIG_NO_HZ_FULL) */ 2317 2318 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { 2319 pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); 2320 cpumask_and(rcu_nocb_mask, cpu_possible_mask, 2321 rcu_nocb_mask); 2322 } 2323 if (cpumask_empty(rcu_nocb_mask)) 2324 pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); 2325 else 2326 pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", 2327 cpumask_pr_args(rcu_nocb_mask)); 2328 if (rcu_nocb_poll) 2329 pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); 2330 2331 for_each_cpu(cpu, rcu_nocb_mask) 2332 init_nocb_callback_list(per_cpu_ptr(&rcu_data, cpu)); 2333 rcu_organize_nocb_kthreads(); 2334 } 2335 2336 /* Initialize per-rcu_data variables for no-CBs CPUs. */ 2337 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2338 { 2339 rdp->nocb_tail = &rdp->nocb_head; 2340 init_swait_queue_head(&rdp->nocb_wq); 2341 rdp->nocb_follower_tail = &rdp->nocb_follower_head; 2342 raw_spin_lock_init(&rdp->nocb_lock); 2343 timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); 2344 } 2345 2346 /* 2347 * If the specified CPU is a no-CBs CPU that does not already have its 2348 * rcuo kthread, spawn it. If the CPUs are brought online out of order, 2349 * this can require re-organizing the leader-follower relationships. 2350 */ 2351 static void rcu_spawn_one_nocb_kthread(int cpu) 2352 { 2353 struct rcu_data *rdp; 2354 struct rcu_data *rdp_last; 2355 struct rcu_data *rdp_old_leader; 2356 struct rcu_data *rdp_spawn = per_cpu_ptr(&rcu_data, cpu); 2357 struct task_struct *t; 2358 2359 /* 2360 * If this isn't a no-CBs CPU or if it already has an rcuo kthread, 2361 * then nothing to do. 2362 */ 2363 if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread) 2364 return; 2365 2366 /* If we didn't spawn the leader first, reorganize! */ 2367 rdp_old_leader = rdp_spawn->nocb_leader; 2368 if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) { 2369 rdp_last = NULL; 2370 rdp = rdp_old_leader; 2371 do { 2372 rdp->nocb_leader = rdp_spawn; 2373 if (rdp_last && rdp != rdp_spawn) 2374 rdp_last->nocb_next_follower = rdp; 2375 if (rdp == rdp_spawn) { 2376 rdp = rdp->nocb_next_follower; 2377 } else { 2378 rdp_last = rdp; 2379 rdp = rdp->nocb_next_follower; 2380 rdp_last->nocb_next_follower = NULL; 2381 } 2382 } while (rdp); 2383 rdp_spawn->nocb_next_follower = rdp_old_leader; 2384 } 2385 2386 /* Spawn the kthread for this CPU. 
*/ 2387 t = kthread_run(rcu_nocb_kthread, rdp_spawn, 2388 "rcuo%c/%d", rcu_state.abbr, cpu); 2389 if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo kthread, OOM is now expected behavior\n", __func__)) 2390 return; 2391 WRITE_ONCE(rdp_spawn->nocb_kthread, t); 2392 } 2393 2394 /* 2395 * If the specified CPU is a no-CBs CPU that does not already have its 2396 * rcuo kthread, spawn it. 2397 */ 2398 static void rcu_spawn_cpu_nocb_kthread(int cpu) 2399 { 2400 if (rcu_scheduler_fully_active) 2401 rcu_spawn_one_nocb_kthread(cpu); 2402 } 2403 2404 /* 2405 * Once the scheduler is running, spawn rcuo kthreads for all online 2406 * no-CBs CPUs. This assumes that the early_initcall()s happen before 2407 * non-boot CPUs come online -- if this changes, we will need to add 2408 * some mutual exclusion. 2409 */ 2410 static void __init rcu_spawn_nocb_kthreads(void) 2411 { 2412 int cpu; 2413 2414 for_each_online_cpu(cpu) 2415 rcu_spawn_cpu_nocb_kthread(cpu); 2416 } 2417 2418 /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ 2419 static int rcu_nocb_leader_stride = -1; 2420 module_param(rcu_nocb_leader_stride, int, 0444); 2421 2422 /* 2423 * Initialize leader-follower relationships for all no-CBs CPU. 2424 */ 2425 static void __init rcu_organize_nocb_kthreads(void) 2426 { 2427 int cpu; 2428 int ls = rcu_nocb_leader_stride; 2429 int nl = 0; /* Next leader. */ 2430 struct rcu_data *rdp; 2431 struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ 2432 struct rcu_data *rdp_prev = NULL; 2433 2434 if (!cpumask_available(rcu_nocb_mask)) 2435 return; 2436 if (ls == -1) { 2437 ls = int_sqrt(nr_cpu_ids); 2438 rcu_nocb_leader_stride = ls; 2439 } 2440 2441 /* 2442 * Each pass through this loop sets up one rcu_data structure. 2443 * Should the corresponding CPU come online in the future, then 2444 * we will spawn the needed set of rcu_nocb_kthread() kthreads. 2445 */ 2446 for_each_cpu(cpu, rcu_nocb_mask) { 2447 rdp = per_cpu_ptr(&rcu_data, cpu); 2448 if (rdp->cpu >= nl) { 2449 /* New leader, set up for followers & next leader. */ 2450 nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; 2451 rdp->nocb_leader = rdp; 2452 rdp_leader = rdp; 2453 } else { 2454 /* Another follower, link to previous leader. */ 2455 rdp->nocb_leader = rdp_leader; 2456 rdp_prev->nocb_next_follower = rdp; 2457 } 2458 rdp_prev = rdp; 2459 } 2460 } 2461 2462 /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ 2463 static bool init_nocb_callback_list(struct rcu_data *rdp) 2464 { 2465 if (!rcu_is_nocb_cpu(rdp->cpu)) 2466 return false; 2467 2468 /* If there are early-boot callbacks, move them to nocb lists. */ 2469 if (!rcu_segcblist_empty(&rdp->cblist)) { 2470 rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); 2471 rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); 2472 atomic_long_set(&rdp->nocb_q_count, 2473 rcu_segcblist_n_cbs(&rdp->cblist)); 2474 atomic_long_set(&rdp->nocb_q_count_lazy, 2475 rcu_segcblist_n_lazy_cbs(&rdp->cblist)); 2476 rcu_segcblist_init(&rdp->cblist); 2477 } 2478 rcu_segcblist_disable(&rdp->cblist); 2479 return true; 2480 } 2481 2482 /* 2483 * Bind the current task to the offloaded CPUs. If there are no offloaded 2484 * CPUs, leave the task unbound. Splat if the bind attempt fails. 
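 *
 * A minimal usage sketch (illustrative only; the sort of caller described
 * is an assumption, not taken from this file): a callback-flooding kthread
 * could invoke rcu_bind_current_to_nocb() once at startup so that its
 * calls to call_rcu() are issued from CPUs whose callbacks are offloaded.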
2485 */ 2486 void rcu_bind_current_to_nocb(void) 2487 { 2488 if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) 2489 WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); 2490 } 2491 EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); 2492 2493 /* 2494 * Return the number of RCU callbacks still queued from the specified 2495 * CPU, which must be a nocbs CPU. 2496 */ 2497 static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) 2498 { 2499 return atomic_long_read(&rdp->nocb_q_count); 2500 } 2501 2502 #else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2503 2504 static bool rcu_nocb_cpu_needs_barrier(int cpu) 2505 { 2506 WARN_ON_ONCE(1); /* Should be dead code. */ 2507 return false; 2508 } 2509 2510 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) 2511 { 2512 } 2513 2514 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) 2515 { 2516 return NULL; 2517 } 2518 2519 static void rcu_init_one_nocb(struct rcu_node *rnp) 2520 { 2521 } 2522 2523 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2524 bool lazy, unsigned long flags) 2525 { 2526 return false; 2527 } 2528 2529 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, 2530 struct rcu_data *rdp, 2531 unsigned long flags) 2532 { 2533 return false; 2534 } 2535 2536 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2537 { 2538 } 2539 2540 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) 2541 { 2542 return false; 2543 } 2544 2545 static void do_nocb_deferred_wakeup(struct rcu_data *rdp) 2546 { 2547 } 2548 2549 static void rcu_spawn_cpu_nocb_kthread(int cpu) 2550 { 2551 } 2552 2553 static void __init rcu_spawn_nocb_kthreads(void) 2554 { 2555 } 2556 2557 static bool init_nocb_callback_list(struct rcu_data *rdp) 2558 { 2559 return false; 2560 } 2561 2562 static unsigned long rcu_get_n_cbs_nocb_cpu(struct rcu_data *rdp) 2563 { 2564 return 0; 2565 } 2566 2567 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 2568 2569 /* 2570 * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the 2571 * grace-period kthread will do force_quiescent_state() processing? 2572 * The idea is to avoid waking up RCU core processing on such a 2573 * CPU unless the grace period has extended for too long. 2574 * 2575 * This code relies on the fact that all NO_HZ_FULL CPUs are also 2576 * CONFIG_RCU_NOCB_CPU CPUs. 2577 */ 2578 static bool rcu_nohz_full_cpu(void) 2579 { 2580 #ifdef CONFIG_NO_HZ_FULL 2581 if (tick_nohz_full_cpu(smp_processor_id()) && 2582 (!rcu_gp_in_progress() || 2583 ULONG_CMP_LT(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) 2584 return true; 2585 #endif /* #ifdef CONFIG_NO_HZ_FULL */ 2586 return false; 2587 } 2588 2589 /* 2590 * Bind the RCU grace-period kthreads to the housekeeping CPU. 2591 */ 2592 static void rcu_bind_gp_kthread(void) 2593 { 2594 if (!tick_nohz_full_enabled()) 2595 return; 2596 housekeeping_affine(current, HK_FLAG_RCU); 2597 } 2598 2599 /* Record the current task on dyntick-idle entry. */ 2600 static void rcu_dynticks_task_enter(void) 2601 { 2602 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) 2603 WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); 2604 #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ 2605 } 2606 2607 /* Record no current task on dyntick-idle exit. 
*/ 2608 static void rcu_dynticks_task_exit(void) 2609 { 2610 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) 2611 WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); 2612 #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ 2613 } 2614