1 /* 2 * Read-Copy Update mechanism for mutual exclusion 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, you can access it online at 16 * http://www.gnu.org/licenses/gpl-2.0.html. 17 * 18 * Copyright IBM Corporation, 2008 19 * 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 21 * Manfred Spraul <manfred@colorfullife.com> 22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version 23 * 24 * Based on the original work by Paul McKenney <paulmck@us.ibm.com> 25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 26 * 27 * For detailed explanation of Read-Copy Update mechanism see - 28 * Documentation/RCU 29 */ 30 #include <linux/types.h> 31 #include <linux/kernel.h> 32 #include <linux/init.h> 33 #include <linux/spinlock.h> 34 #include <linux/smp.h> 35 #include <linux/rcupdate_wait.h> 36 #include <linux/interrupt.h> 37 #include <linux/sched.h> 38 #include <linux/sched/debug.h> 39 #include <linux/nmi.h> 40 #include <linux/atomic.h> 41 #include <linux/bitops.h> 42 #include <linux/export.h> 43 #include <linux/completion.h> 44 #include <linux/moduleparam.h> 45 #include <linux/percpu.h> 46 #include <linux/notifier.h> 47 #include <linux/cpu.h> 48 #include <linux/mutex.h> 49 #include <linux/time.h> 50 #include <linux/kernel_stat.h> 51 #include <linux/wait.h> 52 #include <linux/kthread.h> 53 #include <uapi/linux/sched/types.h> 54 #include <linux/prefetch.h> 55 #include <linux/delay.h> 56 #include <linux/stop_machine.h> 57 #include <linux/random.h> 58 #include <linux/trace_events.h> 59 #include <linux/suspend.h> 60 #include <linux/ftrace.h> 61 62 #include "tree.h" 63 #include "rcu.h" 64 65 #ifdef MODULE_PARAM_PREFIX 66 #undef MODULE_PARAM_PREFIX 67 #endif 68 #define MODULE_PARAM_PREFIX "rcutree." 69 70 /* Data structures. */ 71 72 /* 73 * In order to export the rcu_state name to the tracing tools, it 74 * needs to be added in the __tracepoint_string section. 75 * This requires defining a separate variable tp_<sname>_varname 76 * that points to the string being used, and this will allow 77 * the tracing userspace tools to be able to decipher the string 78 * address to the matching string. 
79 */ 80 #ifdef CONFIG_TRACING 81 # define DEFINE_RCU_TPS(sname) \ 82 static char sname##_varname[] = #sname; \ 83 static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; 84 # define RCU_STATE_NAME(sname) sname##_varname 85 #else 86 # define DEFINE_RCU_TPS(sname) 87 # define RCU_STATE_NAME(sname) __stringify(sname) 88 #endif 89 90 #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ 91 DEFINE_RCU_TPS(sname) \ 92 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ 93 struct rcu_state sname##_state = { \ 94 .level = { &sname##_state.node[0] }, \ 95 .rda = &sname##_data, \ 96 .call = cr, \ 97 .gp_state = RCU_GP_IDLE, \ 98 .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, \ 99 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 100 .name = RCU_STATE_NAME(sname), \ 101 .abbr = sabbr, \ 102 .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ 103 .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ 104 .ofl_lock = __SPIN_LOCK_UNLOCKED(sname##_state.ofl_lock), \ 105 } 106 107 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); 108 RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 109 110 static struct rcu_state *const rcu_state_p; 111 LIST_HEAD(rcu_struct_flavors); 112 113 /* Dump rcu_node combining tree at boot to verify correct setup. */ 114 static bool dump_tree; 115 module_param(dump_tree, bool, 0444); 116 /* Control rcu_node-tree auto-balancing at boot time. */ 117 static bool rcu_fanout_exact; 118 module_param(rcu_fanout_exact, bool, 0444); 119 /* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */ 120 static int rcu_fanout_leaf = RCU_FANOUT_LEAF; 121 module_param(rcu_fanout_leaf, int, 0444); 122 int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; 123 /* Number of rcu_nodes at specified level. */ 124 int num_rcu_lvl[] = NUM_RCU_LVL_INIT; 125 int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 126 /* panic() on RCU Stall sysctl. */ 127 int sysctl_panic_on_rcu_stall __read_mostly; 128 129 /* 130 * The rcu_scheduler_active variable is initialized to the value 131 * RCU_SCHEDULER_INACTIVE and transitions RCU_SCHEDULER_INIT just before the 132 * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE, 133 * RCU can assume that there is but one task, allowing RCU to (for example) 134 * optimize synchronize_rcu() to a simple barrier(). When this variable 135 * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required 136 * to detect real grace periods. This variable is also used to suppress 137 * boot-time false positives from lockdep-RCU error checking. Finally, it 138 * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU 139 * is fully initialized, including all of its kthreads having been spawned. 140 */ 141 int rcu_scheduler_active __read_mostly; 142 EXPORT_SYMBOL_GPL(rcu_scheduler_active); 143 144 /* 145 * The rcu_scheduler_fully_active variable transitions from zero to one 146 * during the early_initcall() processing, which is after the scheduler 147 * is capable of creating new tasks. So RCU processing (for example, 148 * creating tasks for RCU priority boosting) must be delayed until after 149 * rcu_scheduler_fully_active transitions from zero to one. We also 150 * currently delay invocation of any RCU callbacks until after this point. 151 * 152 * It might later prove better for people registering RCU callbacks during 153 * early boot to take responsibility for these callbacks, but one step at 154 * a time. 
155 */ 156 static int rcu_scheduler_fully_active __read_mostly; 157 158 static void 159 rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, 160 struct rcu_node *rnp, unsigned long gps, unsigned long flags); 161 static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); 162 static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); 163 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 164 static void invoke_rcu_core(void); 165 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 166 static void rcu_report_exp_rdp(struct rcu_state *rsp, 167 struct rcu_data *rdp, bool wake); 168 static void sync_sched_exp_online_cleanup(int cpu); 169 170 /* rcuc/rcub kthread realtime priority */ 171 static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; 172 module_param(kthread_prio, int, 0644); 173 174 /* Delay in jiffies for grace-period initialization delays, debug only. */ 175 176 static int gp_preinit_delay; 177 module_param(gp_preinit_delay, int, 0444); 178 static int gp_init_delay; 179 module_param(gp_init_delay, int, 0444); 180 static int gp_cleanup_delay; 181 module_param(gp_cleanup_delay, int, 0444); 182 183 /* 184 * Number of grace periods between delays, normalized by the duration of 185 * the delay. The longer the delay, the more the grace periods between 186 * each delay. The reason for this normalization is that it means that, 187 * for non-zero delays, the overall slowdown of grace periods is constant 188 * regardless of the duration of the delay. This arrangement balances 189 * the need for long delays to increase some race probabilities with the 190 * need for fast grace periods to increase other race probabilities. 191 */ 192 #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */ 193 194 /* 195 * Compute the mask of online CPUs for the specified rcu_node structure. 196 * This will not be stable unless the rcu_node structure's ->lock is 197 * held, but the bit corresponding to the current CPU will be stable 198 * in most contexts. 199 */ 200 unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) 201 { 202 return READ_ONCE(rnp->qsmaskinitnext); 203 } 204 205 /* 206 * Return true if an RCU grace period is in progress. The READ_ONCE()s 207 * permit this function to be invoked without holding the root rcu_node 208 * structure's ->lock, but of course results can be subject to change. 209 */ 210 static int rcu_gp_in_progress(struct rcu_state *rsp) 211 { 212 return rcu_seq_state(rcu_seq_current(&rsp->gp_seq)); 213 } 214 215 /* 216 * Note a quiescent state. Because we do not need to know 217 * how many quiescent states passed, just if there was at least 218 * one since the start of the grace period, this just sets a flag. 219 * The caller must have disabled preemption. 
220 */ 221 void rcu_sched_qs(void) 222 { 223 RCU_LOCKDEP_WARN(preemptible(), "rcu_sched_qs() invoked with preemption enabled!!!"); 224 if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) 225 return; 226 trace_rcu_grace_period(TPS("rcu_sched"), 227 __this_cpu_read(rcu_sched_data.gp_seq), 228 TPS("cpuqs")); 229 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); 230 if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) 231 return; 232 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); 233 rcu_report_exp_rdp(&rcu_sched_state, 234 this_cpu_ptr(&rcu_sched_data), true); 235 } 236 237 void rcu_bh_qs(void) 238 { 239 RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); 240 if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { 241 trace_rcu_grace_period(TPS("rcu_bh"), 242 __this_cpu_read(rcu_bh_data.gp_seq), 243 TPS("cpuqs")); 244 __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); 245 } 246 } 247 248 /* 249 * Steal a bit from the bottom of ->dynticks for idle entry/exit 250 * control. Initially this is for TLB flushing. 251 */ 252 #define RCU_DYNTICK_CTRL_MASK 0x1 253 #define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) 254 #ifndef rcu_eqs_special_exit 255 #define rcu_eqs_special_exit() do { } while (0) 256 #endif 257 258 static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 259 .dynticks_nesting = 1, 260 .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, 261 .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), 262 }; 263 264 /* 265 * Record entry into an extended quiescent state. This is only to be 266 * called when not already in an extended quiescent state. 267 */ 268 static void rcu_dynticks_eqs_enter(void) 269 { 270 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 271 int seq; 272 273 /* 274 * CPUs seeing atomic_add_return() must see prior RCU read-side 275 * critical sections, and we also must force ordering with the 276 * next idle sojourn. 277 */ 278 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); 279 /* Better be in an extended quiescent state! */ 280 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 281 (seq & RCU_DYNTICK_CTRL_CTR)); 282 /* Better not have special action (TLB flush) pending! */ 283 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 284 (seq & RCU_DYNTICK_CTRL_MASK)); 285 } 286 287 /* 288 * Record exit from an extended quiescent state. This is only to be 289 * called from an extended quiescent state. 290 */ 291 static void rcu_dynticks_eqs_exit(void) 292 { 293 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 294 int seq; 295 296 /* 297 * CPUs seeing atomic_add_return() must see prior idle sojourns, 298 * and we also must force ordering with the next RCU read-side 299 * critical section. 300 */ 301 seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); 302 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 303 !(seq & RCU_DYNTICK_CTRL_CTR)); 304 if (seq & RCU_DYNTICK_CTRL_MASK) { 305 atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks); 306 smp_mb__after_atomic(); /* _exit after clearing mask. */ 307 /* Prefer duplicate flushes to losing a flush. */ 308 rcu_eqs_special_exit(); 309 } 310 } 311 312 /* 313 * Reset the current CPU's ->dynticks counter to indicate that the 314 * newly onlined CPU is no longer in an extended quiescent state. 315 * This will either leave the counter unchanged, or increment it 316 * to the next non-quiescent value. 
317 * 318 * The non-atomic test/increment sequence works because the upper bits 319 * of the ->dynticks counter are manipulated only by the corresponding CPU, 320 * or when the corresponding CPU is offline. 321 */ 322 static void rcu_dynticks_eqs_online(void) 323 { 324 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 325 326 if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR) 327 return; 328 atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); 329 } 330 331 /* 332 * Is the current CPU in an extended quiescent state? 333 * 334 * No ordering, as we are sampling CPU-local information. 335 */ 336 bool rcu_dynticks_curr_cpu_in_eqs(void) 337 { 338 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 339 340 return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR); 341 } 342 343 /* 344 * Snapshot the ->dynticks counter with full ordering so as to allow 345 * stable comparison of this counter with past and future snapshots. 346 */ 347 int rcu_dynticks_snap(struct rcu_dynticks *rdtp) 348 { 349 int snap = atomic_add_return(0, &rdtp->dynticks); 350 351 return snap & ~RCU_DYNTICK_CTRL_MASK; 352 } 353 354 /* 355 * Return true if the snapshot returned from rcu_dynticks_snap() 356 * indicates that RCU is in an extended quiescent state. 357 */ 358 static bool rcu_dynticks_in_eqs(int snap) 359 { 360 return !(snap & RCU_DYNTICK_CTRL_CTR); 361 } 362 363 /* 364 * Return true if the CPU corresponding to the specified rcu_dynticks 365 * structure has spent some time in an extended quiescent state since 366 * rcu_dynticks_snap() returned the specified snapshot. 367 */ 368 static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) 369 { 370 return snap != rcu_dynticks_snap(rdtp); 371 } 372 373 /* 374 * Do a double-increment of the ->dynticks counter to emulate a 375 * momentary idle-CPU quiescent state. 376 */ 377 static void rcu_dynticks_momentary_idle(void) 378 { 379 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 380 int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, 381 &rdtp->dynticks); 382 383 /* It is illegal to call this from idle state. */ 384 WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); 385 } 386 387 /* 388 * Set the special (bottom) bit of the specified CPU so that it 389 * will take special action (such as flushing its TLB) on the 390 * next exit from an extended quiescent state. Returns true if 391 * the bit was successfully set, or false if the CPU was not in 392 * an extended quiescent state. 393 */ 394 bool rcu_eqs_special_set(int cpu) 395 { 396 int old; 397 int new; 398 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 399 400 do { 401 old = atomic_read(&rdtp->dynticks); 402 if (old & RCU_DYNTICK_CTRL_CTR) 403 return false; 404 new = old | RCU_DYNTICK_CTRL_MASK; 405 } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old); 406 return true; 407 } 408 409 /* 410 * Let the RCU core know that this CPU has gone through the scheduler, 411 * which is a quiescent state. This is called when the need for a 412 * quiescent state is urgent, so we burn an atomic operation and full 413 * memory barriers to let the RCU core know about it, regardless of what 414 * this CPU might (or might not) do in the near future. 415 * 416 * We inform the RCU core by emulating a zero-duration dyntick-idle period. 417 * 418 * The caller must have disabled interrupts. 
419 */ 420 static void rcu_momentary_dyntick_idle(void) 421 { 422 raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); 423 rcu_dynticks_momentary_idle(); 424 } 425 426 /* 427 * Note a context switch. This is a quiescent state for RCU-sched, 428 * and requires special handling for preemptible RCU. 429 * The caller must have disabled interrupts. 430 */ 431 void rcu_note_context_switch(bool preempt) 432 { 433 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 434 trace_rcu_utilization(TPS("Start context switch")); 435 rcu_sched_qs(); 436 rcu_preempt_note_context_switch(preempt); 437 /* Load rcu_urgent_qs before other flags. */ 438 if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) 439 goto out; 440 this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); 441 if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) 442 rcu_momentary_dyntick_idle(); 443 this_cpu_inc(rcu_dynticks.rcu_qs_ctr); 444 if (!preempt) 445 rcu_note_voluntary_context_switch_lite(current); 446 out: 447 trace_rcu_utilization(TPS("End context switch")); 448 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 449 } 450 EXPORT_SYMBOL_GPL(rcu_note_context_switch); 451 452 /* 453 * Register a quiescent state for all RCU flavors. If there is an 454 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight 455 * dyntick-idle quiescent state visible to other CPUs (but only for those 456 * RCU flavors in desperate need of a quiescent state, which will normally 457 * be none of them). Either way, do a lightweight quiescent state for 458 * all RCU flavors. 459 * 460 * The barrier() calls are redundant in the common case when this is 461 * called externally, but just in case this is called from within this 462 * file. 463 * 464 */ 465 void rcu_all_qs(void) 466 { 467 unsigned long flags; 468 469 if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs)) 470 return; 471 preempt_disable(); 472 /* Load rcu_urgent_qs before other flags. */ 473 if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) { 474 preempt_enable(); 475 return; 476 } 477 this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); 478 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 479 if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) { 480 local_irq_save(flags); 481 rcu_momentary_dyntick_idle(); 482 local_irq_restore(flags); 483 } 484 if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) 485 rcu_sched_qs(); 486 this_cpu_inc(rcu_dynticks.rcu_qs_ctr); 487 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 488 preempt_enable(); 489 } 490 EXPORT_SYMBOL_GPL(rcu_all_qs); 491 492 #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ 493 static long blimit = DEFAULT_RCU_BLIMIT; 494 #define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ 495 static long qhimark = DEFAULT_RCU_QHIMARK; 496 #define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. 
*/ 497 static long qlowmark = DEFAULT_RCU_QLOMARK; 498 499 module_param(blimit, long, 0444); 500 module_param(qhimark, long, 0444); 501 module_param(qlowmark, long, 0444); 502 503 static ulong jiffies_till_first_fqs = ULONG_MAX; 504 static ulong jiffies_till_next_fqs = ULONG_MAX; 505 static bool rcu_kick_kthreads; 506 507 module_param(jiffies_till_first_fqs, ulong, 0644); 508 module_param(jiffies_till_next_fqs, ulong, 0644); 509 module_param(rcu_kick_kthreads, bool, 0644); 510 511 /* 512 * How long the grace period must be before we start recruiting 513 * quiescent-state help from rcu_note_context_switch(). 514 */ 515 static ulong jiffies_till_sched_qs = HZ / 10; 516 module_param(jiffies_till_sched_qs, ulong, 0444); 517 518 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)); 519 static void force_quiescent_state(struct rcu_state *rsp); 520 static int rcu_pending(void); 521 522 /* 523 * Return the number of RCU GPs completed thus far for debug & stats. 524 */ 525 unsigned long rcu_get_gp_seq(void) 526 { 527 return READ_ONCE(rcu_state_p->gp_seq); 528 } 529 EXPORT_SYMBOL_GPL(rcu_get_gp_seq); 530 531 /* 532 * Return the number of RCU-sched GPs completed thus far for debug & stats. 533 */ 534 unsigned long rcu_sched_get_gp_seq(void) 535 { 536 return READ_ONCE(rcu_sched_state.gp_seq); 537 } 538 EXPORT_SYMBOL_GPL(rcu_sched_get_gp_seq); 539 540 /* 541 * Return the number of RCU-bh GPs completed thus far for debug & stats. 542 */ 543 unsigned long rcu_bh_get_gp_seq(void) 544 { 545 return READ_ONCE(rcu_bh_state.gp_seq); 546 } 547 EXPORT_SYMBOL_GPL(rcu_bh_get_gp_seq); 548 549 /* 550 * Return the number of RCU expedited batches completed thus far for 551 * debug & stats. Odd numbers mean that a batch is in progress, even 552 * numbers mean idle. The value returned will thus be roughly double 553 * the cumulative batches since boot. 554 */ 555 unsigned long rcu_exp_batches_completed(void) 556 { 557 return rcu_state_p->expedited_sequence; 558 } 559 EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); 560 561 /* 562 * Return the number of RCU-sched expedited batches completed thus far 563 * for debug & stats. Similar to rcu_exp_batches_completed(). 564 */ 565 unsigned long rcu_exp_batches_completed_sched(void) 566 { 567 return rcu_sched_state.expedited_sequence; 568 } 569 EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); 570 571 /* 572 * Force a quiescent state. 573 */ 574 void rcu_force_quiescent_state(void) 575 { 576 force_quiescent_state(rcu_state_p); 577 } 578 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 579 580 /* 581 * Force a quiescent state for RCU BH. 582 */ 583 void rcu_bh_force_quiescent_state(void) 584 { 585 force_quiescent_state(&rcu_bh_state); 586 } 587 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 588 589 /* 590 * Force a quiescent state for RCU-sched. 591 */ 592 void rcu_sched_force_quiescent_state(void) 593 { 594 force_quiescent_state(&rcu_sched_state); 595 } 596 EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); 597 598 /* 599 * Show the state of the grace-period kthreads. 600 */ 601 void show_rcu_gp_kthreads(void) 602 { 603 struct rcu_state *rsp; 604 605 for_each_rcu_flavor(rsp) { 606 pr_info("%s: wait state: %d ->state: %#lx\n", 607 rsp->name, rsp->gp_state, rsp->gp_kthread->state); 608 /* sched_show_task(rsp->gp_kthread); */ 609 } 610 } 611 EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); 612 613 /* 614 * Send along grace-period-related data for rcutorture diagnostics. 
615 */ 616 void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, 617 unsigned long *gp_seq) 618 { 619 struct rcu_state *rsp = NULL; 620 621 switch (test_type) { 622 case RCU_FLAVOR: 623 rsp = rcu_state_p; 624 break; 625 case RCU_BH_FLAVOR: 626 rsp = &rcu_bh_state; 627 break; 628 case RCU_SCHED_FLAVOR: 629 rsp = &rcu_sched_state; 630 break; 631 default: 632 break; 633 } 634 if (rsp == NULL) 635 return; 636 *flags = READ_ONCE(rsp->gp_flags); 637 *gp_seq = rcu_seq_current(&rsp->gp_seq); 638 } 639 EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); 640 641 /* 642 * Return the root node of the specified rcu_state structure. 643 */ 644 static struct rcu_node *rcu_get_root(struct rcu_state *rsp) 645 { 646 return &rsp->node[0]; 647 } 648 649 /* 650 * Enter an RCU extended quiescent state, which can be either the 651 * idle loop or adaptive-tickless usermode execution. 652 * 653 * We crowbar the ->dynticks_nmi_nesting field to zero to allow for 654 * the possibility of usermode upcalls having messed up our count 655 * of interrupt nesting level during the prior busy period. 656 */ 657 static void rcu_eqs_enter(bool user) 658 { 659 struct rcu_state *rsp; 660 struct rcu_data *rdp; 661 struct rcu_dynticks *rdtp; 662 663 rdtp = this_cpu_ptr(&rcu_dynticks); 664 WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); 665 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 666 rdtp->dynticks_nesting == 0); 667 if (rdtp->dynticks_nesting != 1) { 668 rdtp->dynticks_nesting--; 669 return; 670 } 671 672 lockdep_assert_irqs_disabled(); 673 trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks); 674 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); 675 for_each_rcu_flavor(rsp) { 676 rdp = this_cpu_ptr(rsp->rda); 677 do_nocb_deferred_wakeup(rdp); 678 } 679 rcu_prepare_for_idle(); 680 WRITE_ONCE(rdtp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ 681 rcu_dynticks_eqs_enter(); 682 rcu_dynticks_task_enter(); 683 } 684 685 /** 686 * rcu_idle_enter - inform RCU that current CPU is entering idle 687 * 688 * Enter idle mode, in other words, -leave- the mode in which RCU 689 * read-side critical sections can occur. (Though RCU read-side 690 * critical sections can occur in irq handlers in idle, a possibility 691 * handled by irq_enter() and irq_exit().) 692 * 693 * If you add or remove a call to rcu_idle_enter(), be sure to test with 694 * CONFIG_RCU_EQS_DEBUG=y. 695 */ 696 void rcu_idle_enter(void) 697 { 698 lockdep_assert_irqs_disabled(); 699 rcu_eqs_enter(false); 700 } 701 702 #ifdef CONFIG_NO_HZ_FULL 703 /** 704 * rcu_user_enter - inform RCU that we are resuming userspace. 705 * 706 * Enter RCU idle mode right before resuming userspace. No use of RCU 707 * is permitted between this call and rcu_user_exit(). This way the 708 * CPU doesn't need to maintain the tick for RCU maintenance purposes 709 * when the CPU runs in userspace. 710 * 711 * If you add or remove a call to rcu_user_enter(), be sure to test with 712 * CONFIG_RCU_EQS_DEBUG=y. 713 */ 714 void rcu_user_enter(void) 715 { 716 lockdep_assert_irqs_disabled(); 717 rcu_eqs_enter(true); 718 } 719 #endif /* CONFIG_NO_HZ_FULL */ 720 721 /** 722 * rcu_nmi_exit - inform RCU of exit from NMI context 723 * 724 * If we are returning from the outermost NMI handler that interrupted an 725 * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting 726 * to let the RCU grace-period handling know that the CPU is back to 727 * being RCU-idle. 
728 * 729 * If you add or remove a call to rcu_nmi_exit(), be sure to test 730 * with CONFIG_RCU_EQS_DEBUG=y. 731 */ 732 void rcu_nmi_exit(void) 733 { 734 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 735 736 /* 737 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. 738 * (We are exiting an NMI handler, so RCU better be paying attention 739 * to us!) 740 */ 741 WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); 742 WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); 743 744 /* 745 * If the nesting level is not 1, the CPU wasn't RCU-idle, so 746 * leave it in non-RCU-idle state. 747 */ 748 if (rdtp->dynticks_nmi_nesting != 1) { 749 trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, rdtp->dynticks_nmi_nesting - 2, rdtp->dynticks); 750 WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ 751 rdtp->dynticks_nmi_nesting - 2); 752 return; 753 } 754 755 /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ 756 trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0, rdtp->dynticks); 757 WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ 758 rcu_dynticks_eqs_enter(); 759 } 760 761 /** 762 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 763 * 764 * Exit from an interrupt handler, which might possibly result in entering 765 * idle mode, in other words, leaving the mode in which read-side critical 766 * sections can occur. The caller must have disabled interrupts. 767 * 768 * This code assumes that the idle loop never does anything that might 769 * result in unbalanced calls to irq_enter() and irq_exit(). If your 770 * architecture's idle loop violates this assumption, RCU will give you what 771 * you deserve, good and hard. But very infrequently and irreproducibly. 772 * 773 * Use things like work queues to work around this limitation. 774 * 775 * You have been warned. 776 * 777 * If you add or remove a call to rcu_irq_exit(), be sure to test with 778 * CONFIG_RCU_EQS_DEBUG=y. 779 */ 780 void rcu_irq_exit(void) 781 { 782 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 783 784 lockdep_assert_irqs_disabled(); 785 if (rdtp->dynticks_nmi_nesting == 1) 786 rcu_prepare_for_idle(); 787 rcu_nmi_exit(); 788 if (rdtp->dynticks_nmi_nesting == 0) 789 rcu_dynticks_task_enter(); 790 } 791 792 /* 793 * Wrapper for rcu_irq_exit() where interrupts are enabled. 794 * 795 * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test 796 * with CONFIG_RCU_EQS_DEBUG=y. 797 */ 798 void rcu_irq_exit_irqson(void) 799 { 800 unsigned long flags; 801 802 local_irq_save(flags); 803 rcu_irq_exit(); 804 local_irq_restore(flags); 805 } 806 807 /* 808 * Exit an RCU extended quiescent state, which can be either the 809 * idle loop or adaptive-tickless usermode execution. 810 * 811 * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to 812 * allow for the possibility of usermode upcalls messing up our count of 813 * interrupt nesting level during the busy period that is just now starting. 
814 */ 815 static void rcu_eqs_exit(bool user) 816 { 817 struct rcu_dynticks *rdtp; 818 long oldval; 819 820 lockdep_assert_irqs_disabled(); 821 rdtp = this_cpu_ptr(&rcu_dynticks); 822 oldval = rdtp->dynticks_nesting; 823 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); 824 if (oldval) { 825 rdtp->dynticks_nesting++; 826 return; 827 } 828 rcu_dynticks_task_exit(); 829 rcu_dynticks_eqs_exit(); 830 rcu_cleanup_after_idle(); 831 trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, rdtp->dynticks); 832 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); 833 WRITE_ONCE(rdtp->dynticks_nesting, 1); 834 WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); 835 } 836 837 /** 838 * rcu_idle_exit - inform RCU that current CPU is leaving idle 839 * 840 * Exit idle mode, in other words, -enter- the mode in which RCU 841 * read-side critical sections can occur. 842 * 843 * If you add or remove a call to rcu_idle_exit(), be sure to test with 844 * CONFIG_RCU_EQS_DEBUG=y. 845 */ 846 void rcu_idle_exit(void) 847 { 848 unsigned long flags; 849 850 local_irq_save(flags); 851 rcu_eqs_exit(false); 852 local_irq_restore(flags); 853 } 854 855 #ifdef CONFIG_NO_HZ_FULL 856 /** 857 * rcu_user_exit - inform RCU that we are exiting userspace. 858 * 859 * Exit RCU idle mode while entering the kernel because it can 860 * run a RCU read side critical section anytime. 861 * 862 * If you add or remove a call to rcu_user_exit(), be sure to test with 863 * CONFIG_RCU_EQS_DEBUG=y. 864 */ 865 void rcu_user_exit(void) 866 { 867 rcu_eqs_exit(1); 868 } 869 #endif /* CONFIG_NO_HZ_FULL */ 870 871 /** 872 * rcu_nmi_enter - inform RCU of entry to NMI context 873 * 874 * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and 875 * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know 876 * that the CPU is active. This implementation permits nested NMIs, as 877 * long as the nesting level does not overflow an int. (You will probably 878 * run out of stack space first.) 879 * 880 * If you add or remove a call to rcu_nmi_enter(), be sure to test 881 * with CONFIG_RCU_EQS_DEBUG=y. 882 */ 883 void rcu_nmi_enter(void) 884 { 885 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 886 long incby = 2; 887 888 /* Complain about underflow. */ 889 WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); 890 891 /* 892 * If idle from RCU viewpoint, atomically increment ->dynticks 893 * to mark non-idle and increment ->dynticks_nmi_nesting by one. 894 * Otherwise, increment ->dynticks_nmi_nesting by two. This means 895 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed 896 * to be in the outermost NMI handler that interrupted an RCU-idle 897 * period (observation due to Andy Lutomirski). 898 */ 899 if (rcu_dynticks_curr_cpu_in_eqs()) { 900 rcu_dynticks_eqs_exit(); 901 incby = 1; 902 } 903 trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), 904 rdtp->dynticks_nmi_nesting, 905 rdtp->dynticks_nmi_nesting + incby, rdtp->dynticks); 906 WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ 907 rdtp->dynticks_nmi_nesting + incby); 908 barrier(); 909 } 910 911 /** 912 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle 913 * 914 * Enter an interrupt handler, which might possibly result in exiting 915 * idle mode, in other words, entering the mode in which read-side critical 916 * sections can occur. The caller must have disabled interrupts. 
917 * 918 * Note that the Linux kernel is fully capable of entering an interrupt 919 * handler that it never exits, for example when doing upcalls to user mode! 920 * This code assumes that the idle loop never does upcalls to user mode. 921 * If your architecture's idle loop does do upcalls to user mode (or does 922 * anything else that results in unbalanced calls to the irq_enter() and 923 * irq_exit() functions), RCU will give you what you deserve, good and hard. 924 * But very infrequently and irreproducibly. 925 * 926 * Use things like work queues to work around this limitation. 927 * 928 * You have been warned. 929 * 930 * If you add or remove a call to rcu_irq_enter(), be sure to test with 931 * CONFIG_RCU_EQS_DEBUG=y. 932 */ 933 void rcu_irq_enter(void) 934 { 935 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 936 937 lockdep_assert_irqs_disabled(); 938 if (rdtp->dynticks_nmi_nesting == 0) 939 rcu_dynticks_task_exit(); 940 rcu_nmi_enter(); 941 if (rdtp->dynticks_nmi_nesting == 1) 942 rcu_cleanup_after_idle(); 943 } 944 945 /* 946 * Wrapper for rcu_irq_enter() where interrupts are enabled. 947 * 948 * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test 949 * with CONFIG_RCU_EQS_DEBUG=y. 950 */ 951 void rcu_irq_enter_irqson(void) 952 { 953 unsigned long flags; 954 955 local_irq_save(flags); 956 rcu_irq_enter(); 957 local_irq_restore(flags); 958 } 959 960 /** 961 * rcu_is_watching - see if RCU thinks that the current CPU is idle 962 * 963 * Return true if RCU is watching the running CPU, which means that this 964 * CPU can safely enter RCU read-side critical sections. In other words, 965 * if the current CPU is in its idle loop and is neither in an interrupt 966 * or NMI handler, return true. 967 */ 968 bool notrace rcu_is_watching(void) 969 { 970 bool ret; 971 972 preempt_disable_notrace(); 973 ret = !rcu_dynticks_curr_cpu_in_eqs(); 974 preempt_enable_notrace(); 975 return ret; 976 } 977 EXPORT_SYMBOL_GPL(rcu_is_watching); 978 979 /* 980 * If a holdout task is actually running, request an urgent quiescent 981 * state from its CPU. This is unsynchronized, so migrations can cause 982 * the request to go to the wrong CPU. Which is OK, all that will happen 983 * is that the CPU's next context switch will be a bit slower and next 984 * time around this task will generate another request. 985 */ 986 void rcu_request_urgent_qs_task(struct task_struct *t) 987 { 988 int cpu; 989 990 barrier(); 991 cpu = task_cpu(t); 992 if (!task_curr(t)) 993 return; /* This task is not running on that CPU. */ 994 smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true); 995 } 996 997 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 998 999 /* 1000 * Is the current CPU online as far as RCU is concerned? 1001 * 1002 * Disable preemption to avoid false positives that could otherwise 1003 * happen due to the current CPU number being sampled, this task being 1004 * preempted, its old CPU being taken offline, resuming on some other CPU, 1005 * then determining that its old CPU is now offline. Because there are 1006 * multiple flavors of RCU, and because this function can be called in the 1007 * midst of updating the flavors while a given CPU coming online or going 1008 * offline, it is necessary to check all flavors. If any of the flavors 1009 * believe that given CPU is online, it is considered to be online. 1010 * 1011 * Disable checking if in an NMI handler because we cannot safely 1012 * report errors from NMI handlers anyway. 
In addition, it is OK to use 1013 * RCU on an offline processor during initial boot, hence the check for 1014 * rcu_scheduler_fully_active. 1015 */ 1016 bool rcu_lockdep_current_cpu_online(void) 1017 { 1018 struct rcu_data *rdp; 1019 struct rcu_node *rnp; 1020 struct rcu_state *rsp; 1021 1022 if (in_nmi() || !rcu_scheduler_fully_active) 1023 return true; 1024 preempt_disable(); 1025 for_each_rcu_flavor(rsp) { 1026 rdp = this_cpu_ptr(rsp->rda); 1027 rnp = rdp->mynode; 1028 if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) { 1029 preempt_enable(); 1030 return true; 1031 } 1032 } 1033 preempt_enable(); 1034 return false; 1035 } 1036 EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); 1037 1038 #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ 1039 1040 /** 1041 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle 1042 * 1043 * If the current CPU is idle or running at a first-level (not nested) 1044 * interrupt from idle, return true. The caller must have at least 1045 * disabled preemption. 1046 */ 1047 static int rcu_is_cpu_rrupt_from_idle(void) 1048 { 1049 return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 0 && 1050 __this_cpu_read(rcu_dynticks.dynticks_nmi_nesting) <= 1; 1051 } 1052 1053 /* 1054 * We are reporting a quiescent state on behalf of some other CPU, so 1055 * it is our responsibility to check for and handle potential overflow 1056 * of the rcu_node ->gp_seq counter with respect to the rcu_data counters. 1057 * After all, the CPU might be in deep idle state, and thus executing no 1058 * code whatsoever. 1059 */ 1060 static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) 1061 { 1062 raw_lockdep_assert_held_rcu_node(rnp); 1063 if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4, 1064 rnp->gp_seq)) 1065 WRITE_ONCE(rdp->gpwrap, true); 1066 if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq)) 1067 rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4; 1068 } 1069 1070 /* 1071 * Snapshot the specified CPU's dynticks counter so that we can later 1072 * credit them with an implicit quiescent state. Return 1 if this CPU 1073 * is in dynticks idle mode, which is an extended quiescent state. 1074 */ 1075 static int dyntick_save_progress_counter(struct rcu_data *rdp) 1076 { 1077 rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); 1078 if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { 1079 trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); 1080 rcu_gpnum_ovf(rdp->mynode, rdp); 1081 return 1; 1082 } 1083 return 0; 1084 } 1085 1086 /* 1087 * Handler for the irq_work request posted when a grace period has 1088 * gone on for too long, but not yet long enough for an RCU CPU 1089 * stall warning. Set state appropriately, but just complain if 1090 * there is unexpected state on entry. 1091 */ 1092 static void rcu_iw_handler(struct irq_work *iwp) 1093 { 1094 struct rcu_data *rdp; 1095 struct rcu_node *rnp; 1096 1097 rdp = container_of(iwp, struct rcu_data, rcu_iw); 1098 rnp = rdp->mynode; 1099 raw_spin_lock_rcu_node(rnp); 1100 if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { 1101 rdp->rcu_iw_gp_seq = rnp->gp_seq; 1102 rdp->rcu_iw_pending = false; 1103 } 1104 raw_spin_unlock_rcu_node(rnp); 1105 } 1106 1107 /* 1108 * Return true if the specified CPU has passed through a quiescent 1109 * state by virtue of being in or having passed through an dynticks 1110 * idle state since the last call to dyntick_save_progress_counter() 1111 * for this same CPU, or by virtue of having been offline. 
1112 */ 1113 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 1114 { 1115 unsigned long jtsq; 1116 bool *rnhqp; 1117 bool *ruqp; 1118 struct rcu_node *rnp = rdp->mynode; 1119 1120 /* 1121 * If the CPU passed through or entered a dynticks idle phase with 1122 * no active irq/NMI handlers, then we can safely pretend that the CPU 1123 * already acknowledged the request to pass through a quiescent 1124 * state. Either way, that CPU cannot possibly be in an RCU 1125 * read-side critical section that started before the beginning 1126 * of the current RCU grace period. 1127 */ 1128 if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { 1129 trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("dti")); 1130 rdp->dynticks_fqs++; 1131 rcu_gpnum_ovf(rnp, rdp); 1132 return 1; 1133 } 1134 1135 /* 1136 * Has this CPU encountered a cond_resched() since the beginning 1137 * of the grace period? For this to be the case, the CPU has to 1138 * have noticed the current grace period. This might not be the 1139 * case for nohz_full CPUs looping in the kernel. 1140 */ 1141 jtsq = jiffies_till_sched_qs; 1142 ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); 1143 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && 1144 READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && 1145 rcu_seq_current(&rdp->gp_seq) == rnp->gp_seq && !rdp->gpwrap) { 1146 trace_rcu_fqs(rdp->rsp->name, rdp->gp_seq, rdp->cpu, TPS("rqc")); 1147 rcu_gpnum_ovf(rnp, rdp); 1148 return 1; 1149 } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) { 1150 /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ 1151 smp_store_release(ruqp, true); 1152 } 1153 1154 /* If waiting too long on an offline CPU, complain. */ 1155 if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && 1156 time_after(jiffies, rdp->rsp->gp_start + HZ)) { 1157 bool onl; 1158 struct rcu_node *rnp1; 1159 1160 WARN_ON(1); /* Offline CPUs are supposed to report QS! */ 1161 pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", 1162 __func__, rnp->grplo, rnp->grphi, rnp->level, 1163 (long)rnp->gp_seq, (long)rnp->completedqs); 1164 for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) 1165 pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n", 1166 __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask); 1167 onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); 1168 pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n", 1169 __func__, rdp->cpu, ".o"[onl], 1170 (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, 1171 (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); 1172 return 1; /* Break things loose after complaining. */ 1173 } 1174 1175 /* 1176 * A CPU running for an extended time within the kernel can 1177 * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, 1178 * even context-switching back and forth between a pair of 1179 * in-kernel CPU-bound tasks cannot advance grace periods. 1180 * So if the grace period is old enough, make the CPU pay attention. 1181 * Note that the unsynchronized assignments to the per-CPU 1182 * rcu_need_heavy_qs variable are safe. Yes, setting of 1183 * bits can be lost, but they will be set again on the next 1184 * force-quiescent-state pass. So lost bit sets do not result 1185 * in incorrect behavior, merely in a grace period lasting 1186 * a few jiffies longer than it might otherwise. 
Because 1187 * there are at most four threads involved, and because the 1188 * updates are only once every few jiffies, the probability of 1189 * lossage (and thus of slight grace-period extension) is 1190 * quite low. 1191 */ 1192 rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); 1193 if (!READ_ONCE(*rnhqp) && 1194 (time_after(jiffies, rdp->rsp->gp_start + jtsq) || 1195 time_after(jiffies, rdp->rsp->jiffies_resched))) { 1196 WRITE_ONCE(*rnhqp, true); 1197 /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ 1198 smp_store_release(ruqp, true); 1199 rdp->rsp->jiffies_resched += jtsq; /* Re-enable beating. */ 1200 } 1201 1202 /* 1203 * If more than halfway to RCU CPU stall-warning time, do a 1204 * resched_cpu() to try to loosen things up a bit. Also check to 1205 * see if the CPU is getting hammered with interrupts, but only 1206 * once per grace period, just to keep the IPIs down to a dull roar. 1207 */ 1208 if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) { 1209 resched_cpu(rdp->cpu); 1210 if (IS_ENABLED(CONFIG_IRQ_WORK) && 1211 !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && 1212 (rnp->ffmask & rdp->grpmask)) { 1213 init_irq_work(&rdp->rcu_iw, rcu_iw_handler); 1214 rdp->rcu_iw_pending = true; 1215 rdp->rcu_iw_gp_seq = rnp->gp_seq; 1216 irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); 1217 } 1218 } 1219 1220 return 0; 1221 } 1222 1223 static void record_gp_stall_check_time(struct rcu_state *rsp) 1224 { 1225 unsigned long j = jiffies; 1226 unsigned long j1; 1227 1228 rsp->gp_start = j; 1229 j1 = rcu_jiffies_till_stall_check(); 1230 /* Record ->gp_start before ->jiffies_stall. */ 1231 smp_store_release(&rsp->jiffies_stall, j + j1); /* ^^^ */ 1232 rsp->jiffies_resched = j + j1 / 2; 1233 rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs); 1234 } 1235 1236 /* 1237 * Convert a ->gp_state value to a character string. 1238 */ 1239 static const char *gp_state_getname(short gs) 1240 { 1241 if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) 1242 return "???"; 1243 return gp_state_names[gs]; 1244 } 1245 1246 /* 1247 * Complain about starvation of grace-period kthread. 1248 */ 1249 static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) 1250 { 1251 unsigned long gpa; 1252 unsigned long j; 1253 1254 j = jiffies; 1255 gpa = READ_ONCE(rsp->gp_activity); 1256 if (j - gpa > 2 * HZ) { 1257 pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", 1258 rsp->name, j - gpa, 1259 (long)rcu_seq_current(&rsp->gp_seq), 1260 rsp->gp_flags, 1261 gp_state_getname(rsp->gp_state), rsp->gp_state, 1262 rsp->gp_kthread ? rsp->gp_kthread->state : ~0, 1263 rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); 1264 if (rsp->gp_kthread) { 1265 pr_err("RCU grace-period kthread stack dump:\n"); 1266 sched_show_task(rsp->gp_kthread); 1267 wake_up_process(rsp->gp_kthread); 1268 } 1269 } 1270 } 1271 1272 /* 1273 * Dump stacks of all tasks running on stalled CPUs. First try using 1274 * NMIs, but fall back to manual remote stack tracing on architectures 1275 * that don't support NMI-based stack dumps. The NMI-triggered stack 1276 * traces are more accurate because they are printed by the target CPU. 
1277 */ 1278 static void rcu_dump_cpu_stacks(struct rcu_state *rsp) 1279 { 1280 int cpu; 1281 unsigned long flags; 1282 struct rcu_node *rnp; 1283 1284 rcu_for_each_leaf_node(rsp, rnp) { 1285 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1286 for_each_leaf_node_possible_cpu(rnp, cpu) 1287 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) 1288 if (!trigger_single_cpu_backtrace(cpu)) 1289 dump_cpu_task(cpu); 1290 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1291 } 1292 } 1293 1294 /* 1295 * If too much time has passed in the current grace period, and if 1296 * so configured, go kick the relevant kthreads. 1297 */ 1298 static void rcu_stall_kick_kthreads(struct rcu_state *rsp) 1299 { 1300 unsigned long j; 1301 1302 if (!rcu_kick_kthreads) 1303 return; 1304 j = READ_ONCE(rsp->jiffies_kick_kthreads); 1305 if (time_after(jiffies, j) && rsp->gp_kthread && 1306 (rcu_gp_in_progress(rsp) || READ_ONCE(rsp->gp_flags))) { 1307 WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); 1308 rcu_ftrace_dump(DUMP_ALL); 1309 wake_up_process(rsp->gp_kthread); 1310 WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); 1311 } 1312 } 1313 1314 static inline void panic_on_rcu_stall(void) 1315 { 1316 if (sysctl_panic_on_rcu_stall) 1317 panic("RCU Stall\n"); 1318 } 1319 1320 static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gp_seq) 1321 { 1322 int cpu; 1323 unsigned long flags; 1324 unsigned long gpa; 1325 unsigned long j; 1326 int ndetected = 0; 1327 struct rcu_node *rnp = rcu_get_root(rsp); 1328 long totqlen = 0; 1329 1330 /* Kick and suppress, if so configured. */ 1331 rcu_stall_kick_kthreads(rsp); 1332 if (rcu_cpu_stall_suppress) 1333 return; 1334 1335 /* 1336 * OK, time to rat on our buddy... 1337 * See Documentation/RCU/stallwarn.txt for info on how to debug 1338 * RCU CPU stall warnings. 1339 */ 1340 pr_err("INFO: %s detected stalls on CPUs/tasks:", 1341 rsp->name); 1342 print_cpu_stall_info_begin(); 1343 rcu_for_each_leaf_node(rsp, rnp) { 1344 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1345 ndetected += rcu_print_task_stall(rnp); 1346 if (rnp->qsmask != 0) { 1347 for_each_leaf_node_possible_cpu(rnp, cpu) 1348 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { 1349 print_cpu_stall_info(rsp, cpu); 1350 ndetected++; 1351 } 1352 } 1353 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1354 } 1355 1356 print_cpu_stall_info_end(); 1357 for_each_possible_cpu(cpu) 1358 totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, 1359 cpu)->cblist); 1360 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", 1361 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1362 (long)rcu_seq_current(&rsp->gp_seq), totqlen); 1363 if (ndetected) { 1364 rcu_dump_cpu_stacks(rsp); 1365 1366 /* Complain about tasks blocking the grace period. */ 1367 rcu_print_detail_task_stall(rsp); 1368 } else { 1369 if (rcu_seq_current(&rsp->gp_seq) != gp_seq) { 1370 pr_err("INFO: Stall ended before state dump start\n"); 1371 } else { 1372 j = jiffies; 1373 gpa = READ_ONCE(rsp->gp_activity); 1374 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", 1375 rsp->name, j - gpa, j, gpa, 1376 jiffies_till_next_fqs, 1377 rcu_get_root(rsp)->qsmask); 1378 /* In this case, the current CPU might be at fault. */ 1379 sched_show_task(current); 1380 } 1381 } 1382 /* Rewrite if needed in case of slow consoles. 
*/ 1383 if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) 1384 WRITE_ONCE(rsp->jiffies_stall, 1385 jiffies + 3 * rcu_jiffies_till_stall_check() + 3); 1386 1387 rcu_check_gp_kthread_starvation(rsp); 1388 1389 panic_on_rcu_stall(); 1390 1391 force_quiescent_state(rsp); /* Kick them all. */ 1392 } 1393 1394 static void print_cpu_stall(struct rcu_state *rsp) 1395 { 1396 int cpu; 1397 unsigned long flags; 1398 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1399 struct rcu_node *rnp = rcu_get_root(rsp); 1400 long totqlen = 0; 1401 1402 /* Kick and suppress, if so configured. */ 1403 rcu_stall_kick_kthreads(rsp); 1404 if (rcu_cpu_stall_suppress) 1405 return; 1406 1407 /* 1408 * OK, time to rat on ourselves... 1409 * See Documentation/RCU/stallwarn.txt for info on how to debug 1410 * RCU CPU stall warnings. 1411 */ 1412 pr_err("INFO: %s self-detected stall on CPU", rsp->name); 1413 print_cpu_stall_info_begin(); 1414 raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); 1415 print_cpu_stall_info(rsp, smp_processor_id()); 1416 raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); 1417 print_cpu_stall_info_end(); 1418 for_each_possible_cpu(cpu) 1419 totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, 1420 cpu)->cblist); 1421 pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", 1422 jiffies - rsp->gp_start, 1423 (long)rcu_seq_current(&rsp->gp_seq), totqlen); 1424 1425 rcu_check_gp_kthread_starvation(rsp); 1426 1427 rcu_dump_cpu_stacks(rsp); 1428 1429 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1430 /* Rewrite if needed in case of slow consoles. */ 1431 if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) 1432 WRITE_ONCE(rsp->jiffies_stall, 1433 jiffies + 3 * rcu_jiffies_till_stall_check() + 3); 1434 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1435 1436 panic_on_rcu_stall(); 1437 1438 /* 1439 * Attempt to revive the RCU machinery by forcing a context switch. 1440 * 1441 * A context switch would normally allow the RCU state machine to make 1442 * progress and it could be we're stuck in kernel space without context 1443 * switches for an entirely unreasonable amount of time. 1444 */ 1445 resched_cpu(smp_processor_id()); 1446 } 1447 1448 static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 1449 { 1450 unsigned long gs1; 1451 unsigned long gs2; 1452 unsigned long gps; 1453 unsigned long j; 1454 unsigned long jn; 1455 unsigned long js; 1456 struct rcu_node *rnp; 1457 1458 if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || 1459 !rcu_gp_in_progress(rsp)) 1460 return; 1461 rcu_stall_kick_kthreads(rsp); 1462 j = jiffies; 1463 1464 /* 1465 * Lots of memory barriers to reject false positives. 1466 * 1467 * The idea is to pick up rsp->gp_seq, then rsp->jiffies_stall, 1468 * then rsp->gp_start, and finally another copy of rsp->gp_seq. 1469 * These values are updated in the opposite order with memory 1470 * barriers (or equivalent) during grace-period initialization 1471 * and cleanup. Now, a false positive can occur if we get an new 1472 * value of rsp->gp_start and a old value of rsp->jiffies_stall. 1473 * But given the memory barriers, the only way that this can happen 1474 * is if one grace period ends and another starts between these 1475 * two fetches. This is detected by comparing the second fetch 1476 * of rsp->gp_seq with the previous fetch from rsp->gp_seq. 1477 * 1478 * Given this check, comparisons of jiffies, rsp->jiffies_stall, 1479 * and rsp->gp_start suffice to forestall false positives. 
1480 */ 1481 gs1 = READ_ONCE(rsp->gp_seq); 1482 smp_rmb(); /* Pick up ->gp_seq first... */ 1483 js = READ_ONCE(rsp->jiffies_stall); 1484 smp_rmb(); /* ...then ->jiffies_stall before the rest... */ 1485 gps = READ_ONCE(rsp->gp_start); 1486 smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ 1487 gs2 = READ_ONCE(rsp->gp_seq); 1488 if (gs1 != gs2 || 1489 ULONG_CMP_LT(j, js) || 1490 ULONG_CMP_GE(gps, js)) 1491 return; /* No stall or GP completed since entering function. */ 1492 rnp = rdp->mynode; 1493 jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; 1494 if (rcu_gp_in_progress(rsp) && 1495 (READ_ONCE(rnp->qsmask) & rdp->grpmask) && 1496 cmpxchg(&rsp->jiffies_stall, js, jn) == js) { 1497 1498 /* We haven't checked in, so go dump stack. */ 1499 print_cpu_stall(rsp); 1500 1501 } else if (rcu_gp_in_progress(rsp) && 1502 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && 1503 cmpxchg(&rsp->jiffies_stall, js, jn) == js) { 1504 1505 /* They had a few time units to dump stack, so complain. */ 1506 print_other_cpu_stall(rsp, gs2); 1507 } 1508 } 1509 1510 /** 1511 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period 1512 * 1513 * Set the stall-warning timeout way off into the future, thus preventing 1514 * any RCU CPU stall-warning messages from appearing in the current set of 1515 * RCU grace periods. 1516 * 1517 * The caller must disable hard irqs. 1518 */ 1519 void rcu_cpu_stall_reset(void) 1520 { 1521 struct rcu_state *rsp; 1522 1523 for_each_rcu_flavor(rsp) 1524 WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2); 1525 } 1526 1527 /* Trace-event wrapper function for trace_rcu_future_grace_period. */ 1528 static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, 1529 unsigned long gp_seq_req, const char *s) 1530 { 1531 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gp_seq, gp_seq_req, 1532 rnp->level, rnp->grplo, rnp->grphi, s); 1533 } 1534 1535 /* 1536 * rcu_start_this_gp - Request the start of a particular grace period 1537 * @rnp_start: The leaf node of the CPU from which to start. 1538 * @rdp: The rcu_data corresponding to the CPU from which to start. 1539 * @gp_seq_req: The gp_seq of the grace period to start. 1540 * 1541 * Start the specified grace period, as needed to handle newly arrived 1542 * callbacks. The required future grace periods are recorded in each 1543 * rcu_node structure's ->gp_seq_needed field. Returns true if there 1544 * is reason to awaken the grace-period kthread. 1545 * 1546 * The caller must hold the specified rcu_node structure's ->lock, which 1547 * is why the caller is responsible for waking the grace-period kthread. 1548 * 1549 * Returns true if the GP thread needs to be awakened else false. 1550 */ 1551 static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, 1552 unsigned long gp_seq_req) 1553 { 1554 bool ret = false; 1555 struct rcu_state *rsp = rdp->rsp; 1556 struct rcu_node *rnp; 1557 1558 /* 1559 * Use funnel locking to either acquire the root rcu_node 1560 * structure's lock or bail out if the need for this grace period 1561 * has already been recorded -- or if that grace period has in 1562 * fact already started. If there is already a grace period in 1563 * progress in a non-leaf node, no recording is needed because the 1564 * end of the grace period will scan the leaf rcu_node structures. 1565 * Note that rnp_start->lock must not be released. 
1566 */ 1567 raw_lockdep_assert_held_rcu_node(rnp_start); 1568 trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf")); 1569 for (rnp = rnp_start; 1; rnp = rnp->parent) { 1570 if (rnp != rnp_start) 1571 raw_spin_lock_rcu_node(rnp); 1572 if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) || 1573 rcu_seq_started(&rnp->gp_seq, gp_seq_req) || 1574 (rnp != rnp_start && 1575 rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) { 1576 trace_rcu_this_gp(rnp, rdp, gp_seq_req, 1577 TPS("Prestarted")); 1578 goto unlock_out; 1579 } 1580 rnp->gp_seq_needed = gp_seq_req; 1581 if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { 1582 /* 1583 * We just marked the leaf or internal node, and a 1584 * grace period is in progress, which means that 1585 * rcu_gp_cleanup() will see the marking. Bail to 1586 * reduce contention. 1587 */ 1588 trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, 1589 TPS("Startedleaf")); 1590 goto unlock_out; 1591 } 1592 if (rnp != rnp_start && rnp->parent != NULL) 1593 raw_spin_unlock_rcu_node(rnp); 1594 if (!rnp->parent) 1595 break; /* At root, and perhaps also leaf. */ 1596 } 1597 1598 /* If GP already in progress, just leave, otherwise start one. */ 1599 if (rcu_gp_in_progress(rsp)) { 1600 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot")); 1601 goto unlock_out; 1602 } 1603 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); 1604 WRITE_ONCE(rsp->gp_flags, rsp->gp_flags | RCU_GP_FLAG_INIT); 1605 rsp->gp_req_activity = jiffies; 1606 if (!rsp->gp_kthread) { 1607 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); 1608 goto unlock_out; 1609 } 1610 trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), TPS("newreq")); 1611 ret = true; /* Caller must wake GP kthread. */ 1612 unlock_out: 1613 /* Push furthest requested GP to leaf node and rcu_data structure. */ 1614 if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) { 1615 rnp_start->gp_seq_needed = rnp->gp_seq_needed; 1616 rdp->gp_seq_needed = rnp->gp_seq_needed; 1617 } 1618 if (rnp != rnp_start) 1619 raw_spin_unlock_rcu_node(rnp); 1620 return ret; 1621 } 1622 1623 /* 1624 * Clean up any old requests for the just-ended grace period. Also return 1625 * whether any additional grace periods have been requested. 1626 */ 1627 static bool rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 1628 { 1629 bool needmore; 1630 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1631 1632 needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed); 1633 if (!needmore) 1634 rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */ 1635 trace_rcu_this_gp(rnp, rdp, rnp->gp_seq, 1636 needmore ? TPS("CleanupMore") : TPS("Cleanup")); 1637 return needmore; 1638 } 1639 1640 /* 1641 * Awaken the grace-period kthread for the specified flavor of RCU. 1642 * Don't do a self-awaken, and don't bother awakening when there is 1643 * nothing for the grace-period kthread to do (as in several CPUs 1644 * raced to awaken, and we lost), and finally don't try to awaken 1645 * a kthread that has not yet been created. 1646 */ 1647 static void rcu_gp_kthread_wake(struct rcu_state *rsp) 1648 { 1649 if (current == rsp->gp_kthread || 1650 !READ_ONCE(rsp->gp_flags) || 1651 !rsp->gp_kthread) 1652 return; 1653 swake_up(&rsp->gp_wq); 1654 } 1655 1656 /* 1657 * If there is room, assign a ->gp_seq number to any callbacks on this 1658 * CPU that have not already been assigned. 
Also accelerate any callbacks 1659 * that were previously assigned a ->gp_seq number that has since proven 1660 * to be too conservative, which can happen if callbacks get assigned a 1661 * ->gp_seq number while RCU is idle, but with reference to a non-root 1662 * rcu_node structure. This function is idempotent, so it does not hurt 1663 * to call it repeatedly. Returns a flag saying that we should awaken 1664 * the RCU grace-period kthread. 1665 * 1666 * The caller must hold rnp->lock with interrupts disabled. 1667 */ 1668 static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1669 struct rcu_data *rdp) 1670 { 1671 unsigned long gp_seq_req; 1672 bool ret = false; 1673 1674 raw_lockdep_assert_held_rcu_node(rnp); 1675 1676 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ 1677 if (!rcu_segcblist_pend_cbs(&rdp->cblist)) 1678 return false; 1679 1680 /* 1681 * Callbacks are often registered with incomplete grace-period 1682 * information, because obtaining exact 1683 * information would require acquiring a global lock. RCU therefore 1684 * makes a conservative estimate of the grace period number at which 1685 * a given callback will become ready to invoke. The following 1686 * code checks this estimate and improves it when possible, thus 1687 * accelerating callback invocation to an earlier grace-period 1688 * number. 1689 */ 1690 gp_seq_req = rcu_seq_snap(&rsp->gp_seq); 1691 if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req)) 1692 ret = rcu_start_this_gp(rnp, rdp, gp_seq_req); 1693 1694 /* Trace depending on how much we were able to accelerate. */ 1695 if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) 1696 trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccWaitCB")); 1697 else 1698 trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("AccReadyCB")); 1699 return ret; 1700 } 1701 1702 /* 1703 * Similar to rcu_accelerate_cbs(), but does not require that the leaf 1704 * rcu_node structure's ->lock be held. It consults the cached value 1705 * of ->gp_seq_needed in the rcu_data structure, and if that indicates 1706 * that a new grace-period request must be made, invokes rcu_accelerate_cbs() 1707 * while holding the leaf rcu_node structure's ->lock. 1708 */ 1709 static void rcu_accelerate_cbs_unlocked(struct rcu_state *rsp, 1710 struct rcu_node *rnp, 1711 struct rcu_data *rdp) 1712 { 1713 unsigned long c; 1714 bool needwake; 1715 1716 lockdep_assert_irqs_disabled(); 1717 c = rcu_seq_snap(&rsp->gp_seq); 1718 if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { 1719 /* Old request still live, so mark recent callbacks. */ 1720 (void)rcu_segcblist_accelerate(&rdp->cblist, c); 1721 return; 1722 } 1723 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 1724 needwake = rcu_accelerate_cbs(rsp, rnp, rdp); 1725 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ 1726 if (needwake) 1727 rcu_gp_kthread_wake(rsp); 1728 } 1729 1730 /* 1731 * Move any callbacks whose grace period has completed to the 1732 * RCU_DONE_TAIL sublist, then compact the remaining sublists and 1733 * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL 1734 * sublist. This function is idempotent, so it does not hurt to 1735 * invoke it repeatedly. As long as it is not invoked -too- often... 1736 * Returns true if the RCU grace-period kthread needs to be awakened. 1737 * 1738 * The caller must hold rnp->lock with interrupts disabled.
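 *
 * As a simplified picture, callbacks migrate through the segmented
 * callback list in this direction as grace periods are assigned and
 * then complete:
 *
 *	RCU_NEXT_TAIL -> RCU_NEXT_READY_TAIL -> RCU_WAIT_TAIL -> RCU_DONE_TAIL
 *
 * rcu_segcblist_advance() below moves callbacks whose assigned ->gp_seq
 * has completed into the RCU_DONE_TAIL sublist, and rcu_accelerate_cbs()
 * then assigns ->gp_seq numbers to any remaining unassigned callbacks.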
1739 */ 1740 static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1741 struct rcu_data *rdp) 1742 { 1743 raw_lockdep_assert_held_rcu_node(rnp); 1744 1745 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ 1746 if (!rcu_segcblist_pend_cbs(&rdp->cblist)) 1747 return false; 1748 1749 /* 1750 * Find all callbacks whose ->gp_seq numbers indicate that they 1751 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. 1752 */ 1753 rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq); 1754 1755 /* Classify any remaining callbacks. */ 1756 return rcu_accelerate_cbs(rsp, rnp, rdp); 1757 } 1758 1759 /* 1760 * Update CPU-local rcu_data state to record the beginnings and ends of 1761 * grace periods. The caller must hold the ->lock of the leaf rcu_node 1762 * structure corresponding to the current CPU, and must have irqs disabled. 1763 * Returns true if the grace-period kthread needs to be awakened. 1764 */ 1765 static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, 1766 struct rcu_data *rdp) 1767 { 1768 bool ret; 1769 bool need_gp; 1770 1771 raw_lockdep_assert_held_rcu_node(rnp); 1772 1773 if (rdp->gp_seq == rnp->gp_seq) 1774 return false; /* Nothing to do. */ 1775 1776 /* Handle the ends of any preceding grace periods first. */ 1777 if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || 1778 unlikely(READ_ONCE(rdp->gpwrap))) { 1779 ret = rcu_advance_cbs(rsp, rnp, rdp); /* Advance callbacks. */ 1780 trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuend")); 1781 } else { 1782 ret = rcu_accelerate_cbs(rsp, rnp, rdp); /* Recent callbacks. */ 1783 } 1784 1785 /* Now handle the beginnings of any new-to-this-CPU grace periods. */ 1786 if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || 1787 unlikely(READ_ONCE(rdp->gpwrap))) { 1788 /* 1789 * If the current grace period is waiting for this CPU, 1790 * set up to detect a quiescent state, otherwise don't 1791 * go looking for one. 1792 */ 1793 trace_rcu_grace_period(rsp->name, rnp->gp_seq, TPS("cpustart")); 1794 need_gp = !!(rnp->qsmask & rdp->grpmask); 1795 rdp->cpu_no_qs.b.norm = need_gp; 1796 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); 1797 rdp->core_needs_qs = need_gp; 1798 zero_cpu_stall_ticks(rdp); 1799 } 1800 rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ 1801 if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap) 1802 rdp->gp_seq_needed = rnp->gp_seq_needed; 1803 WRITE_ONCE(rdp->gpwrap, false); 1804 rcu_gpnum_ovf(rnp, rdp); 1805 return ret; 1806 } 1807 1808 static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) 1809 { 1810 unsigned long flags; 1811 bool needwake; 1812 struct rcu_node *rnp; 1813 1814 local_irq_save(flags); 1815 rnp = rdp->mynode; 1816 if ((rdp->gp_seq == rcu_seq_current(&rnp->gp_seq) && 1817 !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ 1818 !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ 1819 local_irq_restore(flags); 1820 return; 1821 } 1822 needwake = __note_gp_changes(rsp, rnp, rdp); 1823 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1824 if (needwake) 1825 rcu_gp_kthread_wake(rsp); 1826 } 1827 1828 static void rcu_gp_slow(struct rcu_state *rsp, int delay) 1829 { 1830 if (delay > 0 && 1831 !(rcu_seq_ctr(rsp->gp_seq) % 1832 (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) 1833 schedule_timeout_uninterruptible(delay); 1834 } 1835 1836 /* 1837 * Initialize a new grace period. Return false if no grace period required. 
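 *
 * In rough outline: clear ->gp_flags and advance ->gp_seq, apply any
 * buffered CPU-hotplug transitions to the ->qsmaskinit fields
 * (RCU_GP_ONOFF), and then initialize ->qsmask and propagate the new
 * ->gp_seq breadth-first across the rcu_node tree (RCU_GP_INIT).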
1838 */ 1839 static bool rcu_gp_init(struct rcu_state *rsp) 1840 { 1841 unsigned long flags; 1842 unsigned long oldmask; 1843 unsigned long mask; 1844 struct rcu_data *rdp; 1845 struct rcu_node *rnp = rcu_get_root(rsp); 1846 1847 WRITE_ONCE(rsp->gp_activity, jiffies); 1848 raw_spin_lock_irq_rcu_node(rnp); 1849 if (!READ_ONCE(rsp->gp_flags)) { 1850 /* Spurious wakeup, tell caller to go back to sleep. */ 1851 raw_spin_unlock_irq_rcu_node(rnp); 1852 return false; 1853 } 1854 WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ 1855 1856 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { 1857 /* 1858 * Grace period already in progress, don't start another. 1859 * Not supposed to be able to happen. 1860 */ 1861 raw_spin_unlock_irq_rcu_node(rnp); 1862 return false; 1863 } 1864 1865 /* Advance to a new grace period and initialize state. */ 1866 record_gp_stall_check_time(rsp); 1867 /* Record GP times before starting GP, hence rcu_seq_start(). */ 1868 rcu_seq_start(&rsp->gp_seq); 1869 trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("start")); 1870 raw_spin_unlock_irq_rcu_node(rnp); 1871 1872 /* 1873 * Apply per-leaf buffered online and offline operations to the 1874 * rcu_node tree. Note that this new grace period need not wait 1875 * for subsequent online CPUs, and that quiescent-state forcing 1876 * will handle subsequent offline CPUs. 1877 */ 1878 rsp->gp_state = RCU_GP_ONOFF; 1879 rcu_for_each_leaf_node(rsp, rnp) { 1880 spin_lock(&rsp->ofl_lock); 1881 raw_spin_lock_irq_rcu_node(rnp); 1882 if (rnp->qsmaskinit == rnp->qsmaskinitnext && 1883 !rnp->wait_blkd_tasks) { 1884 /* Nothing to do on this leaf rcu_node structure. */ 1885 raw_spin_unlock_irq_rcu_node(rnp); 1886 spin_unlock(&rsp->ofl_lock); 1887 continue; 1888 } 1889 1890 /* Record old state, apply changes to ->qsmaskinit field. */ 1891 oldmask = rnp->qsmaskinit; 1892 rnp->qsmaskinit = rnp->qsmaskinitnext; 1893 1894 /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ 1895 if (!oldmask != !rnp->qsmaskinit) { 1896 if (!oldmask) { /* First online CPU for rcu_node. */ 1897 if (!rnp->wait_blkd_tasks) /* Ever offline? */ 1898 rcu_init_new_rnp(rnp); 1899 } else if (rcu_preempt_has_tasks(rnp)) { 1900 rnp->wait_blkd_tasks = true; /* blocked tasks */ 1901 } else { /* Last offline CPU and can propagate. */ 1902 rcu_cleanup_dead_rnp(rnp); 1903 } 1904 } 1905 1906 /* 1907 * If all waited-on tasks from prior grace period are 1908 * done, and if all this rcu_node structure's CPUs are 1909 * still offline, propagate up the rcu_node tree and 1910 * clear ->wait_blkd_tasks. Otherwise, if one of this 1911 * rcu_node structure's CPUs has since come back online, 1912 * simply clear ->wait_blkd_tasks. 1913 */ 1914 if (rnp->wait_blkd_tasks && 1915 (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) { 1916 rnp->wait_blkd_tasks = false; 1917 if (!rnp->qsmaskinit) 1918 rcu_cleanup_dead_rnp(rnp); 1919 } 1920 1921 raw_spin_unlock_irq_rcu_node(rnp); 1922 spin_unlock(&rsp->ofl_lock); 1923 } 1924 rcu_gp_slow(rsp, gp_preinit_delay); /* Races with CPU hotplug. */ 1925 1926 /* 1927 * Set the quiescent-state-needed bits in all the rcu_node 1928 * structures for all currently online CPUs in breadth-first order, 1929 * starting from the root rcu_node structure, relying on the layout 1930 * of the tree within the rsp->node[] array. Note that other CPUs 1931 * will access only the leaves of the hierarchy, thus seeing that no 1932 * grace period is in progress, at least until the corresponding 1933 * leaf node has been initialized. 
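 * (The rcu_node structures are laid out breadth-first in the ->node[]
 * array: the root is ->node[0], followed by any interior levels and
 * then the leaves, so a linear walk of the array always visits a parent
 * before its children.)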
1934 * 1935 * The grace period cannot complete until the initialization 1936 * process finishes, because this kthread handles both. 1937 */ 1938 rsp->gp_state = RCU_GP_INIT; 1939 rcu_for_each_node_breadth_first(rsp, rnp) { 1940 rcu_gp_slow(rsp, gp_init_delay); 1941 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1942 rdp = this_cpu_ptr(rsp->rda); 1943 rcu_preempt_check_blocked_tasks(rsp, rnp); 1944 rnp->qsmask = rnp->qsmaskinit; 1945 WRITE_ONCE(rnp->gp_seq, rsp->gp_seq); 1946 if (rnp == rdp->mynode) 1947 (void)__note_gp_changes(rsp, rnp, rdp); 1948 rcu_preempt_boost_start_gp(rnp); 1949 trace_rcu_grace_period_init(rsp->name, rnp->gp_seq, 1950 rnp->level, rnp->grplo, 1951 rnp->grphi, rnp->qsmask); 1952 /* Quiescent states for tasks on any now-offline CPUs. */ 1953 mask = rnp->qsmask & ~rnp->qsmaskinitnext; 1954 rnp->rcu_gp_init_mask = mask; 1955 if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) 1956 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); 1957 else 1958 raw_spin_unlock_irq_rcu_node(rnp); 1959 cond_resched_tasks_rcu_qs(); 1960 WRITE_ONCE(rsp->gp_activity, jiffies); 1961 } 1962 1963 return true; 1964 } 1965 1966 /* 1967 * Helper function for swait_event_idle() wakeup at force-quiescent-state 1968 * time. 1969 */ 1970 static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) 1971 { 1972 struct rcu_node *rnp = rcu_get_root(rsp); 1973 1974 /* Someone like call_rcu() requested a force-quiescent-state scan. */ 1975 *gfp = READ_ONCE(rsp->gp_flags); 1976 if (*gfp & RCU_GP_FLAG_FQS) 1977 return true; 1978 1979 /* The current grace period has completed. */ 1980 if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) 1981 return true; 1982 1983 return false; 1984 } 1985 1986 /* 1987 * Do one round of quiescent-state forcing. 1988 */ 1989 static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) 1990 { 1991 struct rcu_node *rnp = rcu_get_root(rsp); 1992 1993 WRITE_ONCE(rsp->gp_activity, jiffies); 1994 rsp->n_force_qs++; 1995 if (first_time) { 1996 /* Collect dyntick-idle snapshots. */ 1997 force_qs_rnp(rsp, dyntick_save_progress_counter); 1998 } else { 1999 /* Handle dyntick-idle and offline CPUs. */ 2000 force_qs_rnp(rsp, rcu_implicit_dynticks_qs); 2001 } 2002 /* Clear flag to prevent immediate re-entry. */ 2003 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2004 raw_spin_lock_irq_rcu_node(rnp); 2005 WRITE_ONCE(rsp->gp_flags, 2006 READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); 2007 raw_spin_unlock_irq_rcu_node(rnp); 2008 } 2009 } 2010 2011 /* 2012 * Clean up after the old grace period. 2013 */ 2014 static void rcu_gp_cleanup(struct rcu_state *rsp) 2015 { 2016 unsigned long gp_duration; 2017 bool needgp = false; 2018 unsigned long new_gp_seq; 2019 struct rcu_data *rdp; 2020 struct rcu_node *rnp = rcu_get_root(rsp); 2021 struct swait_queue_head *sq; 2022 2023 WRITE_ONCE(rsp->gp_activity, jiffies); 2024 raw_spin_lock_irq_rcu_node(rnp); 2025 gp_duration = jiffies - rsp->gp_start; 2026 if (gp_duration > rsp->gp_max) 2027 rsp->gp_max = gp_duration; 2028 2029 /* 2030 * We know the grace period is complete, but to everyone else 2031 * it appears to still be ongoing. But it is also the case 2032 * that to everyone else it looks like there is nothing that 2033 * they can do to advance the grace period. It is therefore 2034 * safe for us to drop the lock in order to mark the grace 2035 * period as completed in all of the rcu_node structures. 
2036 */ 2037 raw_spin_unlock_irq_rcu_node(rnp); 2038 2039 /* 2040 * Propagate new ->gp_seq value to rcu_node structures so that 2041 * other CPUs don't have to wait until the start of the next grace 2042 * period to process their callbacks. This also avoids some nasty 2043 * RCU grace-period initialization races by forcing the end of 2044 * the current grace period to be completely recorded in all of 2045 * the rcu_node structures before the beginning of the next grace 2046 * period is recorded in any of the rcu_node structures. 2047 */ 2048 new_gp_seq = rsp->gp_seq; 2049 rcu_seq_end(&new_gp_seq); 2050 rcu_for_each_node_breadth_first(rsp, rnp) { 2051 raw_spin_lock_irq_rcu_node(rnp); 2052 if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) 2053 dump_blkd_tasks(rsp, rnp, 10); 2054 WARN_ON_ONCE(rnp->qsmask); 2055 WRITE_ONCE(rnp->gp_seq, new_gp_seq); 2056 rdp = this_cpu_ptr(rsp->rda); 2057 if (rnp == rdp->mynode) 2058 needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; 2059 /* smp_mb() provided by prior unlock-lock pair. */ 2060 needgp = rcu_future_gp_cleanup(rsp, rnp) || needgp; 2061 sq = rcu_nocb_gp_get(rnp); 2062 raw_spin_unlock_irq_rcu_node(rnp); 2063 rcu_nocb_gp_cleanup(sq); 2064 cond_resched_tasks_rcu_qs(); 2065 WRITE_ONCE(rsp->gp_activity, jiffies); 2066 rcu_gp_slow(rsp, gp_cleanup_delay); 2067 } 2068 rnp = rcu_get_root(rsp); 2069 raw_spin_lock_irq_rcu_node(rnp); /* GP before rsp->gp_seq update. */ 2070 2071 /* Declare grace period done. */ 2072 rcu_seq_end(&rsp->gp_seq); 2073 trace_rcu_grace_period(rsp->name, rsp->gp_seq, TPS("end")); 2074 rsp->gp_state = RCU_GP_IDLE; 2075 /* Check for GP requests since above loop. */ 2076 rdp = this_cpu_ptr(rsp->rda); 2077 if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { 2078 trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed, 2079 TPS("CleanupMore")); 2080 needgp = true; 2081 } 2082 /* Advance CBs to reduce false positives below. */ 2083 if (!rcu_accelerate_cbs(rsp, rnp, rdp) && needgp) { 2084 WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); 2085 rsp->gp_req_activity = jiffies; 2086 trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gp_seq), 2087 TPS("newreq")); 2088 } else { 2089 WRITE_ONCE(rsp->gp_flags, rsp->gp_flags & RCU_GP_FLAG_INIT); 2090 } 2091 raw_spin_unlock_irq_rcu_node(rnp); 2092 } 2093 2094 /* 2095 * Body of kthread that handles grace periods. 2096 */ 2097 static int __noreturn rcu_gp_kthread(void *arg) 2098 { 2099 bool first_gp_fqs; 2100 int gf; 2101 unsigned long j; 2102 int ret; 2103 struct rcu_state *rsp = arg; 2104 struct rcu_node *rnp = rcu_get_root(rsp); 2105 2106 rcu_bind_gp_kthread(); 2107 for (;;) { 2108 2109 /* Handle grace-period start. */ 2110 for (;;) { 2111 trace_rcu_grace_period(rsp->name, 2112 READ_ONCE(rsp->gp_seq), 2113 TPS("reqwait")); 2114 rsp->gp_state = RCU_GP_WAIT_GPS; 2115 swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & 2116 RCU_GP_FLAG_INIT); 2117 rsp->gp_state = RCU_GP_DONE_GPS; 2118 /* Locking provides needed memory barrier. */ 2119 if (rcu_gp_init(rsp)) 2120 break; 2121 cond_resched_tasks_rcu_qs(); 2122 WRITE_ONCE(rsp->gp_activity, jiffies); 2123 WARN_ON(signal_pending(current)); 2124 trace_rcu_grace_period(rsp->name, 2125 READ_ONCE(rsp->gp_seq), 2126 TPS("reqwaitsig")); 2127 } 2128 2129 /* Handle quiescent-state forcing. 
*/ 2130 first_gp_fqs = true; 2131 j = jiffies_till_first_fqs; 2132 if (j > HZ) { 2133 j = HZ; 2134 jiffies_till_first_fqs = HZ; 2135 } 2136 ret = 0; 2137 for (;;) { 2138 if (!ret) { 2139 rsp->jiffies_force_qs = jiffies + j; 2140 WRITE_ONCE(rsp->jiffies_kick_kthreads, 2141 jiffies + 3 * j); 2142 } 2143 trace_rcu_grace_period(rsp->name, 2144 READ_ONCE(rsp->gp_seq), 2145 TPS("fqswait")); 2146 rsp->gp_state = RCU_GP_WAIT_FQS; 2147 ret = swait_event_idle_timeout(rsp->gp_wq, 2148 rcu_gp_fqs_check_wake(rsp, &gf), j); 2149 rsp->gp_state = RCU_GP_DOING_FQS; 2150 /* Locking provides needed memory barriers. */ 2151 /* If grace period done, leave loop. */ 2152 if (!READ_ONCE(rnp->qsmask) && 2153 !rcu_preempt_blocked_readers_cgp(rnp)) 2154 break; 2155 /* If time for quiescent-state forcing, do it. */ 2156 if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || 2157 (gf & RCU_GP_FLAG_FQS)) { 2158 trace_rcu_grace_period(rsp->name, 2159 READ_ONCE(rsp->gp_seq), 2160 TPS("fqsstart")); 2161 rcu_gp_fqs(rsp, first_gp_fqs); 2162 first_gp_fqs = false; 2163 trace_rcu_grace_period(rsp->name, 2164 READ_ONCE(rsp->gp_seq), 2165 TPS("fqsend")); 2166 cond_resched_tasks_rcu_qs(); 2167 WRITE_ONCE(rsp->gp_activity, jiffies); 2168 ret = 0; /* Force full wait till next FQS. */ 2169 j = jiffies_till_next_fqs; 2170 if (j > HZ) { 2171 j = HZ; 2172 jiffies_till_next_fqs = HZ; 2173 } else if (j < 1) { 2174 j = 1; 2175 jiffies_till_next_fqs = 1; 2176 } 2177 } else { 2178 /* Deal with stray signal. */ 2179 cond_resched_tasks_rcu_qs(); 2180 WRITE_ONCE(rsp->gp_activity, jiffies); 2181 WARN_ON(signal_pending(current)); 2182 trace_rcu_grace_period(rsp->name, 2183 READ_ONCE(rsp->gp_seq), 2184 TPS("fqswaitsig")); 2185 ret = 1; /* Keep old FQS timing. */ 2186 j = jiffies; 2187 if (time_after(jiffies, rsp->jiffies_force_qs)) 2188 j = 1; 2189 else 2190 j = rsp->jiffies_force_qs - j; 2191 } 2192 } 2193 2194 /* Handle grace-period end. */ 2195 rsp->gp_state = RCU_GP_CLEANUP; 2196 rcu_gp_cleanup(rsp); 2197 rsp->gp_state = RCU_GP_CLEANED; 2198 } 2199 } 2200 2201 /* 2202 * Report a full set of quiescent states to the specified rcu_state data 2203 * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period 2204 * kthread if another grace period is required. Whether we wake 2205 * the grace-period kthread or it awakens itself for the next round 2206 * of quiescent-state forcing, that kthread will clean up after the 2207 * just-completed grace period. Note that the caller must hold rnp->lock, 2208 * which is released before return. 2209 */ 2210 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 2211 __releases(rcu_get_root(rsp)->lock) 2212 { 2213 raw_lockdep_assert_held_rcu_node(rcu_get_root(rsp)); 2214 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 2215 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2216 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); 2217 rcu_gp_kthread_wake(rsp); 2218 } 2219 2220 /* 2221 * Similar to rcu_report_qs_rdp(), for which it is a helper function. 2222 * Allows quiescent states for a group of CPUs to be reported at one go 2223 * to the specified rcu_node structure, though all the CPUs in the group 2224 * must be represented by the same rcu_node structure (which need not be a 2225 * leaf rcu_node structure, though it often will be). The gps parameter 2226 * is the grace-period snapshot, which means that the quiescent states 2227 * are valid only if rnp->gp_seq is equal to gps. 
That structure's lock 2228 * must be held upon entry, and it is released before return. 2229 * 2230 * As a special case, if mask is zero, the bit-already-cleared check is 2231 * disabled. This allows propagating quiescent state due to resumed tasks 2232 * during grace-period initialization. 2233 */ 2234 static void 2235 rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, 2236 struct rcu_node *rnp, unsigned long gps, unsigned long flags) 2237 __releases(rnp->lock) 2238 { 2239 unsigned long oldmask = 0; 2240 struct rcu_node *rnp_c; 2241 2242 raw_lockdep_assert_held_rcu_node(rnp); 2243 2244 /* Walk up the rcu_node hierarchy. */ 2245 for (;;) { 2246 if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) { 2247 2248 /* 2249 * Our bit has already been cleared, or the 2250 * relevant grace period is already over, so done. 2251 */ 2252 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2253 return; 2254 } 2255 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ 2256 WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && 2257 rcu_preempt_blocked_readers_cgp(rnp)); 2258 rnp->qsmask &= ~mask; 2259 trace_rcu_quiescent_state_report(rsp->name, rnp->gp_seq, 2260 mask, rnp->qsmask, rnp->level, 2261 rnp->grplo, rnp->grphi, 2262 !!rnp->gp_tasks); 2263 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 2264 2265 /* Other bits still set at this level, so done. */ 2266 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2267 return; 2268 } 2269 rnp->completedqs = rnp->gp_seq; 2270 mask = rnp->grpmask; 2271 if (rnp->parent == NULL) { 2272 2273 /* No more levels. Exit loop holding root lock. */ 2274 2275 break; 2276 } 2277 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2278 rnp_c = rnp; 2279 rnp = rnp->parent; 2280 raw_spin_lock_irqsave_rcu_node(rnp, flags); 2281 oldmask = rnp_c->qsmask; 2282 } 2283 2284 /* 2285 * Get here if we are the last CPU to pass through a quiescent 2286 * state for this grace period. Invoke rcu_report_qs_rsp() 2287 * to clean up and start the next grace period if one is needed. 2288 */ 2289 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ 2290 } 2291 2292 /* 2293 * Record a quiescent state for all tasks that were previously queued 2294 * on the specified rcu_node structure and that were blocking the current 2295 * RCU grace period. The caller must hold the specified rnp->lock with 2296 * irqs disabled, and this lock is released upon return, but irqs remain 2297 * disabled. 2298 */ 2299 static void __maybe_unused 2300 rcu_report_unblock_qs_rnp(struct rcu_state *rsp, 2301 struct rcu_node *rnp, unsigned long flags) 2302 __releases(rnp->lock) 2303 { 2304 unsigned long gps; 2305 unsigned long mask; 2306 struct rcu_node *rnp_p; 2307 2308 raw_lockdep_assert_held_rcu_node(rnp); 2309 if (WARN_ON_ONCE(rcu_state_p == &rcu_sched_state) || 2310 WARN_ON_ONCE(rsp != rcu_state_p) || 2311 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || 2312 rnp->qsmask != 0) { 2313 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2314 return; /* Still need more quiescent states! */ 2315 } 2316 2317 rnp->completedqs = rnp->gp_seq; 2318 rnp_p = rnp->parent; 2319 if (rnp_p == NULL) { 2320 /* 2321 * Only one rcu_node structure in the tree, so don't 2322 * try to report up to its nonexistent parent! 2323 */ 2324 rcu_report_qs_rsp(rsp, flags); 2325 return; 2326 } 2327 2328 /* Report up the rest of the hierarchy, tracking current ->gp_seq. */ 2329 gps = rnp->gp_seq; 2330 mask = rnp->grpmask; 2331 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. 
*/ 2332 raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ 2333 rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); 2334 } 2335 2336 /* 2337 * Record a quiescent state for the specified CPU to that CPU's rcu_data 2338 * structure. This must be called from the specified CPU. 2339 */ 2340 static void 2341 rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) 2342 { 2343 unsigned long flags; 2344 unsigned long mask; 2345 bool needwake; 2346 struct rcu_node *rnp; 2347 2348 rnp = rdp->mynode; 2349 raw_spin_lock_irqsave_rcu_node(rnp, flags); 2350 if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || 2351 rdp->gpwrap) { 2352 2353 /* 2354 * The grace period in which this quiescent state was 2355 * recorded has ended, so don't report it upwards. 2356 * We will instead need a new quiescent state that lies 2357 * within the current grace period. 2358 */ 2359 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ 2360 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); 2361 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2362 return; 2363 } 2364 mask = rdp->grpmask; 2365 if ((rnp->qsmask & mask) == 0) { 2366 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2367 } else { 2368 rdp->core_needs_qs = false; 2369 2370 /* 2371 * This GP can't end until cpu checks in, so all of our 2372 * callbacks can be processed during the next GP. 2373 */ 2374 needwake = rcu_accelerate_cbs(rsp, rnp, rdp); 2375 2376 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); 2377 /* ^^^ Released rnp->lock */ 2378 if (needwake) 2379 rcu_gp_kthread_wake(rsp); 2380 } 2381 } 2382 2383 /* 2384 * Check to see if there is a new grace period of which this CPU 2385 * is not yet aware, and if so, set up local rcu_data state for it. 2386 * Otherwise, see if this CPU has just passed through its first 2387 * quiescent state for this grace period, and record that fact if so. 2388 */ 2389 static void 2390 rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) 2391 { 2392 /* Check for grace-period ends and beginnings. */ 2393 note_gp_changes(rsp, rdp); 2394 2395 /* 2396 * Does this CPU still need to do its part for current grace period? 2397 * If no, return and let the other CPUs do their part as well. 2398 */ 2399 if (!rdp->core_needs_qs) 2400 return; 2401 2402 /* 2403 * Was there a quiescent state since the beginning of the grace 2404 * period? If no, then exit and wait for the next call. 2405 */ 2406 if (rdp->cpu_no_qs.b.norm) 2407 return; 2408 2409 /* 2410 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 2411 * judge of that). 2412 */ 2413 rcu_report_qs_rdp(rdp->cpu, rsp, rdp); 2414 } 2415 2416 /* 2417 * Trace the fact that this CPU is going offline. 2418 */ 2419 static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 2420 { 2421 RCU_TRACE(bool blkd;) 2422 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) 2423 RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) 2424 2425 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 2426 return; 2427 2428 RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) 2429 trace_rcu_grace_period(rsp->name, rnp->gp_seq, 2430 blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); 2431 } 2432 2433 /* 2434 * All CPUs for the specified rcu_node structure have gone offline, 2435 * and all tasks that were preempted within an RCU read-side critical 2436 * section while running on one of those CPUs have since exited their RCU 2437 * read-side critical section. 
Some other CPU is reporting this fact with 2438 * the specified rcu_node structure's ->lock held and interrupts disabled. 2439 * This function therefore goes up the tree of rcu_node structures, 2440 * clearing the corresponding bits in the ->qsmaskinit fields. Note that 2441 * the leaf rcu_node structure's ->qsmaskinit field has already been 2442 * updated. 2443 * 2444 * This function does check that the specified rcu_node structure has 2445 * all CPUs offline and no blocked tasks, so it is OK to invoke it 2446 * prematurely. That said, invoking it after the fact will cost you 2447 * a needless lock acquisition. So once it has done its work, don't 2448 * invoke it again. 2449 */ 2450 static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) 2451 { 2452 long mask; 2453 struct rcu_node *rnp = rnp_leaf; 2454 2455 raw_lockdep_assert_held_rcu_node(rnp_leaf); 2456 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || 2457 WARN_ON_ONCE(rnp_leaf->qsmaskinit) || 2458 WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) 2459 return; 2460 for (;;) { 2461 mask = rnp->grpmask; 2462 rnp = rnp->parent; 2463 if (!rnp) 2464 break; 2465 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 2466 rnp->qsmaskinit &= ~mask; 2467 /* Between grace periods, so better already be zero! */ 2468 WARN_ON_ONCE(rnp->qsmask); 2469 if (rnp->qsmaskinit) { 2470 raw_spin_unlock_rcu_node(rnp); 2471 /* irqs remain disabled. */ 2472 return; 2473 } 2474 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ 2475 } 2476 } 2477 2478 /* 2479 * The CPU has been completely removed, and some other CPU is reporting 2480 * this fact from process context. Do the remainder of the cleanup. 2481 * There can only be one CPU hotplug operation at a time, so no need for 2482 * explicit locking. 2483 */ 2484 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2485 { 2486 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2487 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 2488 2489 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 2490 return; 2491 2492 /* Adjust any no-longer-needed kthreads. */ 2493 rcu_boost_kthread_setaffinity(rnp, -1); 2494 } 2495 2496 /* 2497 * Invoke any RCU callbacks that have made it to the end of their grace 2498 * period. Throttle as specified by rdp->blimit. 2499 */ 2500 static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) 2501 { 2502 unsigned long flags; 2503 struct rcu_head *rhp; 2504 struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); 2505 long bl, count; 2506 2507 /* If no callbacks are ready, just return. */ 2508 if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { 2509 trace_rcu_batch_start(rsp->name, 2510 rcu_segcblist_n_lazy_cbs(&rdp->cblist), 2511 rcu_segcblist_n_cbs(&rdp->cblist), 0); 2512 trace_rcu_batch_end(rsp->name, 0, 2513 !rcu_segcblist_empty(&rdp->cblist), 2514 need_resched(), is_idle_task(current), 2515 rcu_is_callbacks_kthread()); 2516 return; 2517 } 2518 2519 /* 2520 * Extract the list of ready callbacks, disabling interrupts to prevent 2521 * races with call_rcu() from interrupt handlers. Leave the 2522 * callback counts, as rcu_barrier() needs to be conservative. 2523 */ 2524 local_irq_save(flags); 2525 WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); 2526 bl = rdp->blimit; 2527 trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), 2528 rcu_segcblist_n_cbs(&rdp->cblist), bl); 2529 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); 2530 local_irq_restore(flags); 2531 2532 /* Invoke callbacks.
*/ 2533 rhp = rcu_cblist_dequeue(&rcl); 2534 for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { 2535 debug_rcu_head_unqueue(rhp); 2536 if (__rcu_reclaim(rsp->name, rhp)) 2537 rcu_cblist_dequeued_lazy(&rcl); 2538 /* 2539 * Stop only if limit reached and CPU has something to do. 2540 * Note: The rcl structure counts down from zero. 2541 */ 2542 if (-rcl.len >= bl && 2543 (need_resched() || 2544 (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) 2545 break; 2546 } 2547 2548 local_irq_save(flags); 2549 count = -rcl.len; 2550 trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(), 2551 is_idle_task(current), rcu_is_callbacks_kthread()); 2552 2553 /* Update counts and requeue any remaining callbacks. */ 2554 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); 2555 smp_mb(); /* List handling before counting for rcu_barrier(). */ 2556 rcu_segcblist_insert_count(&rdp->cblist, &rcl); 2557 2558 /* Reinstate batch limit if we have worked down the excess. */ 2559 count = rcu_segcblist_n_cbs(&rdp->cblist); 2560 if (rdp->blimit == LONG_MAX && count <= qlowmark) 2561 rdp->blimit = blimit; 2562 2563 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ 2564 if (count == 0 && rdp->qlen_last_fqs_check != 0) { 2565 rdp->qlen_last_fqs_check = 0; 2566 rdp->n_force_qs_snap = rsp->n_force_qs; 2567 } else if (count < rdp->qlen_last_fqs_check - qhimark) 2568 rdp->qlen_last_fqs_check = count; 2569 2570 /* 2571 * The following usually indicates a double call_rcu(). To track 2572 * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. 2573 */ 2574 WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); 2575 2576 local_irq_restore(flags); 2577 2578 /* Re-invoke RCU core processing if there are callbacks remaining. */ 2579 if (rcu_segcblist_ready_cbs(&rdp->cblist)) 2580 invoke_rcu_core(); 2581 } 2582 2583 /* 2584 * Check to see if this CPU is in a non-context-switch quiescent state 2585 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 2586 * Also schedule RCU core processing. 2587 * 2588 * This function must be called from hardirq context. It is normally 2589 * invoked from the scheduling-clock interrupt. 2590 */ 2591 void rcu_check_callbacks(int user) 2592 { 2593 trace_rcu_utilization(TPS("Start scheduler-tick")); 2594 increment_cpu_stall_ticks(); 2595 if (user || rcu_is_cpu_rrupt_from_idle()) { 2596 2597 /* 2598 * Get here if this CPU took its interrupt from user 2599 * mode or from the idle loop, and if this is not a 2600 * nested interrupt. In this case, the CPU is in 2601 * a quiescent state, so note it. 2602 * 2603 * No memory barrier is required here because both 2604 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local 2605 * variables that other CPUs neither access nor modify, 2606 * at least not while the corresponding CPU is online. 2607 */ 2608 2609 rcu_sched_qs(); 2610 rcu_bh_qs(); 2611 2612 } else if (!in_softirq()) { 2613 2614 /* 2615 * Get here if this CPU did not take its interrupt from 2616 * softirq, in other words, if it is not interrupting 2617 * an rcu_bh read-side critical section. The CPU is therefore 2618 * in a quiescent state for rcu_bh, so note it.
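 *
 * For example (illustrative only; the pointer and helper names
 * here are placeholders, not code from this file), a reader such
 * as:
 *
 *	rcu_read_lock_bh();
 *	p = rcu_dereference_bh(gp);
 *	do_something_with(p);
 *	rcu_read_unlock_bh();
 *
 * runs with softirqs disabled, so an interrupt that observes
 * !in_softirq() cannot have arrived inside such a reader.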
2619 */ 2620 2621 rcu_bh_qs(); 2622 } 2623 rcu_preempt_check_callbacks(); 2624 if (rcu_pending()) 2625 invoke_rcu_core(); 2626 if (user) 2627 rcu_note_voluntary_context_switch(current); 2628 trace_rcu_utilization(TPS("End scheduler-tick")); 2629 } 2630 2631 /* 2632 * Scan the leaf rcu_node structures, processing dyntick state for any that 2633 * have not yet encountered a quiescent state, using the function specified. 2634 * Also initiate boosting for any threads blocked on the root rcu_node. 2635 * 2636 * The caller must have suppressed start of new grace periods. 2637 */ 2638 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *rsp)) 2639 { 2640 int cpu; 2641 unsigned long flags; 2642 unsigned long mask; 2643 struct rcu_node *rnp; 2644 2645 rcu_for_each_leaf_node(rsp, rnp) { 2646 cond_resched_tasks_rcu_qs(); 2647 mask = 0; 2648 raw_spin_lock_irqsave_rcu_node(rnp, flags); 2649 if (rnp->qsmask == 0) { 2650 if (rcu_state_p == &rcu_sched_state || 2651 rsp != rcu_state_p || 2652 rcu_preempt_blocked_readers_cgp(rnp)) { 2653 /* 2654 * No point in scanning bits because they 2655 * are all zero. But we might need to 2656 * priority-boost blocked readers. 2657 */ 2658 rcu_initiate_boost(rnp, flags); 2659 /* rcu_initiate_boost() releases rnp->lock */ 2660 continue; 2661 } 2662 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2663 continue; 2664 } 2665 for_each_leaf_node_possible_cpu(rnp, cpu) { 2666 unsigned long bit = leaf_node_cpu_bit(rnp, cpu); 2667 if ((rnp->qsmask & bit) != 0) { 2668 if (f(per_cpu_ptr(rsp->rda, cpu))) 2669 mask |= bit; 2670 } 2671 } 2672 if (mask != 0) { 2673 /* Idle/offline CPUs, report (releases rnp->lock). */ 2674 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); 2675 } else { 2676 /* Nothing to do here, so just drop the lock. */ 2677 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2678 } 2679 } 2680 } 2681 2682 /* 2683 * Force quiescent states on reluctant CPUs, and also detect which 2684 * CPUs are in dyntick-idle mode. 2685 */ 2686 static void force_quiescent_state(struct rcu_state *rsp) 2687 { 2688 unsigned long flags; 2689 bool ret; 2690 struct rcu_node *rnp; 2691 struct rcu_node *rnp_old = NULL; 2692 2693 /* Funnel through hierarchy to reduce memory contention. */ 2694 rnp = __this_cpu_read(rsp->rda->mynode); 2695 for (; rnp != NULL; rnp = rnp->parent) { 2696 ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || 2697 !raw_spin_trylock(&rnp->fqslock); 2698 if (rnp_old != NULL) 2699 raw_spin_unlock(&rnp_old->fqslock); 2700 if (ret) 2701 return; 2702 rnp_old = rnp; 2703 } 2704 /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ 2705 2706 /* Reached the root of the rcu_node tree, acquire lock. */ 2707 raw_spin_lock_irqsave_rcu_node(rnp_old, flags); 2708 raw_spin_unlock(&rnp_old->fqslock); 2709 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2710 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); 2711 return; /* Someone beat us to it. */ 2712 } 2713 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2714 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); 2715 rcu_gp_kthread_wake(rsp); 2716 } 2717 2718 /* 2719 * This function checks for grace-period requests that fail to motivate 2720 * RCU to come out of its idle mode. 
2721 */ 2722 static void 2723 rcu_check_gp_start_stall(struct rcu_state *rsp, struct rcu_node *rnp, 2724 struct rcu_data *rdp) 2725 { 2726 unsigned long flags; 2727 unsigned long j; 2728 struct rcu_node *rnp_root = rcu_get_root(rsp); 2729 static atomic_t warned = ATOMIC_INIT(0); 2730 2731 if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress(rsp) || 2732 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) 2733 return; 2734 j = jiffies; /* Expensive access, and in common case don't get here. */ 2735 if (time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || 2736 time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || 2737 atomic_read(&warned)) 2738 return; 2739 2740 raw_spin_lock_irqsave_rcu_node(rnp, flags); 2741 j = jiffies; 2742 if (rcu_gp_in_progress(rsp) || 2743 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || 2744 time_before(j, READ_ONCE(rsp->gp_req_activity) + HZ) || 2745 time_before(j, READ_ONCE(rsp->gp_activity) + HZ) || 2746 atomic_read(&warned)) { 2747 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2748 return; 2749 } 2750 /* Hold onto the leaf lock to make others see warned==1. */ 2751 2752 if (rnp_root != rnp) 2753 raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ 2754 j = jiffies; 2755 if (rcu_gp_in_progress(rsp) || 2756 ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || 2757 time_before(j, rsp->gp_req_activity + HZ) || 2758 time_before(j, rsp->gp_activity + HZ) || 2759 atomic_xchg(&warned, 1)) { 2760 raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ 2761 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2762 return; 2763 } 2764 pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x %s->state:%#lx\n", 2765 __func__, (long)READ_ONCE(rsp->gp_seq), 2766 (long)READ_ONCE(rnp_root->gp_seq_needed), 2767 j - rsp->gp_req_activity, j - rsp->gp_activity, 2768 rsp->gp_flags, rsp->name, 2769 rsp->gp_kthread ? rsp->gp_kthread->state : 0x1ffffL); 2770 WARN_ON(1); 2771 if (rnp_root != rnp) 2772 raw_spin_unlock_rcu_node(rnp_root); 2773 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2774 } 2775 2776 /* 2777 * This does the RCU core processing work for the specified rcu_state 2778 * and rcu_data structures. This may be called only from the CPU to 2779 * whom the rdp belongs. 2780 */ 2781 static void 2782 __rcu_process_callbacks(struct rcu_state *rsp) 2783 { 2784 unsigned long flags; 2785 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 2786 struct rcu_node *rnp = rdp->mynode; 2787 2788 WARN_ON_ONCE(!rdp->beenonline); 2789 2790 /* Update RCU state based on any recent quiescent states. */ 2791 rcu_check_quiescent_state(rsp, rdp); 2792 2793 /* No grace period and unregistered callbacks? */ 2794 if (!rcu_gp_in_progress(rsp) && 2795 rcu_segcblist_is_enabled(&rdp->cblist)) { 2796 local_irq_save(flags); 2797 if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) 2798 rcu_accelerate_cbs_unlocked(rsp, rnp, rdp); 2799 local_irq_restore(flags); 2800 } 2801 2802 rcu_check_gp_start_stall(rsp, rnp, rdp); 2803 2804 /* If there are callbacks ready, invoke them. */ 2805 if (rcu_segcblist_ready_cbs(&rdp->cblist)) 2806 invoke_rcu_callbacks(rsp, rdp); 2807 2808 /* Do any needed deferred wakeups of rcuo kthreads. */ 2809 do_nocb_deferred_wakeup(rdp); 2810 } 2811 2812 /* 2813 * Do RCU core processing for the current CPU. 
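 * This is the handler for the RCU_SOFTIRQ softirq, which is raised by
 * invoke_rcu_core() below.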
2814 */ 2815 static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) 2816 { 2817 struct rcu_state *rsp; 2818 2819 if (cpu_is_offline(smp_processor_id())) 2820 return; 2821 trace_rcu_utilization(TPS("Start RCU core")); 2822 for_each_rcu_flavor(rsp) 2823 __rcu_process_callbacks(rsp); 2824 trace_rcu_utilization(TPS("End RCU core")); 2825 } 2826 2827 /* 2828 * Schedule RCU callback invocation. If the specified type of RCU 2829 * does not support RCU priority boosting, just do a direct call, 2830 * otherwise wake up the per-CPU kernel kthread. Note that because we 2831 * are running on the current CPU with softirqs disabled, the 2832 * rcu_cpu_kthread_task cannot disappear out from under us. 2833 */ 2834 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 2835 { 2836 if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) 2837 return; 2838 if (likely(!rsp->boost)) { 2839 rcu_do_batch(rsp, rdp); 2840 return; 2841 } 2842 invoke_rcu_callbacks_kthread(); 2843 } 2844 2845 static void invoke_rcu_core(void) 2846 { 2847 if (cpu_online(smp_processor_id())) 2848 raise_softirq(RCU_SOFTIRQ); 2849 } 2850 2851 /* 2852 * Handle any core-RCU processing required by a call_rcu() invocation. 2853 */ 2854 static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, 2855 struct rcu_head *head, unsigned long flags) 2856 { 2857 /* 2858 * If called from an extended quiescent state, invoke the RCU 2859 * core in order to force a re-evaluation of RCU's idleness. 2860 */ 2861 if (!rcu_is_watching()) 2862 invoke_rcu_core(); 2863 2864 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 2865 if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) 2866 return; 2867 2868 /* 2869 * Force the grace period if too many callbacks or too long waiting. 2870 * Enforce hysteresis, and don't invoke force_quiescent_state() 2871 * if some other CPU has recently done so. Also, don't bother 2872 * invoking force_quiescent_state() if the newly enqueued callback 2873 * is the only one waiting for a grace period to complete. 2874 */ 2875 if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > 2876 rdp->qlen_last_fqs_check + qhimark)) { 2877 2878 /* Are we ignoring a completed grace period? */ 2879 note_gp_changes(rsp, rdp); 2880 2881 /* Start a new grace period if one not already started. */ 2882 if (!rcu_gp_in_progress(rsp)) { 2883 rcu_accelerate_cbs_unlocked(rsp, rdp->mynode, rdp); 2884 } else { 2885 /* Give the grace period a kick. */ 2886 rdp->blimit = LONG_MAX; 2887 if (rsp->n_force_qs == rdp->n_force_qs_snap && 2888 rcu_segcblist_first_pend_cb(&rdp->cblist) != head) 2889 force_quiescent_state(rsp); 2890 rdp->n_force_qs_snap = rsp->n_force_qs; 2891 rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); 2892 } 2893 } 2894 } 2895 2896 /* 2897 * RCU callback function to leak a callback. 2898 */ 2899 static void rcu_leak_callback(struct rcu_head *rhp) 2900 { 2901 } 2902 2903 /* 2904 * Helper function for call_rcu() and friends. The cpu argument will 2905 * normally be -1, indicating "currently running CPU". It may specify 2906 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() 2907 * is expected to specify a CPU. 2908 */ 2909 static void 2910 __call_rcu(struct rcu_head *head, rcu_callback_t func, 2911 struct rcu_state *rsp, int cpu, bool lazy) 2912 { 2913 unsigned long flags; 2914 struct rcu_data *rdp; 2915 2916 /* Misaligned rcu_head! 
*/ 2917 WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); 2918 2919 if (debug_rcu_head_queue(head)) { 2920 /* 2921 * Probable double call_rcu(), so leak the callback. 2922 * Use rcu:rcu_callback trace event to find the previous 2923 * time callback was passed to __call_rcu(). 2924 */ 2925 WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", 2926 head, head->func); 2927 WRITE_ONCE(head->func, rcu_leak_callback); 2928 return; 2929 } 2930 head->func = func; 2931 head->next = NULL; 2932 local_irq_save(flags); 2933 rdp = this_cpu_ptr(rsp->rda); 2934 2935 /* Add the callback to our list. */ 2936 if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { 2937 int offline; 2938 2939 if (cpu != -1) 2940 rdp = per_cpu_ptr(rsp->rda, cpu); 2941 if (likely(rdp->mynode)) { 2942 /* Post-boot, so this should be for a no-CBs CPU. */ 2943 offline = !__call_rcu_nocb(rdp, head, lazy, flags); 2944 WARN_ON_ONCE(offline); 2945 /* Offline CPU, __call_rcu() illegal, leak callback. */ 2946 local_irq_restore(flags); 2947 return; 2948 } 2949 /* 2950 * Very early boot, before rcu_init(). Initialize if needed 2951 * and then drop through to queue the callback. 2952 */ 2953 BUG_ON(cpu != -1); 2954 WARN_ON_ONCE(!rcu_is_watching()); 2955 if (rcu_segcblist_empty(&rdp->cblist)) 2956 rcu_segcblist_init(&rdp->cblist); 2957 } 2958 rcu_segcblist_enqueue(&rdp->cblist, head, lazy); 2959 if (!lazy) 2960 rcu_idle_count_callbacks_posted(); 2961 2962 if (__is_kfree_rcu_offset((unsigned long)func)) 2963 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 2964 rcu_segcblist_n_lazy_cbs(&rdp->cblist), 2965 rcu_segcblist_n_cbs(&rdp->cblist)); 2966 else 2967 trace_rcu_callback(rsp->name, head, 2968 rcu_segcblist_n_lazy_cbs(&rdp->cblist), 2969 rcu_segcblist_n_cbs(&rdp->cblist)); 2970 2971 /* Go handle any RCU core processing required. */ 2972 __call_rcu_core(rsp, rdp, head, flags); 2973 local_irq_restore(flags); 2974 } 2975 2976 /** 2977 * call_rcu_sched() - Queue an RCU callback for invocation after a sched grace period. 2978 * @head: structure to be used for queueing the RCU updates. 2979 * @func: actual callback function to be invoked after the grace period 2980 * 2981 * The callback function will be invoked some time after a full grace 2982 * period elapses, in other words after all currently executing RCU 2983 * read-side critical sections have completed. call_rcu_sched() assumes 2984 * that the read-side critical sections end on enabling of preemption 2985 * or on voluntary preemption. 2986 * RCU read-side critical sections are delimited by: 2987 * 2988 * - rcu_read_lock_sched() and rcu_read_unlock_sched(), OR 2989 * - anything that disables preemption. 2990 * 2991 * These may be nested. 2992 * 2993 * See the description of call_rcu() for more detailed information on 2994 * memory ordering guarantees. 2995 */ 2996 void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) 2997 { 2998 __call_rcu(head, func, &rcu_sched_state, -1, 0); 2999 } 3000 EXPORT_SYMBOL_GPL(call_rcu_sched); 3001 3002 /** 3003 * call_rcu_bh() - Queue an RCU callback for invocation after a quicker grace period. 3004 * @head: structure to be used for queueing the RCU updates. 3005 * @func: actual callback function to be invoked after the grace period 3006 * 3007 * The callback function will be invoked some time after a full grace 3008 * period elapses, in other words after all currently executing RCU 3009 * read-side critical sections have completed.
call_rcu_bh() assumes 3010 * that the read-side critical sections end on completion of a softirq 3011 * handler. This means that read-side critical sections in process 3012 * context must not be interrupted by softirqs. This interface is to be 3013 * used when most of the read-side critical sections are in softirq context. 3014 * RCU read-side critical sections are delimited by: 3015 * 3016 * - rcu_read_lock() and rcu_read_unlock(), if in interrupt context, OR 3017 * - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context. 3018 * 3019 * These may be nested. 3020 * 3021 * See the description of call_rcu() for more detailed information on 3022 * memory ordering guarantees. 3023 */ 3024 void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) 3025 { 3026 __call_rcu(head, func, &rcu_bh_state, -1, 0); 3027 } 3028 EXPORT_SYMBOL_GPL(call_rcu_bh); 3029 3030 /* 3031 * Queue an RCU callback for lazy invocation after a grace period. 3032 * This will likely be later named something like "call_rcu_lazy()", 3033 * but this change will require some way of tagging the lazy RCU 3034 * callbacks in the list of pending callbacks. Until then, this 3035 * function may only be called from __kfree_rcu(). 3036 */ 3037 void kfree_call_rcu(struct rcu_head *head, 3038 rcu_callback_t func) 3039 { 3040 __call_rcu(head, func, rcu_state_p, -1, 1); 3041 } 3042 EXPORT_SYMBOL_GPL(kfree_call_rcu); 3043 3044 /* 3045 * Because a context switch is a grace period for RCU-sched and RCU-bh, 3046 * any blocking grace-period wait automatically implies a grace period 3047 * if there is only one CPU online at any point in time during execution 3048 * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to 3049 * occasionally incorrectly indicate that there are multiple CPUs online 3050 * when there was in fact only one the whole time, as this just adds 3051 * some overhead: RCU still operates correctly. 3052 */ 3053 static inline int rcu_blocking_is_gp(void) 3054 { 3055 int ret; 3056 3057 might_sleep(); /* Check for RCU read-side critical section. */ 3058 preempt_disable(); 3059 ret = num_online_cpus() <= 1; 3060 preempt_enable(); 3061 return ret; 3062 } 3063 3064 /** 3065 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 3066 * 3067 * Control will return to the caller some time after a full rcu-sched 3068 * grace period has elapsed, in other words after all currently executing 3069 * rcu-sched read-side critical sections have completed. These read-side 3070 * critical sections are delimited by rcu_read_lock_sched() and 3071 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), 3072 * local_irq_disable(), and so on may be used in place of 3073 * rcu_read_lock_sched(). 3074 * 3075 * This means that all preempt_disable code sequences, including NMI and 3076 * non-threaded hardware-interrupt handlers, in progress on entry will 3077 * have completed before this primitive returns. However, this does not 3078 * guarantee that softirq handlers will have completed, since in some 3079 * kernels, these handlers can run in process context, and can block. 3080 * 3081 * Note that this guarantee implies further memory-ordering guarantees. 3082 * On systems with more than one CPU, when synchronize_sched() returns, 3083 * each CPU is guaranteed to have executed a full memory barrier since the 3084 * end of its last RCU-sched read-side critical section whose beginning 3085 * preceded the call to synchronize_sched().
In addition, each CPU having 3086 * an RCU read-side critical section that extends beyond the return from 3087 * synchronize_sched() is guaranteed to have executed a full memory barrier 3088 * after the beginning of synchronize_sched() and before the beginning of 3089 * that RCU read-side critical section. Note that these guarantees include 3090 * CPUs that are offline, idle, or executing in user mode, as well as CPUs 3091 * that are executing in the kernel. 3092 * 3093 * Furthermore, if CPU A invoked synchronize_sched(), which returned 3094 * to its caller on CPU B, then both CPU A and CPU B are guaranteed 3095 * to have executed a full memory barrier during the execution of 3096 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but 3097 * again only if the system has more than one CPU). 3098 */ 3099 void synchronize_sched(void) 3100 { 3101 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || 3102 lock_is_held(&rcu_lock_map) || 3103 lock_is_held(&rcu_sched_lock_map), 3104 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 3105 if (rcu_blocking_is_gp()) 3106 return; 3107 if (rcu_gp_is_expedited()) 3108 synchronize_sched_expedited(); 3109 else 3110 wait_rcu_gp(call_rcu_sched); 3111 } 3112 EXPORT_SYMBOL_GPL(synchronize_sched); 3113 3114 /** 3115 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. 3116 * 3117 * Control will return to the caller some time after a full rcu_bh grace 3118 * period has elapsed, in other words after all currently executing rcu_bh 3119 * read-side critical sections have completed. RCU read-side critical 3120 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 3121 * and may be nested. 3122 * 3123 * See the description of synchronize_sched() for more detailed information 3124 * on memory ordering guarantees. 3125 */ 3126 void synchronize_rcu_bh(void) 3127 { 3128 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || 3129 lock_is_held(&rcu_lock_map) || 3130 lock_is_held(&rcu_sched_lock_map), 3131 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 3132 if (rcu_blocking_is_gp()) 3133 return; 3134 if (rcu_gp_is_expedited()) 3135 synchronize_rcu_bh_expedited(); 3136 else 3137 wait_rcu_gp(call_rcu_bh); 3138 } 3139 EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 3140 3141 /** 3142 * get_state_synchronize_rcu - Snapshot current RCU state 3143 * 3144 * Returns a cookie that is used by a later call to cond_synchronize_rcu() 3145 * to determine whether or not a full grace period has elapsed in the 3146 * meantime. 3147 */ 3148 unsigned long get_state_synchronize_rcu(void) 3149 { 3150 /* 3151 * Any prior manipulation of RCU-protected data must happen 3152 * before the load from ->gp_seq. 3153 */ 3154 smp_mb(); /* ^^^ */ 3155 return rcu_seq_snap(&rcu_state_p->gp_seq); 3156 } 3157 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); 3158 3159 /** 3160 * cond_synchronize_rcu - Conditionally wait for an RCU grace period 3161 * 3162 * @oldstate: return value from earlier call to get_state_synchronize_rcu() 3163 * 3164 * If a full RCU grace period has elapsed since the earlier call to 3165 * get_state_synchronize_rcu(), just return. Otherwise, invoke 3166 * synchronize_rcu() to wait for a full grace period. 3167 * 3168 * Yes, this function does not take counter wrap into account. But 3169 * counter wrap is harmless. If the counter wraps, we have waited for 3170 * more than 2 billion grace periods (and way more on a 64-bit system!), 3171 * so waiting for one additional grace period should be just fine. 
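 *
 * An illustrative usage sketch (the removal, other-work, and free steps
 * are placeholders, not functions from this file):
 *
 *	remove_item_from_rcu_protected_list(p);
 *	cookie = get_state_synchronize_rcu();
 *	do_other_work();
 *	cond_synchronize_rcu(cookie);
 *	kfree(p);
 *
 * The final cond_synchronize_rcu() blocks only if a full grace period
 * has not already elapsed since the cookie was obtained.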
3172 */ 3173 void cond_synchronize_rcu(unsigned long oldstate) 3174 { 3175 if (!rcu_seq_done(&rcu_state_p->gp_seq, oldstate)) 3176 synchronize_rcu(); 3177 else 3178 smp_mb(); /* Ensure GP ends before subsequent accesses. */ 3179 } 3180 EXPORT_SYMBOL_GPL(cond_synchronize_rcu); 3181 3182 /** 3183 * get_state_synchronize_sched - Snapshot current RCU-sched state 3184 * 3185 * Returns a cookie that is used by a later call to cond_synchronize_sched() 3186 * to determine whether or not a full grace period has elapsed in the 3187 * meantime. 3188 */ 3189 unsigned long get_state_synchronize_sched(void) 3190 { 3191 /* 3192 * Any prior manipulation of RCU-protected data must happen 3193 * before the load from ->gp_seq. 3194 */ 3195 smp_mb(); /* ^^^ */ 3196 return rcu_seq_snap(&rcu_sched_state.gp_seq); 3197 } 3198 EXPORT_SYMBOL_GPL(get_state_synchronize_sched); 3199 3200 /** 3201 * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period 3202 * 3203 * @oldstate: return value from earlier call to get_state_synchronize_sched() 3204 * 3205 * If a full RCU-sched grace period has elapsed since the earlier call to 3206 * get_state_synchronize_sched(), just return. Otherwise, invoke 3207 * synchronize_sched() to wait for a full grace period. 3208 * 3209 * Yes, this function does not take counter wrap into account. But 3210 * counter wrap is harmless. If the counter wraps, we have waited for 3211 * more than 2 billion grace periods (and way more on a 64-bit system!), 3212 * so waiting for one additional grace period should be just fine. 3213 */ 3214 void cond_synchronize_sched(unsigned long oldstate) 3215 { 3216 if (!rcu_seq_done(&rcu_sched_state.gp_seq, oldstate)) 3217 synchronize_sched(); 3218 else 3219 smp_mb(); /* Ensure GP ends before subsequent accesses. */ 3220 } 3221 EXPORT_SYMBOL_GPL(cond_synchronize_sched); 3222 3223 /* 3224 * Check to see if there is any immediate RCU-related work to be done 3225 * by the current CPU, for the specified type of RCU, returning 1 if so. 3226 * The checks are in order of increasing expense: checks that can be 3227 * carried out against CPU-local state are performed first. However, 3228 * we must check for CPU stalls first, else we might not get a chance. 3229 */ 3230 static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) 3231 { 3232 struct rcu_node *rnp = rdp->mynode; 3233 3234 /* Check for CPU stalls, if enabled. */ 3235 check_cpu_stall(rsp, rdp); 3236 3237 /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ 3238 if (rcu_nohz_full_cpu(rsp)) 3239 return 0; 3240 3241 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3242 if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) 3243 return 1; 3244 3245 /* Does this CPU have callbacks ready to invoke? */ 3246 if (rcu_segcblist_ready_cbs(&rdp->cblist)) 3247 return 1; 3248 3249 /* Has RCU gone idle with this CPU needing another grace period? */ 3250 if (!rcu_gp_in_progress(rsp) && 3251 rcu_segcblist_is_enabled(&rdp->cblist) && 3252 !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) 3253 return 1; 3254 3255 /* Have RCU grace period completed or started? */ 3256 if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq || 3257 unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ 3258 return 1; 3259 3260 /* Does this CPU need a deferred NOCB wakeup? 
*/ 3261 if (rcu_nocb_need_deferred_wakeup(rdp)) 3262 return 1; 3263 3264 /* nothing to do */ 3265 return 0; 3266 } 3267 3268 /* 3269 * Check to see if there is any immediate RCU-related work to be done 3270 * by the current CPU, returning 1 if so. This function is part of the 3271 * RCU implementation; it is -not- an exported member of the RCU API. 3272 */ 3273 static int rcu_pending(void) 3274 { 3275 struct rcu_state *rsp; 3276 3277 for_each_rcu_flavor(rsp) 3278 if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda))) 3279 return 1; 3280 return 0; 3281 } 3282 3283 /* 3284 * Return true if the specified CPU has any callback. If all_lazy is 3285 * non-NULL, store an indication of whether all callbacks are lazy. 3286 * (If there are no callbacks, all of them are deemed to be lazy.) 3287 */ 3288 static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) 3289 { 3290 bool al = true; 3291 bool hc = false; 3292 struct rcu_data *rdp; 3293 struct rcu_state *rsp; 3294 3295 for_each_rcu_flavor(rsp) { 3296 rdp = this_cpu_ptr(rsp->rda); 3297 if (rcu_segcblist_empty(&rdp->cblist)) 3298 continue; 3299 hc = true; 3300 if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) { 3301 al = false; 3302 break; 3303 } 3304 } 3305 if (all_lazy) 3306 *all_lazy = al; 3307 return hc; 3308 } 3309 3310 /* 3311 * Helper function for _rcu_barrier() tracing. If tracing is disabled, 3312 * the compiler is expected to optimize this away. 3313 */ 3314 static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, 3315 int cpu, unsigned long done) 3316 { 3317 trace_rcu_barrier(rsp->name, s, cpu, 3318 atomic_read(&rsp->barrier_cpu_count), done); 3319 } 3320 3321 /* 3322 * RCU callback function for _rcu_barrier(). If we are last, wake 3323 * up the task executing _rcu_barrier(). 3324 */ 3325 static void rcu_barrier_callback(struct rcu_head *rhp) 3326 { 3327 struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); 3328 struct rcu_state *rsp = rdp->rsp; 3329 3330 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { 3331 _rcu_barrier_trace(rsp, TPS("LastCB"), -1, 3332 rsp->barrier_sequence); 3333 complete(&rsp->barrier_completion); 3334 } else { 3335 _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence); 3336 } 3337 } 3338 3339 /* 3340 * Called with preemption disabled, and from cross-cpu IRQ context. 3341 */ 3342 static void rcu_barrier_func(void *type) 3343 { 3344 struct rcu_state *rsp = type; 3345 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 3346 3347 _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence); 3348 rdp->barrier_head.func = rcu_barrier_callback; 3349 debug_rcu_head_queue(&rdp->barrier_head); 3350 if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { 3351 atomic_inc(&rsp->barrier_cpu_count); 3352 } else { 3353 debug_rcu_head_unqueue(&rdp->barrier_head); 3354 _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1, 3355 rsp->barrier_sequence); 3356 } 3357 } 3358 3359 /* 3360 * Orchestrate the specified type of RCU barrier, waiting for all 3361 * RCU callbacks of the specified type to complete. 3362 */ 3363 static void _rcu_barrier(struct rcu_state *rsp) 3364 { 3365 int cpu; 3366 struct rcu_data *rdp; 3367 unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); 3368 3369 _rcu_barrier_trace(rsp, TPS("Begin"), -1, s); 3370 3371 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 3372 mutex_lock(&rsp->barrier_mutex); 3373 3374 /* Did someone else do our work for us? 
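 * (If ->barrier_sequence has already advanced past our snapshot s,
 * then a full rcu_barrier() operation began after that snapshot was
 * taken and has since completed, so all callbacks queued before this
 * request have already been invoked and we can return immediately.)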
*/ 3375 if (rcu_seq_done(&rsp->barrier_sequence, s)) { 3376 _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1, 3377 rsp->barrier_sequence); 3378 smp_mb(); /* caller's subsequent code after above check. */ 3379 mutex_unlock(&rsp->barrier_mutex); 3380 return; 3381 } 3382 3383 /* Mark the start of the barrier operation. */ 3384 rcu_seq_start(&rsp->barrier_sequence); 3385 _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence); 3386 3387 /* 3388 * Initialize the count to one rather than to zero in order to 3389 * avoid a too-soon return to zero in case of a short grace period 3390 * (or preemption of this task). Exclude CPU-hotplug operations 3391 * to ensure that no offline CPU has callbacks queued. 3392 */ 3393 init_completion(&rsp->barrier_completion); 3394 atomic_set(&rsp->barrier_cpu_count, 1); 3395 get_online_cpus(); 3396 3397 /* 3398 * Force each CPU with callbacks to register a new callback. 3399 * When that callback is invoked, we will know that all of the 3400 * corresponding CPU's preceding callbacks have been invoked. 3401 */ 3402 for_each_possible_cpu(cpu) { 3403 if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) 3404 continue; 3405 rdp = per_cpu_ptr(rsp->rda, cpu); 3406 if (rcu_is_nocb_cpu(cpu)) { 3407 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { 3408 _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu, 3409 rsp->barrier_sequence); 3410 } else { 3411 _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu, 3412 rsp->barrier_sequence); 3413 smp_mb__before_atomic(); 3414 atomic_inc(&rsp->barrier_cpu_count); 3415 __call_rcu(&rdp->barrier_head, 3416 rcu_barrier_callback, rsp, cpu, 0); 3417 } 3418 } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { 3419 _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu, 3420 rsp->barrier_sequence); 3421 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 3422 } else { 3423 _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu, 3424 rsp->barrier_sequence); 3425 } 3426 } 3427 put_online_cpus(); 3428 3429 /* 3430 * Now that we have an rcu_barrier_callback() callback on each 3431 * CPU, and thus each counted, remove the initial count. 3432 */ 3433 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) 3434 complete(&rsp->barrier_completion); 3435 3436 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ 3437 wait_for_completion(&rsp->barrier_completion); 3438 3439 /* Mark the end of the barrier operation. */ 3440 _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence); 3441 rcu_seq_end(&rsp->barrier_sequence); 3442 3443 /* Other rcu_barrier() invocations can now safely proceed. */ 3444 mutex_unlock(&rsp->barrier_mutex); 3445 } 3446 3447 /** 3448 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. 3449 */ 3450 void rcu_barrier_bh(void) 3451 { 3452 _rcu_barrier(&rcu_bh_state); 3453 } 3454 EXPORT_SYMBOL_GPL(rcu_barrier_bh); 3455 3456 /** 3457 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. 3458 */ 3459 void rcu_barrier_sched(void) 3460 { 3461 _rcu_barrier(&rcu_sched_state); 3462 } 3463 EXPORT_SYMBOL_GPL(rcu_barrier_sched); 3464 3465 /* 3466 * Propagate ->qsmaskinit bits up the rcu_node tree to account for the 3467 * first CPU in a given leaf rcu_node structure coming online. The caller 3468 * must hold the corresponding leaf rcu_node ->lock with interrupts 3469 * disabled.
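 *
 * Propagation stops at the first ancestor rcu_node structure whose
 * ->qsmaskinit was already nonzero, because the levels above that
 * ancestor already know about this subtree.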
3470 */ 3471 static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) 3472 { 3473 long mask; 3474 long oldmask; 3475 struct rcu_node *rnp = rnp_leaf; 3476 3477 raw_lockdep_assert_held_rcu_node(rnp_leaf); 3478 WARN_ON_ONCE(rnp->wait_blkd_tasks); 3479 for (;;) { 3480 mask = rnp->grpmask; 3481 rnp = rnp->parent; 3482 if (rnp == NULL) 3483 return; 3484 raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ 3485 oldmask = rnp->qsmaskinit; 3486 rnp->qsmaskinit |= mask; 3487 raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */ 3488 if (oldmask) 3489 return; 3490 } 3491 } 3492 3493 /* 3494 * Do boot-time initialization of a CPU's per-CPU RCU data. 3495 */ 3496 static void __init 3497 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) 3498 { 3499 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3500 3501 /* Set up local state, ensuring consistent view of global state. */ 3502 rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); 3503 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 3504 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); 3505 WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); 3506 rdp->rcu_ofl_gp_seq = rsp->gp_seq; 3507 rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; 3508 rdp->rcu_onl_gp_seq = rsp->gp_seq; 3509 rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; 3510 rdp->cpu = cpu; 3511 rdp->rsp = rsp; 3512 rcu_boot_init_nocb_percpu_data(rdp); 3513 } 3514 3515 /* 3516 * Initialize a CPU's per-CPU RCU data. Note that only one online or 3517 * offline event can be happening at a given time. Note also that we can 3518 * accept some slop in the rsp->gp_seq access due to the fact that this 3519 * CPU cannot possibly have any RCU callbacks in flight yet. 3520 */ 3521 static void 3522 rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 3523 { 3524 unsigned long flags; 3525 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3526 struct rcu_node *rnp = rcu_get_root(rsp); 3527 3528 /* Set up local state, ensuring consistent view of global state. */ 3529 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3530 rdp->qlen_last_fqs_check = 0; 3531 rdp->n_force_qs_snap = rsp->n_force_qs; 3532 rdp->blimit = blimit; 3533 if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ 3534 !init_nocb_callback_list(rdp)) 3535 rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ 3536 rdp->dynticks->dynticks_nesting = 1; /* CPU not up, no tearing. */ 3537 rcu_dynticks_eqs_online(); 3538 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ 3539 3540 /* 3541 * Add CPU to leaf rcu_node pending-online bitmask. Any needed 3542 * propagation up the rcu_node tree will happen at the beginning 3543 * of the next grace period. 3544 */ 3545 rnp = rdp->mynode; 3546 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 3547 rdp->beenonline = true; /* We have now been online. */ 3548 rdp->gp_seq = rnp->gp_seq; 3549 rdp->gp_seq_needed = rnp->gp_seq; 3550 rdp->cpu_no_qs.b.norm = true; 3551 rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); 3552 rdp->core_needs_qs = false; 3553 rdp->rcu_iw_pending = false; 3554 rdp->rcu_iw_gp_seq = rnp->gp_seq - 1; 3555 trace_rcu_grace_period(rsp->name, rdp->gp_seq, TPS("cpuonl")); 3556 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3557 } 3558 3559 /* 3560 * Invoked early in the CPU-online process, when pretty much all 3561 * services are available. The incoming CPU is not present. 
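 * Because the incoming CPU is not yet running, its per-CPU rcu_data
 * can be set up here without synchronizing against that CPU itself.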
3562 */ 3563 int rcutree_prepare_cpu(unsigned int cpu) 3564 { 3565 struct rcu_state *rsp; 3566 3567 for_each_rcu_flavor(rsp) 3568 rcu_init_percpu_data(cpu, rsp); 3569 3570 rcu_prepare_kthreads(cpu); 3571 rcu_spawn_all_nocb_kthreads(cpu); 3572 3573 return 0; 3574 } 3575 3576 /* 3577 * Update RCU priority boot kthread affinity for CPU-hotplug changes. 3578 */ 3579 static void rcutree_affinity_setting(unsigned int cpu, int outgoing) 3580 { 3581 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); 3582 3583 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); 3584 } 3585 3586 /* 3587 * Near the end of the CPU-online process. Pretty much all services 3588 * enabled, and the CPU is now very much alive. 3589 */ 3590 int rcutree_online_cpu(unsigned int cpu) 3591 { 3592 unsigned long flags; 3593 struct rcu_data *rdp; 3594 struct rcu_node *rnp; 3595 struct rcu_state *rsp; 3596 3597 for_each_rcu_flavor(rsp) { 3598 rdp = per_cpu_ptr(rsp->rda, cpu); 3599 rnp = rdp->mynode; 3600 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3601 rnp->ffmask |= rdp->grpmask; 3602 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3603 } 3604 if (IS_ENABLED(CONFIG_TREE_SRCU)) 3605 srcu_online_cpu(cpu); 3606 if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) 3607 return 0; /* Too early in boot for scheduler work. */ 3608 sync_sched_exp_online_cleanup(cpu); 3609 rcutree_affinity_setting(cpu, -1); 3610 return 0; 3611 } 3612 3613 /* 3614 * Near the beginning of the process. The CPU is still very much alive 3615 * with pretty much all services enabled. 3616 */ 3617 int rcutree_offline_cpu(unsigned int cpu) 3618 { 3619 unsigned long flags; 3620 struct rcu_data *rdp; 3621 struct rcu_node *rnp; 3622 struct rcu_state *rsp; 3623 3624 for_each_rcu_flavor(rsp) { 3625 rdp = per_cpu_ptr(rsp->rda, cpu); 3626 rnp = rdp->mynode; 3627 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3628 rnp->ffmask &= ~rdp->grpmask; 3629 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3630 } 3631 3632 rcutree_affinity_setting(cpu, cpu); 3633 if (IS_ENABLED(CONFIG_TREE_SRCU)) 3634 srcu_offline_cpu(cpu); 3635 return 0; 3636 } 3637 3638 /* 3639 * Near the end of the offline process. We do only tracing here. 3640 */ 3641 int rcutree_dying_cpu(unsigned int cpu) 3642 { 3643 struct rcu_state *rsp; 3644 3645 for_each_rcu_flavor(rsp) 3646 rcu_cleanup_dying_cpu(rsp); 3647 return 0; 3648 } 3649 3650 /* 3651 * The outgoing CPU is gone and we are running elsewhere. 3652 */ 3653 int rcutree_dead_cpu(unsigned int cpu) 3654 { 3655 struct rcu_state *rsp; 3656 3657 for_each_rcu_flavor(rsp) { 3658 rcu_cleanup_dead_cpu(cpu, rsp); 3659 do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); 3660 } 3661 return 0; 3662 } 3663 3664 static DEFINE_PER_CPU(int, rcu_cpu_started); 3665 3666 /* 3667 * Mark the specified CPU as being online so that subsequent grace periods 3668 * (both expedited and normal) will wait on it. Note that this means that 3669 * incoming CPUs are not allowed to use RCU read-side critical sections 3670 * until this function is called. Failing to observe this restriction 3671 * will result in lockdep splats. 3672 * 3673 * Note that this function is special in that it is invoked directly 3674 * from the incoming CPU rather than from the cpuhp_step mechanism. 3675 * This is because this function must be invoked at a precise location. 
3676 */ 3677 void rcu_cpu_starting(unsigned int cpu) 3678 { 3679 unsigned long flags; 3680 unsigned long mask; 3681 int nbits; 3682 unsigned long oldmask; 3683 struct rcu_data *rdp; 3684 struct rcu_node *rnp; 3685 struct rcu_state *rsp; 3686 3687 if (per_cpu(rcu_cpu_started, cpu)) 3688 return; 3689 3690 per_cpu(rcu_cpu_started, cpu) = 1; 3691 3692 for_each_rcu_flavor(rsp) { 3693 rdp = per_cpu_ptr(rsp->rda, cpu); 3694 rnp = rdp->mynode; 3695 mask = rdp->grpmask; 3696 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3697 rnp->qsmaskinitnext |= mask; 3698 oldmask = rnp->expmaskinitnext; 3699 rnp->expmaskinitnext |= mask; 3700 oldmask ^= rnp->expmaskinitnext; 3701 nbits = bitmap_weight(&oldmask, BITS_PER_LONG); 3702 /* Allow lockless access for expedited grace periods. */ 3703 smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ 3704 rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ 3705 rdp->rcu_onl_gp_seq = READ_ONCE(rsp->gp_seq); 3706 rdp->rcu_onl_gp_flags = READ_ONCE(rsp->gp_flags); 3707 if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ 3708 /* Report QS -after- changing ->qsmaskinitnext! */ 3709 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); 3710 } else { 3711 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3712 } 3713 } 3714 smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ 3715 } 3716 3717 #ifdef CONFIG_HOTPLUG_CPU 3718 /* 3719 * The CPU is exiting the idle loop into the arch_cpu_idle_dead() 3720 * function. We now remove it from the rcu_node tree's ->qsmaskinitnext 3721 * bit masks. 3722 */ 3723 static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) 3724 { 3725 unsigned long flags; 3726 unsigned long mask; 3727 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3728 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 3729 3730 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ 3731 mask = rdp->grpmask; 3732 spin_lock(&rsp->ofl_lock); 3733 raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ 3734 rdp->rcu_ofl_gp_seq = READ_ONCE(rsp->gp_seq); 3735 rdp->rcu_ofl_gp_flags = READ_ONCE(rsp->gp_flags); 3736 if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ 3737 /* Report quiescent state -before- changing ->qsmaskinitnext! */ 3738 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gp_seq, flags); 3739 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3740 } 3741 rnp->qsmaskinitnext &= ~mask; 3742 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3743 spin_unlock(&rsp->ofl_lock); 3744 } 3745 3746 /* 3747 * The outgoing CPU has no further need of RCU, so remove it from 3748 * the list of CPUs that RCU must track. 3749 * 3750 * Note that this function is special in that it is invoked directly 3751 * from the outgoing CPU rather than from the cpuhp_step mechanism. 3752 * This is because this function must be invoked at a precise location. 3753 */ 3754 void rcu_report_dead(unsigned int cpu) 3755 { 3756 struct rcu_state *rsp; 3757 3758 /* QS for any half-done expedited RCU-sched GP. */ 3759 preempt_disable(); 3760 rcu_report_exp_rdp(&rcu_sched_state, 3761 this_cpu_ptr(rcu_sched_state.rda), true); 3762 preempt_enable(); 3763 for_each_rcu_flavor(rsp) 3764 rcu_cleanup_dying_idle_cpu(cpu, rsp); 3765 3766 per_cpu(rcu_cpu_started, cpu) = 0; 3767 } 3768 3769 /* Migrate the dead CPU's callbacks to the current CPU.
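 * The dead CPU's callbacks are merged into this CPU's segmented
 * callback list, so they are still invoked once the relevant grace
 * periods elapse; nothing queued on the departed CPU is lost.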
*/ 3770 static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) 3771 { 3772 unsigned long flags; 3773 struct rcu_data *my_rdp; 3774 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3775 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); 3776 bool needwake; 3777 3778 if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) 3779 return; /* No callbacks to migrate. */ 3780 3781 local_irq_save(flags); 3782 my_rdp = this_cpu_ptr(rsp->rda); 3783 if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { 3784 local_irq_restore(flags); 3785 return; 3786 } 3787 raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ 3788 /* Leverage recent GPs and set GP for new callbacks. */ 3789 needwake = rcu_advance_cbs(rsp, rnp_root, rdp) || 3790 rcu_advance_cbs(rsp, rnp_root, my_rdp); 3791 rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); 3792 WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != 3793 !rcu_segcblist_n_cbs(&my_rdp->cblist)); 3794 raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); 3795 if (needwake) 3796 rcu_gp_kthread_wake(rsp); 3797 WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || 3798 !rcu_segcblist_empty(&rdp->cblist), 3799 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", 3800 cpu, rcu_segcblist_n_cbs(&rdp->cblist), 3801 rcu_segcblist_first_cb(&rdp->cblist)); 3802 } 3803 3804 /* 3805 * The outgoing CPU has just passed through the dying-idle state, 3806 * and we are being invoked from the CPU that was IPIed to continue the 3807 * offline operation. We need to migrate the outgoing CPU's callbacks. 3808 */ 3809 void rcutree_migrate_callbacks(int cpu) 3810 { 3811 struct rcu_state *rsp; 3812 3813 for_each_rcu_flavor(rsp) 3814 rcu_migrate_callbacks(cpu, rsp); 3815 } 3816 #endif 3817 3818 /* 3819 * On non-huge systems, use expedited RCU grace periods to make suspend 3820 * and hibernation run faster. 3821 */ 3822 static int rcu_pm_notify(struct notifier_block *self, 3823 unsigned long action, void *hcpu) 3824 { 3825 switch (action) { 3826 case PM_HIBERNATION_PREPARE: 3827 case PM_SUSPEND_PREPARE: 3828 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ 3829 rcu_expedite_gp(); 3830 break; 3831 case PM_POST_HIBERNATION: 3832 case PM_POST_SUSPEND: 3833 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ 3834 rcu_unexpedite_gp(); 3835 break; 3836 default: 3837 break; 3838 } 3839 return NOTIFY_OK; 3840 } 3841 3842 /* 3843 * Spawn the kthreads that handle each RCU flavor's grace periods. 3844 */ 3845 static int __init rcu_spawn_gp_kthread(void) 3846 { 3847 unsigned long flags; 3848 int kthread_prio_in = kthread_prio; 3849 struct rcu_node *rnp; 3850 struct rcu_state *rsp; 3851 struct sched_param sp; 3852 struct task_struct *t; 3853 3854 /* Force priority into range. 
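 * (SCHED_FIFO priorities run from 1 to 99. A kthread_prio of zero
 * leaves the grace-period kthreads at normal non-realtime priority,
 * while RCU priority boosting requires at least priority 1.)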
*/ 3855 if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) 3856 kthread_prio = 1; 3857 else if (kthread_prio < 0) 3858 kthread_prio = 0; 3859 else if (kthread_prio > 99) 3860 kthread_prio = 99; 3861 if (kthread_prio != kthread_prio_in) 3862 pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", 3863 kthread_prio, kthread_prio_in); 3864 3865 rcu_scheduler_fully_active = 1; 3866 for_each_rcu_flavor(rsp) { 3867 t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); 3868 BUG_ON(IS_ERR(t)); 3869 rnp = rcu_get_root(rsp); 3870 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3871 rsp->gp_kthread = t; 3872 if (kthread_prio) { 3873 sp.sched_priority = kthread_prio; 3874 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 3875 } 3876 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3877 wake_up_process(t); 3878 } 3879 rcu_spawn_nocb_kthreads(); 3880 rcu_spawn_boost_kthreads(); 3881 return 0; 3882 } 3883 early_initcall(rcu_spawn_gp_kthread); 3884 3885 /* 3886 * This function is invoked towards the end of the scheduler's 3887 * initialization process. Before this is called, the idle task might 3888 * contain synchronous grace-period primitives (during which time, this idle 3889 * task is booting the system, and such primitives are no-ops). After this 3890 * function is called, any synchronous grace-period primitives are run as 3891 * expedited, with the requesting task driving the grace period forward. 3892 * A later core_initcall() rcu_set_runtime_mode() will switch to full 3893 * runtime RCU functionality. 3894 */ 3895 void rcu_scheduler_starting(void) 3896 { 3897 WARN_ON(num_online_cpus() != 1); 3898 WARN_ON(nr_context_switches() > 0); 3899 rcu_test_sync_prims(); 3900 rcu_scheduler_active = RCU_SCHEDULER_INIT; 3901 rcu_test_sync_prims(); 3902 } 3903 3904 /* 3905 * Helper function for rcu_init() that initializes one rcu_state structure. 3906 */ 3907 static void __init rcu_init_one(struct rcu_state *rsp) 3908 { 3909 static const char * const buf[] = RCU_NODE_NAME_INIT; 3910 static const char * const fqs[] = RCU_FQS_NAME_INIT; 3911 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 3912 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 3913 3914 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 3915 int cpustride = 1; 3916 int i; 3917 int j; 3918 struct rcu_node *rnp; 3919 3920 BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ 3921 3922 /* Silence gcc 4.8 false positive about array index out of range. */ 3923 if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) 3924 panic("rcu_init_one: rcu_num_lvls out of range"); 3925 3926 /* Initialize the level-tracking arrays. */ 3927 3928 for (i = 1; i < rcu_num_lvls; i++) 3929 rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1]; 3930 rcu_init_levelspread(levelspread, num_rcu_lvl); 3931 3932 /* Initialize the elements themselves, starting from the leaves. 
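 * Each rcu_node at a given level covers up to cpustride CPUs, recorded
 * as the inclusive range [->grplo, ->grphi], and ->grpmask is that
 * node's bit within its parent's bit masks.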
*/ 3933 3934 for (i = rcu_num_lvls - 1; i >= 0; i--) { 3935 cpustride *= levelspread[i]; 3936 rnp = rsp->level[i]; 3937 for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) { 3938 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); 3939 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), 3940 &rcu_node_class[i], buf[i]); 3941 raw_spin_lock_init(&rnp->fqslock); 3942 lockdep_set_class_and_name(&rnp->fqslock, 3943 &rcu_fqs_class[i], fqs[i]); 3944 rnp->gp_seq = rsp->gp_seq; 3945 rnp->gp_seq_needed = rsp->gp_seq; 3946 rnp->completedqs = rsp->gp_seq; 3947 rnp->qsmask = 0; 3948 rnp->qsmaskinit = 0; 3949 rnp->grplo = j * cpustride; 3950 rnp->grphi = (j + 1) * cpustride - 1; 3951 if (rnp->grphi >= nr_cpu_ids) 3952 rnp->grphi = nr_cpu_ids - 1; 3953 if (i == 0) { 3954 rnp->grpnum = 0; 3955 rnp->grpmask = 0; 3956 rnp->parent = NULL; 3957 } else { 3958 rnp->grpnum = j % levelspread[i - 1]; 3959 rnp->grpmask = 1UL << rnp->grpnum; 3960 rnp->parent = rsp->level[i - 1] + 3961 j / levelspread[i - 1]; 3962 } 3963 rnp->level = i; 3964 INIT_LIST_HEAD(&rnp->blkd_tasks); 3965 rcu_init_one_nocb(rnp); 3966 init_waitqueue_head(&rnp->exp_wq[0]); 3967 init_waitqueue_head(&rnp->exp_wq[1]); 3968 init_waitqueue_head(&rnp->exp_wq[2]); 3969 init_waitqueue_head(&rnp->exp_wq[3]); 3970 spin_lock_init(&rnp->exp_lock); 3971 } 3972 } 3973 3974 init_swait_queue_head(&rsp->gp_wq); 3975 init_swait_queue_head(&rsp->expedited_wq); 3976 rnp = rcu_first_leaf_node(rsp); 3977 for_each_possible_cpu(i) { 3978 while (i > rnp->grphi) 3979 rnp++; 3980 per_cpu_ptr(rsp->rda, i)->mynode = rnp; 3981 rcu_boot_init_percpu_data(i, rsp); 3982 } 3983 list_add(&rsp->flavors, &rcu_struct_flavors); 3984 } 3985 3986 /* 3987 * Compute the rcu_node tree geometry from kernel parameters. This cannot 3988 * replace the definitions in tree.h because those are needed to size 3989 * the ->node array in the rcu_state structure. 3990 */ 3991 static void __init rcu_init_geometry(void) 3992 { 3993 ulong d; 3994 int i; 3995 int rcu_capacity[RCU_NUM_LVLS]; 3996 3997 /* 3998 * Initialize any unspecified boot parameters. 3999 * The default values of jiffies_till_first_fqs and 4000 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS 4001 * value, which is a function of HZ, then adding one for each 4002 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. 4003 */ 4004 d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; 4005 if (jiffies_till_first_fqs == ULONG_MAX) 4006 jiffies_till_first_fqs = d; 4007 if (jiffies_till_next_fqs == ULONG_MAX) 4008 jiffies_till_next_fqs = d; 4009 4010 /* If the compile-time values are accurate, just leave. */ 4011 if (rcu_fanout_leaf == RCU_FANOUT_LEAF && 4012 nr_cpu_ids == NR_CPUS) 4013 return; 4014 pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", 4015 rcu_fanout_leaf, nr_cpu_ids); 4016 4017 /* 4018 * The boot-time rcu_fanout_leaf parameter must be at least two 4019 * and cannot exceed the number of bits in the rcu_node masks. 4020 * Complain and fall back to the compile-time values if this 4021 * limit is exceeded. 4022 */ 4023 if (rcu_fanout_leaf < 2 || 4024 rcu_fanout_leaf > sizeof(unsigned long) * 8) { 4025 rcu_fanout_leaf = RCU_FANOUT_LEAF; 4026 WARN_ON(1); 4027 return; 4028 } 4029 4030 /* 4031 * Compute the number of nodes that can be handled by an rcu_node tree 4032 * with the given number of levels.
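 *
 * For example (illustration only, assuming rcu_fanout_leaf == 16 and
 * RCU_FANOUT == 64), rcu_capacity[] works out to { 16, 1024, 65536, ... },
 * so a three-level tree could accommodate up to 65536 CPUs.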
4033 */ 4034 rcu_capacity[0] = rcu_fanout_leaf; 4035 for (i = 1; i < RCU_NUM_LVLS; i++) 4036 rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; 4037 4038 /* 4039 * The tree must be able to accommodate the configured number of CPUs. 4040 * If this limit is exceeded, fall back to the compile-time values. 4041 */ 4042 if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) { 4043 rcu_fanout_leaf = RCU_FANOUT_LEAF; 4044 WARN_ON(1); 4045 return; 4046 } 4047 4048 /* Calculate the number of levels in the tree. */ 4049 for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { 4050 } 4051 rcu_num_lvls = i + 1; 4052 4053 /* Calculate the number of rcu_nodes at each level of the tree. */ 4054 for (i = 0; i < rcu_num_lvls; i++) { 4055 int cap = rcu_capacity[(rcu_num_lvls - 1) - i]; 4056 num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap); 4057 } 4058 4059 /* Calculate the total number of rcu_node structures. */ 4060 rcu_num_nodes = 0; 4061 for (i = 0; i < rcu_num_lvls; i++) 4062 rcu_num_nodes += num_rcu_lvl[i]; 4063 } 4064 4065 /* 4066 * Dump out the structure of the rcu_node combining tree associated 4067 * with the rcu_state structure referenced by rsp. 4068 */ 4069 static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) 4070 { 4071 int level = 0; 4072 struct rcu_node *rnp; 4073 4074 pr_info("rcu_node tree layout dump\n"); 4075 pr_info(" "); 4076 rcu_for_each_node_breadth_first(rsp, rnp) { 4077 if (rnp->level != level) { 4078 pr_cont("\n"); 4079 pr_info(" "); 4080 level = rnp->level; 4081 } 4082 pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum); 4083 } 4084 pr_cont("\n"); 4085 } 4086 4087 struct workqueue_struct *rcu_gp_wq; 4088 struct workqueue_struct *rcu_par_gp_wq; 4089 4090 void __init rcu_init(void) 4091 { 4092 int cpu; 4093 4094 rcu_early_boot_tests(); 4095 4096 rcu_bootup_announce(); 4097 rcu_init_geometry(); 4098 rcu_init_one(&rcu_bh_state); 4099 rcu_init_one(&rcu_sched_state); 4100 if (dump_tree) 4101 rcu_dump_rcu_node_tree(&rcu_sched_state); 4102 __rcu_init_preempt(); 4103 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 4104 4105 /* 4106 * We don't need protection against CPU-hotplug here because 4107 * this is called early in boot, before either interrupts 4108 * or the scheduler are operational. 4109 */ 4110 pm_notifier(rcu_pm_notify, 0); 4111 for_each_online_cpu(cpu) { 4112 rcutree_prepare_cpu(cpu); 4113 rcu_cpu_starting(cpu); 4114 rcutree_online_cpu(cpu); 4115 } 4116 4117 /* Create workqueue for expedited GPs and for Tree SRCU. */ 4118 rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); 4119 WARN_ON(!rcu_gp_wq); 4120 rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); 4121 WARN_ON(!rcu_par_gp_wq); 4122 } 4123 4124 #include "tree_exp.h" 4125 #include "tree_plugin.h" 4126
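
/*
 * Illustrative usage sketch, not part of the RCU implementation above:
 * a hypothetical caller that frees RCU-protected elements via call_rcu()
 * and uses rcu_barrier() at teardown to wait for any callbacks still in
 * flight before the code containing them (for example, a module) goes
 * away.  The names struct foo, foo_reclaim(), foo_release(), and
 * foo_exit() are placeholders for this sketch only.
 *
 *	struct foo {
 *		struct rcu_head rh;
 *		int data;
 *	};
 *
 *	static void foo_reclaim(struct rcu_head *rhp)
 *	{
 *		kfree(container_of(rhp, struct foo, rh));
 *	}
 *
 *	static void foo_release(struct foo *p)
 *	{
 *		// Defer the free until a grace period has elapsed.
 *		call_rcu(&p->rh, foo_reclaim);
 *	}
 *
 *	static void foo_exit(void)
 *	{
 *		// Wait for all in-flight foo_reclaim() invocations before
 *		// allowing their code and data to disappear.
 *		rcu_barrier();
 *	}
 */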