/*
 * Read-Copy Update mechanism for mutual exclusion
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, you can access it online at
 * http://www.gnu.org/licenses/gpl-2.0.html.
 *
 * Copyright IBM Corporation, 2008
 *
 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
 *	    Manfred Spraul <manfred@colorfullife.com>
 *	    Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
 *
 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *	Documentation/RCU
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/nmi.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/export.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/time.h>
#include <linux/kernel_stat.h>
#include <linux/wait.h>
#include <linux/kthread.h>
#include <linux/prefetch.h>
#include <linux/delay.h>
#include <linux/stop_machine.h>
#include <linux/random.h>
#include <linux/trace_events.h>
#include <linux/suspend.h>

#include "tree.h"
#include "rcu.h"

MODULE_ALIAS("rcutree");
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcutree."

/* Data structures. */

static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];

/*
 * In order to export the rcu_state name to the tracing tools, it
 * needs to be added in the __tracepoint_string section.
 * This requires defining a separate variable tp_<sname>_varname
 * that points to the string being used, and this will allow
 * the tracing userspace tools to be able to decipher the string
 * address to the matching string.
 */
#ifdef CONFIG_TRACING
# define DEFINE_RCU_TPS(sname) \
static char sname##_varname[] = #sname; \
static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname;
# define RCU_STATE_NAME(sname) sname##_varname
#else
# define DEFINE_RCU_TPS(sname)
# define RCU_STATE_NAME(sname) __stringify(sname)
#endif

#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
DEFINE_RCU_TPS(sname) \
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
struct rcu_state sname##_state = { \
	.level = { &sname##_state.node[0] }, \
	.rda = &sname##_data, \
	.call = cr, \
	.gp_state = RCU_GP_IDLE, \
	.gpnum = 0UL - 300UL, \
	.completed = 0UL - 300UL, \
	.orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
	.orphan_nxttail = &sname##_state.orphan_nxtlist, \
	.orphan_donetail = &sname##_state.orphan_donelist, \
	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
	.name = RCU_STATE_NAME(sname), \
	.abbr = sabbr, \
}

RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);

static struct rcu_state *const rcu_state_p;
static struct rcu_data __percpu *const rcu_data_p;
LIST_HEAD(rcu_struct_flavors);

/* Dump rcu_node combining tree at boot to verify correct setup. */
static bool dump_tree;
module_param(dump_tree, bool, 0444);
/* Control rcu_node-tree auto-balancing at boot time. */
static bool rcu_fanout_exact;
module_param(rcu_fanout_exact, bool, 0444);
/* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */
static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
/* Number of rcu_nodes at specified level. */
static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */

/*
 * The rcu_scheduler_active variable transitions from zero to one just
 * before the first task is spawned. So when this variable is zero, RCU
 * can assume that there is but one task, allowing RCU to (for example)
 * optimize synchronize_sched() to a simple barrier(). When this variable
 * is one, RCU must actually do all the hard work required to detect real
 * grace periods. This variable is also used to suppress boot-time false
 * positives from lockdep-RCU error checking.
 */
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);

/*
 * The rcu_scheduler_fully_active variable transitions from zero to one
 * during the early_initcall() processing, which is after the scheduler
 * is capable of creating new tasks. So RCU processing (for example,
 * creating tasks for RCU priority boosting) must be delayed until after
 * rcu_scheduler_fully_active transitions from zero to one. We also
 * currently delay invocation of any RCU callbacks until after this point.
 *
 * It might later prove better for people registering RCU callbacks during
 * early boot to take responsibility for these callbacks, but one step at
 * a time.
 */
static int rcu_scheduler_fully_active __read_mostly;

static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
static void rcu_report_exp_rdp(struct rcu_state *rsp,
			       struct rcu_data *rdp, bool wake);

/* rcuc/rcub kthread realtime priority */
#ifdef CONFIG_RCU_KTHREAD_PRIO
static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */
module_param(kthread_prio, int, 0644);

/* Delay in jiffies for grace-period initialization delays, debug only. */

#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT
static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY;
module_param(gp_preinit_delay, int, 0644);
#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
static const int gp_preinit_delay;
#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */

#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
module_param(gp_init_delay, int, 0644);
#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
static const int gp_init_delay;
#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */

#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP
static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY;
module_param(gp_cleanup_delay, int, 0644);
#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
static const int gp_cleanup_delay;
#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */

/*
 * Number of grace periods between delays, normalized by the duration of
 * the delay. The longer the delay, the more the grace periods between
 * each delay. The reason for this normalization is that it means that,
 * for non-zero delays, the overall slowdown of grace periods is constant
 * regardless of the duration of the delay. This arrangement balances
 * the need for long delays to increase some race probabilities with the
 * need for fast grace periods to increase other race probabilities.
 */
#define PER_RCU_NODE_PERIOD 3	/* Number of grace periods between delays. */

/*
 * Track the rcutorture test sequence number and the update version
 * number within a given test. The rcutorture_testseq is incremented
 * on every rcutorture module load and unload, so has an odd value
 * when a test is running. The rcutorture_vernum is set to zero
 * when rcutorture starts and is incremented on each rcutorture update.
 * These variables enable correlating rcutorture output with the
 * RCU tracing information.
 */
unsigned long rcutorture_testseq;
unsigned long rcutorture_vernum;

/*
 * Compute the mask of online CPUs for the specified rcu_node structure.
 * This will not be stable unless the rcu_node structure's ->lock is
 * held, but the bit corresponding to the current CPU will be stable
 * in most contexts.
 */
unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
{
	return READ_ONCE(rnp->qsmaskinitnext);
}

/*
 * Return true if an RCU grace period is in progress. The READ_ONCE()s
 * permit this function to be invoked without holding the root rcu_node
 * structure's ->lock, but of course results can be subject to change.
 */
static int rcu_gp_in_progress(struct rcu_state *rsp)
{
	return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum);
}

/*
 * Note a quiescent state. Because we do not need to know
 * how many quiescent states passed, just if there was at least
 * one since the start of the grace period, this just sets a flag.
 * The caller must have disabled preemption.
 */
void rcu_sched_qs(void)
{
	unsigned long flags;

	if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
		trace_rcu_grace_period(TPS("rcu_sched"),
				       __this_cpu_read(rcu_sched_data.gpnum),
				       TPS("cpuqs"));
		__this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
		if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
			return;
		local_irq_save(flags);
		if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
			__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
			rcu_report_exp_rdp(&rcu_sched_state,
					   this_cpu_ptr(&rcu_sched_data),
					   true);
		}
		local_irq_restore(flags);
	}
}

void rcu_bh_qs(void)
{
	if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
		trace_rcu_grace_period(TPS("rcu_bh"),
				       __this_cpu_read(rcu_bh_data.gpnum),
				       TPS("cpuqs"));
		__this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
	}
}

static DEFINE_PER_CPU(int, rcu_sched_qs_mask);

static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
	.dynticks = ATOMIC_INIT(1),
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
	.dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
	.dynticks_idle = ATOMIC_INIT(1),
#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
};

DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr);
EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);

/*
 * Let the RCU core know that this CPU has gone through the scheduler,
 * which is a quiescent state. This is called when the need for a
 * quiescent state is urgent, so we burn an atomic operation and full
 * memory barriers to let the RCU core know about it, regardless of what
 * this CPU might (or might not) do in the near future.
 *
 * We inform the RCU core by emulating a zero-duration dyntick-idle
 * period, which we in turn do by incrementing the ->dynticks counter
 * by two.
 */
static void rcu_momentary_dyntick_idle(void)
{
	unsigned long flags;
	struct rcu_data *rdp;
	struct rcu_dynticks *rdtp;
	int resched_mask;
	struct rcu_state *rsp;

	local_irq_save(flags);

	/*
	 * Yes, we can lose flag-setting operations. This is OK, because
	 * the flag will be set again after some delay.
	 */
	resched_mask = raw_cpu_read(rcu_sched_qs_mask);
	raw_cpu_write(rcu_sched_qs_mask, 0);

	/* Find the flavor that needs a quiescent state. */
	for_each_rcu_flavor(rsp) {
		rdp = raw_cpu_ptr(rsp->rda);
		if (!(resched_mask & rsp->flavor_mask))
			continue;
		smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
		if (READ_ONCE(rdp->mynode->completed) !=
		    READ_ONCE(rdp->cond_resched_completed))
			continue;

		/*
		 * Pretend to be momentarily idle for the quiescent state.
		 * This allows the grace-period kthread to record the
		 * quiescent state, with no need for this CPU to do anything
		 * further.
		 */
		rdtp = this_cpu_ptr(&rcu_dynticks);
		smp_mb__before_atomic(); /* Earlier stuff before QS. */
		atomic_add(2, &rdtp->dynticks); /* QS. */
		smp_mb__after_atomic(); /* Later stuff after QS. */
		break;
	}
	local_irq_restore(flags);
}

/*
 * Note a context switch. This is a quiescent state for RCU-sched,
 * and requires special handling for preemptible RCU.
 * The caller must have disabled preemption.
 */
void rcu_note_context_switch(void)
{
	barrier(); /* Avoid RCU read-side critical sections leaking down. */
	trace_rcu_utilization(TPS("Start context switch"));
	rcu_sched_qs();
	rcu_preempt_note_context_switch();
	if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
		rcu_momentary_dyntick_idle();
	trace_rcu_utilization(TPS("End context switch"));
	barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);

/*
 * Register a quiescent state for all RCU flavors. If there is an
 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
 * dyntick-idle quiescent state visible to other CPUs (but only for those
 * RCU flavors in desperate need of a quiescent state, which will normally
 * be none of them). Either way, do a lightweight quiescent state for
 * all RCU flavors.
 *
 * The barrier() calls are redundant in the common case when this is
 * called externally, but just in case this is called from within this
 * file.
 *
 */
void rcu_all_qs(void)
{
	barrier(); /* Avoid RCU read-side critical sections leaking down. */
	if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
		rcu_momentary_dyntick_idle();
	this_cpu_inc(rcu_qs_ctr);
	barrier(); /* Avoid RCU read-side critical sections leaking up. */
}
EXPORT_SYMBOL_GPL(rcu_all_qs);

static long blimit = 10;	/* Maximum callbacks per rcu_do_batch. */
static long qhimark = 10000;	/* If this many pending, ignore blimit. */
static long qlowmark = 100;	/* Once only this many pending, use blimit. */

module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);

static ulong jiffies_till_first_fqs = ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;

module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);

/*
 * How long the grace period must be before we start recruiting
 * quiescent-state help from rcu_note_context_switch().
 */
static ulong jiffies_till_sched_qs = HZ / 20;
module_param(jiffies_till_sched_qs, ulong, 0644);

static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
				  struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp,
			 int (*f)(struct rcu_data *rsp, bool *isidle,
				  unsigned long *maxj),
			 bool *isidle, unsigned long *maxj);
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(void);

/*
 * Return the number of RCU batches started thus far for debug & stats.
 */
unsigned long rcu_batches_started(void)
{
	return rcu_state_p->gpnum;
}
EXPORT_SYMBOL_GPL(rcu_batches_started);

/*
 * Return the number of RCU-sched batches started thus far for debug & stats.
 */
unsigned long rcu_batches_started_sched(void)
{
	return rcu_sched_state.gpnum;
}
EXPORT_SYMBOL_GPL(rcu_batches_started_sched);

/*
 * Return the number of RCU BH batches started thus far for debug & stats.
 */
unsigned long rcu_batches_started_bh(void)
{
	return rcu_bh_state.gpnum;
}
EXPORT_SYMBOL_GPL(rcu_batches_started_bh);

/*
 * Return the number of RCU batches completed thus far for debug & stats.
 */
unsigned long rcu_batches_completed(void)
{
	return rcu_state_p->completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed);

/*
 * Return the number of RCU-sched batches completed thus far for debug & stats.
 */
unsigned long rcu_batches_completed_sched(void)
{
	return rcu_sched_state.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);

/*
 * Return the number of RCU BH batches completed thus far for debug & stats.
 */
unsigned long rcu_batches_completed_bh(void)
{
	return rcu_bh_state.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);

/*
 * Force a quiescent state.
 */
void rcu_force_quiescent_state(void)
{
	force_quiescent_state(rcu_state_p);
}
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);

/*
 * Force a quiescent state for RCU BH.
 */
void rcu_bh_force_quiescent_state(void)
{
	force_quiescent_state(&rcu_bh_state);
}
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);

/*
 * Force a quiescent state for RCU-sched.
 */
void rcu_sched_force_quiescent_state(void)
{
	force_quiescent_state(&rcu_sched_state);
}
EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);

/*
 * Show the state of the grace-period kthreads.
 */
void show_rcu_gp_kthreads(void)
{
	struct rcu_state *rsp;

	for_each_rcu_flavor(rsp) {
		pr_info("%s: wait state: %d ->state: %#lx\n",
			rsp->name, rsp->gp_state, rsp->gp_kthread->state);
		/* sched_show_task(rsp->gp_kthread); */
	}
}
EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);

/*
 * Record the number of times rcutorture tests have been initiated and
 * terminated. This information allows the debugfs tracing stats to be
 * correlated to the rcutorture messages, even when the rcutorture module
 * is being repeatedly loaded and unloaded. In other words, we cannot
 * store this state in rcutorture itself.
 */
void rcutorture_record_test_transition(void)
{
	rcutorture_testseq++;
	rcutorture_vernum = 0;
}
EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);

/*
 * Send along grace-period-related data for rcutorture diagnostics.
 */
void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
			    unsigned long *gpnum, unsigned long *completed)
{
	struct rcu_state *rsp = NULL;

	switch (test_type) {
	case RCU_FLAVOR:
		rsp = rcu_state_p;
		break;
	case RCU_BH_FLAVOR:
		rsp = &rcu_bh_state;
		break;
	case RCU_SCHED_FLAVOR:
		rsp = &rcu_sched_state;
		break;
	default:
		break;
	}
	if (rsp != NULL) {
		*flags = READ_ONCE(rsp->gp_flags);
		*gpnum = READ_ONCE(rsp->gpnum);
		*completed = READ_ONCE(rsp->completed);
		return;
	}
	*flags = 0;
	*gpnum = 0;
	*completed = 0;
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);

/*
 * Record the number of writer passes through the current rcutorture test.
 * This is also used to correlate debugfs tracing stats with the rcutorture
 * messages.
 */
void rcutorture_record_progress(unsigned long vernum)
{
	rcutorture_vernum++;
}
EXPORT_SYMBOL_GPL(rcutorture_record_progress);

/*
 * Does the CPU have callbacks ready to be invoked?
 */
static int
cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
{
	return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] &&
	       rdp->nxttail[RCU_DONE_TAIL] != NULL;
}

/*
 * Return the root node of the specified rcu_state structure.
 */
static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
{
	return &rsp->node[0];
}

/*
 * Is there any need for future grace periods?
 * Interrupts must be disabled. If the caller does not hold the root
 * rnp_node structure's ->lock, the results are advisory only.
 */
static int rcu_future_needs_gp(struct rcu_state *rsp)
{
	struct rcu_node *rnp = rcu_get_root(rsp);
	int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
	int *fp = &rnp->need_future_gp[idx];

	return READ_ONCE(*fp);
}

/*
 * Does the current CPU require a not-yet-started grace period?
 * The caller must have disabled interrupts to prevent races with
 * normal callback registry.
 */
static int
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
	int i;

	if (rcu_gp_in_progress(rsp))
		return 0;  /* No, a grace period is already in progress. */
	if (rcu_future_needs_gp(rsp))
		return 1;  /* Yes, a no-CBs CPU needs one. */
	if (!rdp->nxttail[RCU_NEXT_TAIL])
		return 0;  /* No, this is a no-CBs (or offline) CPU. */
	if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
		return 1;  /* Yes, this CPU has newly registered callbacks. */
	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
		if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
		    ULONG_CMP_LT(READ_ONCE(rsp->completed),
				 rdp->nxtcompleted[i]))
			return 1;  /* Yes, CBs for future grace period. */
	return 0; /* No grace period needed. */
}

/*
 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
 *
 * If the new value of the ->dynticks_nesting counter now is zero,
 * we really have entered idle, and must do the appropriate accounting.
 * The caller must have disabled interrupts.
 */
static void rcu_eqs_enter_common(long long oldval, bool user)
{
	struct rcu_state *rsp;
	struct rcu_data *rdp;
	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);

	trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
	    !user && !is_idle_task(current)) {
		struct task_struct *idle __maybe_unused =
			idle_task(smp_processor_id());

		trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
		ftrace_dump(DUMP_ORIG);
		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
			  current->pid, current->comm,
			  idle->pid, idle->comm); /* must be idle task! */
	}
	for_each_rcu_flavor(rsp) {
		rdp = this_cpu_ptr(rsp->rda);
		do_nocb_deferred_wakeup(rdp);
	}
	rcu_prepare_for_idle();
	/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
	smp_mb__before_atomic();  /* See above. */
	atomic_inc(&rdtp->dynticks);
	smp_mb__after_atomic();  /* Force ordering with next sojourn. */
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
		     atomic_read(&rdtp->dynticks) & 0x1);
	rcu_dynticks_task_enter();

	/*
	 * It is illegal to enter an extended quiescent state while
	 * in an RCU read-side critical section.
	 */
	RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
			 "Illegal idle entry in RCU read-side critical section.");
	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),
			 "Illegal idle entry in RCU-bh read-side critical section.");
	RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),
			 "Illegal idle entry in RCU-sched read-side critical section.");
}

/*
 * Enter an RCU extended quiescent state, which can be either the
 * idle loop or adaptive-tickless usermode execution.
 */
static void rcu_eqs_enter(bool user)
{
	long long oldval;
	struct rcu_dynticks *rdtp;

	rdtp = this_cpu_ptr(&rcu_dynticks);
	oldval = rdtp->dynticks_nesting;
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
		     (oldval & DYNTICK_TASK_NEST_MASK) == 0);
	if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
		rdtp->dynticks_nesting = 0;
		rcu_eqs_enter_common(oldval, user);
	} else {
		rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
	}
}

/**
 * rcu_idle_enter - inform RCU that current CPU is entering idle
 *
 * Enter idle mode, in other words, -leave- the mode in which RCU
 * read-side critical sections can occur. (Though RCU read-side
 * critical sections can occur in irq handlers in idle, a possibility
 * handled by irq_enter() and irq_exit().)
 *
 * We crowbar the ->dynticks_nesting field to zero to allow for
 * the possibility of usermode upcalls having messed up our count
 * of interrupt nesting level during the prior busy period.
 */
void rcu_idle_enter(void)
{
	unsigned long flags;

	local_irq_save(flags);
	rcu_eqs_enter(false);
	rcu_sysidle_enter(0);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);

#ifdef CONFIG_NO_HZ_FULL
/**
 * rcu_user_enter - inform RCU that we are resuming userspace.
 *
 * Enter RCU idle mode right before resuming userspace. No use of RCU
 * is permitted between this call and rcu_user_exit(). This way the
 * CPU doesn't need to maintain the tick for RCU maintenance purposes
 * when the CPU runs in userspace.
 */
void rcu_user_enter(void)
{
	rcu_eqs_enter(1);
}
#endif /* CONFIG_NO_HZ_FULL */

/**
 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
 *
 * Exit from an interrupt handler, which might possibly result in entering
 * idle mode, in other words, leaving the mode in which read-side critical
 * sections can occur.
 *
 * This code assumes that the idle loop never does anything that might
 * result in unbalanced calls to irq_enter() and irq_exit(). If your
 * architecture violates this assumption, RCU will give you what you
 * deserve, good and hard. But very infrequently and irreproducibly.
 *
 * Use things like work queues to work around this limitation.
 *
 * You have been warned.
 */
void rcu_irq_exit(void)
{
	unsigned long flags;
	long long oldval;
	struct rcu_dynticks *rdtp;

	local_irq_save(flags);
	rdtp = this_cpu_ptr(&rcu_dynticks);
	oldval = rdtp->dynticks_nesting;
	rdtp->dynticks_nesting--;
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
		     rdtp->dynticks_nesting < 0);
	if (rdtp->dynticks_nesting)
		trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
	else
		rcu_eqs_enter_common(oldval, true);
	rcu_sysidle_enter(1);
	local_irq_restore(flags);
}

/*
 * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
 *
 * If the new value of the ->dynticks_nesting counter was previously zero,
 * we really have exited idle, and must do the appropriate accounting.
 * The caller must have disabled interrupts.
 */
static void rcu_eqs_exit_common(long long oldval, int user)
{
	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);

	rcu_dynticks_task_exit();
	smp_mb__before_atomic();  /* Force ordering w/previous sojourn. */
	atomic_inc(&rdtp->dynticks);
	/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
	smp_mb__after_atomic();  /* See above. */
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
		     !(atomic_read(&rdtp->dynticks) & 0x1));
	rcu_cleanup_after_idle();
	trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
	    !user && !is_idle_task(current)) {
		struct task_struct *idle __maybe_unused =
			idle_task(smp_processor_id());

		trace_rcu_dyntick(TPS("Error on exit: not idle task"),
				  oldval, rdtp->dynticks_nesting);
		ftrace_dump(DUMP_ORIG);
		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
			  current->pid, current->comm,
			  idle->pid, idle->comm); /* must be idle task! */
	}
}

/*
 * Exit an RCU extended quiescent state, which can be either the
 * idle loop or adaptive-tickless usermode execution.
 */
static void rcu_eqs_exit(bool user)
{
	struct rcu_dynticks *rdtp;
	long long oldval;

	rdtp = this_cpu_ptr(&rcu_dynticks);
	oldval = rdtp->dynticks_nesting;
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
	if (oldval & DYNTICK_TASK_NEST_MASK) {
		rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
	} else {
		rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
		rcu_eqs_exit_common(oldval, user);
	}
}

/**
 * rcu_idle_exit - inform RCU that current CPU is leaving idle
 *
 * Exit idle mode, in other words, -enter- the mode in which RCU
 * read-side critical sections can occur.
 *
 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
 * allow for the possibility of usermode upcalls messing up our count
 * of interrupt nesting level during the busy period that is just
 * now starting.
 */
void rcu_idle_exit(void)
{
	unsigned long flags;

	local_irq_save(flags);
	rcu_eqs_exit(false);
	rcu_sysidle_exit(0);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);

#ifdef CONFIG_NO_HZ_FULL
/**
 * rcu_user_exit - inform RCU that we are exiting userspace.
 *
 * Exit RCU idle mode while entering the kernel because it can
 * run a RCU read side critical section anytime.
 */
void rcu_user_exit(void)
{
	rcu_eqs_exit(1);
}
#endif /* CONFIG_NO_HZ_FULL */

/**
 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
 *
 * Enter an interrupt handler, which might possibly result in exiting
 * idle mode, in other words, entering the mode in which read-side critical
 * sections can occur.
 *
 * Note that the Linux kernel is fully capable of entering an interrupt
 * handler that it never exits, for example when doing upcalls to
 * user mode! This code assumes that the idle loop never does upcalls to
 * user mode. If your architecture does do upcalls from the idle loop (or
 * does anything else that results in unbalanced calls to the irq_enter()
 * and irq_exit() functions), RCU will give you what you deserve, good
 * and hard. But very infrequently and irreproducibly.
 *
 * Use things like work queues to work around this limitation.
 *
 * You have been warned.
 */
void rcu_irq_enter(void)
{
	unsigned long flags;
	struct rcu_dynticks *rdtp;
	long long oldval;

	local_irq_save(flags);
	rdtp = this_cpu_ptr(&rcu_dynticks);
	oldval = rdtp->dynticks_nesting;
	rdtp->dynticks_nesting++;
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
		     rdtp->dynticks_nesting == 0);
	if (oldval)
		trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
	else
		rcu_eqs_exit_common(oldval, true);
	rcu_sysidle_exit(1);
	local_irq_restore(flags);
}

/**
 * rcu_nmi_enter - inform RCU of entry to NMI context
 *
 * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and
 * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know
 * that the CPU is active. This implementation permits nested NMIs, as
 * long as the nesting level does not overflow an int. (You will probably
 * run out of stack space first.)
 */
void rcu_nmi_enter(void)
{
	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
	int incby = 2;

	/* Complain about underflow. */
	WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0);

	/*
	 * If idle from RCU viewpoint, atomically increment ->dynticks
	 * to mark non-idle and increment ->dynticks_nmi_nesting by one.
	 * Otherwise, increment ->dynticks_nmi_nesting by two. This means
	 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed
	 * to be in the outermost NMI handler that interrupted an RCU-idle
	 * period (observation due to Andy Lutomirski).
	 */
	if (!(atomic_read(&rdtp->dynticks) & 0x1)) {
		smp_mb__before_atomic();  /* Force delay from prior write. */
		atomic_inc(&rdtp->dynticks);
		/* atomic_inc() before later RCU read-side crit sects */
		smp_mb__after_atomic();  /* See above. */
		WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
		incby = 1;
	}
	rdtp->dynticks_nmi_nesting += incby;
	barrier();
}

/**
 * rcu_nmi_exit - inform RCU of exit from NMI context
 *
 * If we are returning from the outermost NMI handler that interrupted an
 * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
 * to let the RCU grace-period handling know that the CPU is back to
 * being RCU-idle.
 */
void rcu_nmi_exit(void)
{
	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);

	/*
	 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks.
	 * (We are exiting an NMI handler, so RCU better be paying attention
	 * to us!)
	 */
	WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0);
	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));

	/*
	 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
	 * leave it in non-RCU-idle state.
	 */
	if (rdtp->dynticks_nmi_nesting != 1) {
		rdtp->dynticks_nmi_nesting -= 2;
		return;
	}

	/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
	rdtp->dynticks_nmi_nesting = 0;
	/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
	smp_mb__before_atomic();  /* See above. */
	atomic_inc(&rdtp->dynticks);
	smp_mb__after_atomic();  /* Force delay to next write. */
	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
}

/**
 * __rcu_is_watching - are RCU read-side critical sections safe?
 *
 * Return true if RCU is watching the running CPU, which means that
 * this CPU can safely enter RCU read-side critical sections. Unlike
 * rcu_is_watching(), the caller of __rcu_is_watching() must have at
 * least disabled preemption.
 */
bool notrace __rcu_is_watching(void)
{
	return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
}

/**
 * rcu_is_watching - see if RCU thinks that the current CPU is idle
 *
 * If the current CPU is in its idle loop and is neither in an interrupt
 * nor an NMI handler, return true.
 */
bool notrace rcu_is_watching(void)
{
	bool ret;

	preempt_disable_notrace();
	ret = __rcu_is_watching();
	preempt_enable_notrace();
	return ret;
}
EXPORT_SYMBOL_GPL(rcu_is_watching);

#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)

/*
 * Is the current CPU online? Disable preemption to avoid false positives
 * that could otherwise happen due to the current CPU number being sampled,
 * this task being preempted, its old CPU being taken offline, resuming
 * on some other CPU, then determining that its old CPU is now offline.
 * It is OK to use RCU on an offline processor during initial boot, hence
 * the check for rcu_scheduler_fully_active. Note also that it is OK
 * for a CPU coming online to use RCU for one jiffy prior to marking itself
 * online in the cpu_online_mask. Similarly, it is OK for a CPU going
 * offline to continue to use RCU for one jiffy after marking itself
 * offline in the cpu_online_mask. This leniency is necessary given the
 * non-atomic nature of the online and offline processing, for example,
 * the fact that a CPU enters the scheduler after completing the CPU_DYING
 * notifiers.
 *
 * This is also why RCU internally marks CPUs online during the
 * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
 *
 * Disable checking if in an NMI handler because we cannot safely report
 * errors from NMI handlers anyway.
 */
bool rcu_lockdep_current_cpu_online(void)
{
	struct rcu_data *rdp;
	struct rcu_node *rnp;
	bool ret;

	if (in_nmi())
		return true;
	preempt_disable();
	rdp = this_cpu_ptr(&rcu_sched_data);
	rnp = rdp->mynode;
	ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ||
	      !rcu_scheduler_fully_active;
	preempt_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);

#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */

/**
 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
 *
 * If the current CPU is idle or running at a first-level (not nested)
 * interrupt from idle, return true. The caller must have at least
 * disabled preemption.
 */
static int rcu_is_cpu_rrupt_from_idle(void)
{
	return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
}

/*
 * Snapshot the specified CPU's dynticks counter so that we can later
 * credit them with an implicit quiescent state. Return 1 if this CPU
 * is in dynticks idle mode, which is an extended quiescent state.
 */
static int dyntick_save_progress_counter(struct rcu_data *rdp,
					 bool *isidle, unsigned long *maxj)
{
	rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
	rcu_sysidle_check_cpu(rdp, isidle, maxj);
	if ((rdp->dynticks_snap & 0x1) == 0) {
		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
		return 1;
	} else {
		if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
				 rdp->mynode->gpnum))
			WRITE_ONCE(rdp->gpwrap, true);
		return 0;
	}
}

/*
 * Return true if the specified CPU has passed through a quiescent
 * state by virtue of being in or having passed through a dynticks
 * idle state since the last call to dyntick_save_progress_counter()
 * for this same CPU, or by virtue of having been offline.
 */
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
				    bool *isidle, unsigned long *maxj)
{
	unsigned int curr;
	int *rcrmp;
	unsigned int snap;

	curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
	snap = (unsigned int)rdp->dynticks_snap;

	/*
	 * If the CPU passed through or entered a dynticks idle phase with
	 * no active irq/NMI handlers, then we can safely pretend that the CPU
	 * already acknowledged the request to pass through a quiescent
	 * state. Either way, that CPU cannot possibly be in an RCU
	 * read-side critical section that started before the beginning
	 * of the current RCU grace period.
	 */
	if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
		rdp->dynticks_fqs++;
		return 1;
	}

	/*
	 * Check for the CPU being offline, but only if the grace period
	 * is old enough. We don't need to worry about the CPU changing
	 * state: If we see it offline even once, it has been through a
	 * quiescent state.
	 *
	 * The reason for insisting that the grace period be at least
	 * one jiffy old is that CPUs that are not quite online and that
	 * have just gone offline can still execute RCU read-side critical
	 * sections.
	 */
	if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
		return 0;  /* Grace period is not old enough. */
	barrier();
	if (cpu_is_offline(rdp->cpu)) {
		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
		rdp->offline_fqs++;
		return 1;
	}

	/*
	 * A CPU running for an extended time within the kernel can
	 * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode,
	 * even context-switching back and forth between a pair of
	 * in-kernel CPU-bound tasks cannot advance grace periods.
	 * So if the grace period is old enough, make the CPU pay attention.
	 * Note that the unsynchronized assignments to the per-CPU
	 * rcu_sched_qs_mask variable are safe. Yes, setting of
	 * bits can be lost, but they will be set again on the next
	 * force-quiescent-state pass. So lost bit sets do not result
	 * in incorrect behavior, merely in a grace period lasting
	 * a few jiffies longer than it might otherwise. Because
	 * there are at most four threads involved, and because the
	 * updates are only once every few jiffies, the probability of
	 * lossage (and thus of slight grace-period extension) is
	 * quite low.
	 *
	 * Note that if the jiffies_till_sched_qs boot/sysfs parameter
	 * is set too high, we override with half of the RCU CPU stall
	 * warning delay.
	 */
	rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
	if (ULONG_CMP_GE(jiffies,
			 rdp->rsp->gp_start + jiffies_till_sched_qs) ||
	    ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
		if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
			WRITE_ONCE(rdp->cond_resched_completed,
				   READ_ONCE(rdp->mynode->completed));
			smp_mb(); /* ->cond_resched_completed before *rcrmp. */
			WRITE_ONCE(*rcrmp,
				   READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
			resched_cpu(rdp->cpu);  /* Force CPU into scheduler. */
			rdp->rsp->jiffies_resched += 5; /* Enable beating. */
		} else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
			/* Time to beat on that CPU again! */
			resched_cpu(rdp->cpu);  /* Force CPU into scheduler. */
			rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
		}
	}

	return 0;
}

static void record_gp_stall_check_time(struct rcu_state *rsp)
{
	unsigned long j = jiffies;
	unsigned long j1;

	rsp->gp_start = j;
	smp_wmb(); /* Record start time before stall time. */
	j1 = rcu_jiffies_till_stall_check();
	WRITE_ONCE(rsp->jiffies_stall, j + j1);
	rsp->jiffies_resched = j + j1 / 2;
	rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs);
}

/*
 * Complain about starvation of grace-period kthread.
 */
static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
{
	unsigned long gpa;
	unsigned long j;

	j = jiffies;
	gpa = READ_ONCE(rsp->gp_activity);
	if (j - gpa > 2 * HZ)
		pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n",
		       rsp->name, j - gpa,
		       rsp->gpnum, rsp->completed,
		       rsp->gp_flags, rsp->gp_state,
		       rsp->gp_kthread ? rsp->gp_kthread->state : 0);
}

/*
 * Dump stacks of all tasks running on stalled CPUs.
 */
static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
{
	int cpu;
	unsigned long flags;
	struct rcu_node *rnp;

	rcu_for_each_leaf_node(rsp, rnp) {
		raw_spin_lock_irqsave(&rnp->lock, flags);
		if (rnp->qsmask != 0) {
			for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
				if (rnp->qsmask & (1UL << cpu))
					dump_cpu_task(rnp->grplo + cpu);
		}
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
	}
}

static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
{
	int cpu;
	long delta;
	unsigned long flags;
	unsigned long gpa;
	unsigned long j;
	int ndetected = 0;
	struct rcu_node *rnp = rcu_get_root(rsp);
	long totqlen = 0;

	/* Only let one CPU complain about others per time interval. */

	raw_spin_lock_irqsave(&rnp->lock, flags);
	delta = jiffies - READ_ONCE(rsp->jiffies_stall);
	if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
		return;
	}
	WRITE_ONCE(rsp->jiffies_stall,
		   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
	raw_spin_unlock_irqrestore(&rnp->lock, flags);

	/*
	 * OK, time to rat on our buddy...
	 * See Documentation/RCU/stallwarn.txt for info on how to debug
	 * RCU CPU stall warnings.
	 */
	pr_err("INFO: %s detected stalls on CPUs/tasks:",
	       rsp->name);
	print_cpu_stall_info_begin();
	rcu_for_each_leaf_node(rsp, rnp) {
		raw_spin_lock_irqsave(&rnp->lock, flags);
		ndetected += rcu_print_task_stall(rnp);
		if (rnp->qsmask != 0) {
			for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
				if (rnp->qsmask & (1UL << cpu)) {
					print_cpu_stall_info(rsp,
							     rnp->grplo + cpu);
					ndetected++;
				}
		}
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
	}

	print_cpu_stall_info_end();
	for_each_possible_cpu(cpu)
		totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
	pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n",
		smp_processor_id(), (long)(jiffies - rsp->gp_start),
		(long)rsp->gpnum, (long)rsp->completed, totqlen);
	if (ndetected) {
		rcu_dump_cpu_stacks(rsp);
	} else {
		if (READ_ONCE(rsp->gpnum) != gpnum ||
		    READ_ONCE(rsp->completed) == gpnum) {
			pr_err("INFO: Stall ended before state dump start\n");
		} else {
			j = jiffies;
			gpa = READ_ONCE(rsp->gp_activity);
			pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
			       rsp->name, j - gpa, j, gpa,
			       jiffies_till_next_fqs,
			       rcu_get_root(rsp)->qsmask);
			/* In this case, the current CPU might be at fault. */
			sched_show_task(current);
		}
	}

	/* Complain about tasks blocking the grace period. */
	rcu_print_detail_task_stall(rsp);

	rcu_check_gp_kthread_starvation(rsp);

	force_quiescent_state(rsp);  /* Kick them all. */
}

static void print_cpu_stall(struct rcu_state *rsp)
{
	int cpu;
	unsigned long flags;
	struct rcu_node *rnp = rcu_get_root(rsp);
	long totqlen = 0;

	/*
	 * OK, time to rat on ourselves...
	 * See Documentation/RCU/stallwarn.txt for info on how to debug
	 * RCU CPU stall warnings.
1315 */ 1316 pr_err("INFO: %s self-detected stall on CPU", rsp->name); 1317 print_cpu_stall_info_begin(); 1318 print_cpu_stall_info(rsp, smp_processor_id()); 1319 print_cpu_stall_info_end(); 1320 for_each_possible_cpu(cpu) 1321 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1322 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", 1323 jiffies - rsp->gp_start, 1324 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1325 1326 rcu_check_gp_kthread_starvation(rsp); 1327 1328 rcu_dump_cpu_stacks(rsp); 1329 1330 raw_spin_lock_irqsave(&rnp->lock, flags); 1331 if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) 1332 WRITE_ONCE(rsp->jiffies_stall, 1333 jiffies + 3 * rcu_jiffies_till_stall_check() + 3); 1334 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1335 1336 /* 1337 * Attempt to revive the RCU machinery by forcing a context switch. 1338 * 1339 * A context switch would normally allow the RCU state machine to make 1340 * progress and it could be we're stuck in kernel space without context 1341 * switches for an entirely unreasonable amount of time. 1342 */ 1343 resched_cpu(smp_processor_id()); 1344 } 1345 1346 static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 1347 { 1348 unsigned long completed; 1349 unsigned long gpnum; 1350 unsigned long gps; 1351 unsigned long j; 1352 unsigned long js; 1353 struct rcu_node *rnp; 1354 1355 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) 1356 return; 1357 j = jiffies; 1358 1359 /* 1360 * Lots of memory barriers to reject false positives. 1361 * 1362 * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, 1363 * then rsp->gp_start, and finally rsp->completed. These values 1364 * are updated in the opposite order with memory barriers (or 1365 * equivalent) during grace-period initialization and cleanup. 1366 * Now, a false positive can occur if we get an new value of 1367 * rsp->gp_start and a old value of rsp->jiffies_stall. But given 1368 * the memory barriers, the only way that this can happen is if one 1369 * grace period ends and another starts between these two fetches. 1370 * Detect this by comparing rsp->completed with the previous fetch 1371 * from rsp->gpnum. 1372 * 1373 * Given this check, comparisons of jiffies, rsp->jiffies_stall, 1374 * and rsp->gp_start suffice to forestall false positives. 1375 */ 1376 gpnum = READ_ONCE(rsp->gpnum); 1377 smp_rmb(); /* Pick up ->gpnum first... */ 1378 js = READ_ONCE(rsp->jiffies_stall); 1379 smp_rmb(); /* ...then ->jiffies_stall before the rest... */ 1380 gps = READ_ONCE(rsp->gp_start); 1381 smp_rmb(); /* ...and finally ->gp_start before ->completed. */ 1382 completed = READ_ONCE(rsp->completed); 1383 if (ULONG_CMP_GE(completed, gpnum) || 1384 ULONG_CMP_LT(j, js) || 1385 ULONG_CMP_GE(gps, js)) 1386 return; /* No stall or GP completed since entering function. */ 1387 rnp = rdp->mynode; 1388 if (rcu_gp_in_progress(rsp) && 1389 (READ_ONCE(rnp->qsmask) & rdp->grpmask)) { 1390 1391 /* We haven't checked in, so go dump stack. */ 1392 print_cpu_stall(rsp); 1393 1394 } else if (rcu_gp_in_progress(rsp) && 1395 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { 1396 1397 /* They had a few time units to dump stack, so complain. */ 1398 print_other_cpu_stall(rsp, gpnum); 1399 } 1400 } 1401 1402 /** 1403 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period 1404 * 1405 * Set the stall-warning timeout way off into the future, thus preventing 1406 * any RCU CPU stall-warning messages from appearing in the current set of 1407 * RCU grace periods. 
 *
 * The caller must disable hard irqs.
 */
void rcu_cpu_stall_reset(void)
{
	struct rcu_state *rsp;

	for_each_rcu_flavor(rsp)
		WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2);
}

/*
 * Initialize the specified rcu_data structure's default callback list
 * to empty. The default callback list is the one that is not used by
 * no-callbacks CPUs.
 */
static void init_default_callback_list(struct rcu_data *rdp)
{
	int i;

	rdp->nxtlist = NULL;
	for (i = 0; i < RCU_NEXT_SIZE; i++)
		rdp->nxttail[i] = &rdp->nxtlist;
}

/*
 * Initialize the specified rcu_data structure's callback list to empty.
 */
static void init_callback_list(struct rcu_data *rdp)
{
	if (init_nocb_callback_list(rdp))
		return;
	init_default_callback_list(rdp);
}

/*
 * Determine the value that ->completed will have at the end of the
 * next subsequent grace period. This is used to tag callbacks so that
 * a CPU can invoke callbacks in a timely fashion even if that CPU has
 * been dyntick-idle for an extended period with callbacks under the
 * influence of RCU_FAST_NO_HZ.
 *
 * The caller must hold rnp->lock with interrupts disabled.
 */
static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
				       struct rcu_node *rnp)
{
	/*
	 * If RCU is idle, we just wait for the next grace period.
	 * But we can only be sure that RCU is idle if we are looking
	 * at the root rcu_node structure -- otherwise, a new grace
	 * period might have started, but just not yet gotten around
	 * to initializing the current non-root rcu_node structure.
	 */
	if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
		return rnp->completed + 1;

	/*
	 * Otherwise, wait for a possible partial grace period and
	 * then the subsequent full grace period.
	 */
	return rnp->completed + 2;
}

/*
 * Trace-event helper function for rcu_start_future_gp() and
 * rcu_nocb_wait_gp().
 */
static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
				unsigned long c, const char *s)
{
	trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
				      rnp->completed, c, rnp->level,
				      rnp->grplo, rnp->grphi, s);
}

/*
 * Start some future grace period, as needed to handle newly arrived
 * callbacks. The required future grace periods are recorded in each
 * rcu_node structure's ->need_future_gp field. Returns true if there
 * is reason to awaken the grace-period kthread.
 *
 * The caller must hold the specified rcu_node structure's ->lock.
 */
static bool __maybe_unused
rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
		    unsigned long *c_out)
{
	unsigned long c;
	int i;
	bool ret = false;
	struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);

	/*
	 * Pick up grace-period number for new callbacks. If this
	 * grace period is already marked as needed, return to the caller.
	 */
	c = rcu_cbs_completed(rdp->rsp, rnp);
	trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
	if (rnp->need_future_gp[c & 0x1]) {
		trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
		goto out;
	}

	/*
	 * If either this rcu_node structure or the root rcu_node structure
	 * believe that a grace period is in progress, then we must wait
	 * for the one following, which is in "c". Because our request
	 * will be noticed at the end of the current grace period, we don't
	 * need to explicitly start one. We only do the lockless check
	 * of rnp_root's fields if the current rcu_node structure thinks
	 * there is no grace period in flight, and because we hold rnp->lock,
	 * the only possible change is when rnp_root's two fields are
	 * equal, in which case rnp_root->gpnum might be concurrently
	 * incremented. But that is OK, as it will just result in our
	 * doing some extra useless work.
	 */
	if (rnp->gpnum != rnp->completed ||
	    READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) {
		rnp->need_future_gp[c & 0x1]++;
		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
		goto out;
	}

	/*
	 * There might be no grace period in progress. If we don't already
	 * hold it, acquire the root rcu_node structure's lock in order to
	 * start one (if needed).
	 */
	if (rnp != rnp_root) {
		raw_spin_lock(&rnp_root->lock);
		smp_mb__after_unlock_lock();
	}

	/*
	 * Get a new grace-period number. If there really is no grace
	 * period in progress, it will be smaller than the one we obtained
	 * earlier. Adjust callbacks as needed. Note that even no-CBs
	 * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
	 */
	c = rcu_cbs_completed(rdp->rsp, rnp_root);
	for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
		if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
			rdp->nxtcompleted[i] = c;

	/*
	 * If the need for the required grace period is already
	 * recorded, trace and leave.
	 */
	if (rnp_root->need_future_gp[c & 0x1]) {
		trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot"));
		goto unlock_out;
	}

	/* Record the need for the future grace period. */
	rnp_root->need_future_gp[c & 0x1]++;

	/* If a grace period is not already in progress, start one. */
	if (rnp_root->gpnum != rnp_root->completed) {
		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
	} else {
		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
		ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
	}
unlock_out:
	if (rnp != rnp_root)
		raw_spin_unlock(&rnp_root->lock);
out:
	if (c_out != NULL)
		*c_out = c;
	return ret;
}

/*
 * Clean up any old requests for the just-ended grace period. Also return
 * whether any additional grace periods have been requested. Also invoke
 * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
 * waiting for this grace period to complete.
 */
static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
	int c = rnp->completed;
	int needmore;
	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);

	rcu_nocb_gp_cleanup(rsp, rnp);
	rnp->need_future_gp[c & 0x1] = 0;
	needmore = rnp->need_future_gp[(c + 1) & 0x1];
	trace_rcu_future_gp(rnp, rdp, c,
			    needmore ? TPS("CleanupMore") : TPS("Cleanup"));
TPS("CleanupMore") : TPS("Cleanup")); 1598 return needmore; 1599 } 1600 1601 /* 1602 * Awaken the grace-period kthread for the specified flavor of RCU. 1603 * Don't do a self-awaken, and don't bother awakening when there is 1604 * nothing for the grace-period kthread to do (as in several CPUs 1605 * raced to awaken, and we lost), and finally don't try to awaken 1606 * a kthread that has not yet been created. 1607 */ 1608 static void rcu_gp_kthread_wake(struct rcu_state *rsp) 1609 { 1610 if (current == rsp->gp_kthread || 1611 !READ_ONCE(rsp->gp_flags) || 1612 !rsp->gp_kthread) 1613 return; 1614 wake_up(&rsp->gp_wq); 1615 } 1616 1617 /* 1618 * If there is room, assign a ->completed number to any callbacks on 1619 * this CPU that have not already been assigned. Also accelerate any 1620 * callbacks that were previously assigned a ->completed number that has 1621 * since proven to be too conservative, which can happen if callbacks get 1622 * assigned a ->completed number while RCU is idle, but with reference to 1623 * a non-root rcu_node structure. This function is idempotent, so it does 1624 * not hurt to call it repeatedly. Returns an flag saying that we should 1625 * awaken the RCU grace-period kthread. 1626 * 1627 * The caller must hold rnp->lock with interrupts disabled. 1628 */ 1629 static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1630 struct rcu_data *rdp) 1631 { 1632 unsigned long c; 1633 int i; 1634 bool ret; 1635 1636 /* If the CPU has no callbacks, nothing to do. */ 1637 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1638 return false; 1639 1640 /* 1641 * Starting from the sublist containing the callbacks most 1642 * recently assigned a ->completed number and working down, find the 1643 * first sublist that is not assignable to an upcoming grace period. 1644 * Such a sublist has something in it (first two tests) and has 1645 * a ->completed number assigned that will complete sooner than 1646 * the ->completed number for newly arrived callbacks (last test). 1647 * 1648 * The key point is that any later sublist can be assigned the 1649 * same ->completed number as the newly arrived callbacks, which 1650 * means that the callbacks in any of these later sublist can be 1651 * grouped into a single sublist, whether or not they have already 1652 * been assigned a ->completed number. 1653 */ 1654 c = rcu_cbs_completed(rsp, rnp); 1655 for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) 1656 if (rdp->nxttail[i] != rdp->nxttail[i - 1] && 1657 !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) 1658 break; 1659 1660 /* 1661 * If there are no sublist for unassigned callbacks, leave. 1662 * At the same time, advance "i" one sublist, so that "i" will 1663 * index into the sublist where all the remaining callbacks should 1664 * be grouped into. 1665 */ 1666 if (++i >= RCU_NEXT_TAIL) 1667 return false; 1668 1669 /* 1670 * Assign all subsequent callbacks' ->completed number to the next 1671 * full grace period and group them all in the sublist initially 1672 * indexed by "i". 1673 */ 1674 for (; i <= RCU_NEXT_TAIL; i++) { 1675 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; 1676 rdp->nxtcompleted[i] = c; 1677 } 1678 /* Record any needed additional grace periods. */ 1679 ret = rcu_start_future_gp(rnp, rdp, NULL); 1680 1681 /* Trace depending on how much we were able to accelerate. 
*/ 1682 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1683 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); 1684 else 1685 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); 1686 return ret; 1687 } 1688 1689 /* 1690 * Move any callbacks whose grace period has completed to the 1691 * RCU_DONE_TAIL sublist, then compact the remaining sublists and 1692 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL 1693 * sublist. This function is idempotent, so it does not hurt to 1694 * invoke it repeatedly. As long as it is not invoked -too- often... 1695 * Returns true if the RCU grace-period kthread needs to be awakened. 1696 * 1697 * The caller must hold rnp->lock with interrupts disabled. 1698 */ 1699 static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1700 struct rcu_data *rdp) 1701 { 1702 int i, j; 1703 1704 /* If the CPU has no callbacks, nothing to do. */ 1705 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1706 return false; 1707 1708 /* 1709 * Find all callbacks whose ->completed numbers indicate that they 1710 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. 1711 */ 1712 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { 1713 if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) 1714 break; 1715 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; 1716 } 1717 /* Clean up any sublist tail pointers that were misordered above. */ 1718 for (j = RCU_WAIT_TAIL; j < i; j++) 1719 rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; 1720 1721 /* Copy down callbacks to fill in empty sublists. */ 1722 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { 1723 if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) 1724 break; 1725 rdp->nxttail[j] = rdp->nxttail[i]; 1726 rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; 1727 } 1728 1729 /* Classify any remaining callbacks. */ 1730 return rcu_accelerate_cbs(rsp, rnp, rdp); 1731 } 1732 1733 /* 1734 * Update CPU-local rcu_data state to record the beginnings and ends of 1735 * grace periods. The caller must hold the ->lock of the leaf rcu_node 1736 * structure corresponding to the current CPU, and must have irqs disabled. 1737 * Returns true if the grace-period kthread needs to be awakened. 1738 */ 1739 static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, 1740 struct rcu_data *rdp) 1741 { 1742 bool ret; 1743 1744 /* Handle the ends of any preceding grace periods first. */ 1745 if (rdp->completed == rnp->completed && 1746 !unlikely(READ_ONCE(rdp->gpwrap))) { 1747 1748 /* No grace period end, so just accelerate recent callbacks. */ 1749 ret = rcu_accelerate_cbs(rsp, rnp, rdp); 1750 1751 } else { 1752 1753 /* Advance callbacks. */ 1754 ret = rcu_advance_cbs(rsp, rnp, rdp); 1755 1756 /* Remember that we saw this grace-period completion. */ 1757 rdp->completed = rnp->completed; 1758 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); 1759 } 1760 1761 if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) { 1762 /* 1763 * If the current grace period is waiting for this CPU, 1764 * set up to detect a quiescent state, otherwise don't 1765 * go looking for one. 
1766 */ 1767 rdp->gpnum = rnp->gpnum; 1768 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1769 rdp->cpu_no_qs.b.norm = true; 1770 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 1771 rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); 1772 zero_cpu_stall_ticks(rdp); 1773 WRITE_ONCE(rdp->gpwrap, false); 1774 } 1775 return ret; 1776 } 1777 1778 static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) 1779 { 1780 unsigned long flags; 1781 bool needwake; 1782 struct rcu_node *rnp; 1783 1784 local_irq_save(flags); 1785 rnp = rdp->mynode; 1786 if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && 1787 rdp->completed == READ_ONCE(rnp->completed) && 1788 !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ 1789 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ 1790 local_irq_restore(flags); 1791 return; 1792 } 1793 smp_mb__after_unlock_lock(); 1794 needwake = __note_gp_changes(rsp, rnp, rdp); 1795 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1796 if (needwake) 1797 rcu_gp_kthread_wake(rsp); 1798 } 1799 1800 static void rcu_gp_slow(struct rcu_state *rsp, int delay) 1801 { 1802 if (delay > 0 && 1803 !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) 1804 schedule_timeout_uninterruptible(delay); 1805 } 1806 1807 /* 1808 * Initialize a new grace period. Return 0 if no grace period required. 1809 */ 1810 static int rcu_gp_init(struct rcu_state *rsp) 1811 { 1812 unsigned long oldmask; 1813 struct rcu_data *rdp; 1814 struct rcu_node *rnp = rcu_get_root(rsp); 1815 1816 WRITE_ONCE(rsp->gp_activity, jiffies); 1817 raw_spin_lock_irq(&rnp->lock); 1818 smp_mb__after_unlock_lock(); 1819 if (!READ_ONCE(rsp->gp_flags)) { 1820 /* Spurious wakeup, tell caller to go back to sleep. */ 1821 raw_spin_unlock_irq(&rnp->lock); 1822 return 0; 1823 } 1824 WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ 1825 1826 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { 1827 /* 1828 * Grace period already in progress, don't start another. 1829 * Not supposed to be able to happen. 1830 */ 1831 raw_spin_unlock_irq(&rnp->lock); 1832 return 0; 1833 } 1834 1835 /* Advance to a new grace period and initialize state. */ 1836 record_gp_stall_check_time(rsp); 1837 /* Record GP times before starting GP, hence smp_store_release(). */ 1838 smp_store_release(&rsp->gpnum, rsp->gpnum + 1); 1839 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1840 raw_spin_unlock_irq(&rnp->lock); 1841 1842 /* 1843 * Apply per-leaf buffered online and offline operations to the 1844 * rcu_node tree. Note that this new grace period need not wait 1845 * for subsequent online CPUs, and that quiescent-state forcing 1846 * will handle subsequent offline CPUs. 1847 */ 1848 rcu_for_each_leaf_node(rsp, rnp) { 1849 rcu_gp_slow(rsp, gp_preinit_delay); 1850 raw_spin_lock_irq(&rnp->lock); 1851 smp_mb__after_unlock_lock(); 1852 if (rnp->qsmaskinit == rnp->qsmaskinitnext && 1853 !rnp->wait_blkd_tasks) { 1854 /* Nothing to do on this leaf rcu_node structure. */ 1855 raw_spin_unlock_irq(&rnp->lock); 1856 continue; 1857 } 1858 1859 /* Record old state, apply changes to ->qsmaskinit field. */ 1860 oldmask = rnp->qsmaskinit; 1861 rnp->qsmaskinit = rnp->qsmaskinitnext; 1862 1863 /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ 1864 if (!oldmask != !rnp->qsmaskinit) { 1865 if (!oldmask) /* First online CPU for this rcu_node. 
*/ 1866 rcu_init_new_rnp(rnp); 1867 else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */ 1868 rnp->wait_blkd_tasks = true; 1869 else /* Last offline CPU and can propagate. */ 1870 rcu_cleanup_dead_rnp(rnp); 1871 } 1872 1873 /* 1874 * If all waited-on tasks from prior grace period are 1875 * done, and if all this rcu_node structure's CPUs are 1876 * still offline, propagate up the rcu_node tree and 1877 * clear ->wait_blkd_tasks. Otherwise, if one of this 1878 * rcu_node structure's CPUs has since come back online, 1879 * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp() 1880 * checks for this, so just call it unconditionally). 1881 */ 1882 if (rnp->wait_blkd_tasks && 1883 (!rcu_preempt_has_tasks(rnp) || 1884 rnp->qsmaskinit)) { 1885 rnp->wait_blkd_tasks = false; 1886 rcu_cleanup_dead_rnp(rnp); 1887 } 1888 1889 raw_spin_unlock_irq(&rnp->lock); 1890 } 1891 1892 /* 1893 * Set the quiescent-state-needed bits in all the rcu_node 1894 * structures for all currently online CPUs in breadth-first order, 1895 * starting from the root rcu_node structure, relying on the layout 1896 * of the tree within the rsp->node[] array. Note that other CPUs 1897 * will access only the leaves of the hierarchy, thus seeing that no 1898 * grace period is in progress, at least until the corresponding 1899 * leaf node has been initialized. In addition, we have excluded 1900 * CPU-hotplug operations. 1901 * 1902 * The grace period cannot complete until the initialization 1903 * process finishes, because this kthread handles both. 1904 */ 1905 rcu_for_each_node_breadth_first(rsp, rnp) { 1906 rcu_gp_slow(rsp, gp_init_delay); 1907 raw_spin_lock_irq(&rnp->lock); 1908 smp_mb__after_unlock_lock(); 1909 rdp = this_cpu_ptr(rsp->rda); 1910 rcu_preempt_check_blocked_tasks(rnp); 1911 rnp->qsmask = rnp->qsmaskinit; 1912 WRITE_ONCE(rnp->gpnum, rsp->gpnum); 1913 if (WARN_ON_ONCE(rnp->completed != rsp->completed)) 1914 WRITE_ONCE(rnp->completed, rsp->completed); 1915 if (rnp == rdp->mynode) 1916 (void)__note_gp_changes(rsp, rnp, rdp); 1917 rcu_preempt_boost_start_gp(rnp); 1918 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1919 rnp->level, rnp->grplo, 1920 rnp->grphi, rnp->qsmask); 1921 raw_spin_unlock_irq(&rnp->lock); 1922 cond_resched_rcu_qs(); 1923 WRITE_ONCE(rsp->gp_activity, jiffies); 1924 } 1925 1926 return 1; 1927 } 1928 1929 /* 1930 * Helper function for wait_event_interruptible_timeout() wakeup 1931 * at force-quiescent-state time. 1932 */ 1933 static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) 1934 { 1935 struct rcu_node *rnp = rcu_get_root(rsp); 1936 1937 /* Someone like call_rcu() requested a force-quiescent-state scan. */ 1938 *gfp = READ_ONCE(rsp->gp_flags); 1939 if (*gfp & RCU_GP_FLAG_FQS) 1940 return true; 1941 1942 /* The current grace period has completed. */ 1943 if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) 1944 return true; 1945 1946 return false; 1947 } 1948 1949 /* 1950 * Do one round of quiescent-state forcing. 1951 */ 1952 static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) 1953 { 1954 bool isidle = false; 1955 unsigned long maxj; 1956 struct rcu_node *rnp = rcu_get_root(rsp); 1957 1958 WRITE_ONCE(rsp->gp_activity, jiffies); 1959 rsp->n_force_qs++; 1960 if (first_time) { 1961 /* Collect dyntick-idle snapshots. 
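 *
 * Illustrative summary: the first pass snapshots each CPU's dynticks
 * counter via dyntick_save_progress_counter(), and later passes use
 * rcu_implicit_dynticks_qs() to check whether that counter has since
 * advanced or the CPU is idle/offline, in which case a quiescent state
 * is reported on that CPU's behalf without disturbing it.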
*/ 1962 if (is_sysidle_rcu_state(rsp)) { 1963 isidle = true; 1964 maxj = jiffies - ULONG_MAX / 4; 1965 } 1966 force_qs_rnp(rsp, dyntick_save_progress_counter, 1967 &isidle, &maxj); 1968 rcu_sysidle_report_gp(rsp, isidle, maxj); 1969 } else { 1970 /* Handle dyntick-idle and offline CPUs. */ 1971 isidle = true; 1972 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); 1973 } 1974 /* Clear flag to prevent immediate re-entry. */ 1975 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1976 raw_spin_lock_irq(&rnp->lock); 1977 smp_mb__after_unlock_lock(); 1978 WRITE_ONCE(rsp->gp_flags, 1979 READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); 1980 raw_spin_unlock_irq(&rnp->lock); 1981 } 1982 } 1983 1984 /* 1985 * Clean up after the old grace period. 1986 */ 1987 static void rcu_gp_cleanup(struct rcu_state *rsp) 1988 { 1989 unsigned long gp_duration; 1990 bool needgp = false; 1991 int nocb = 0; 1992 struct rcu_data *rdp; 1993 struct rcu_node *rnp = rcu_get_root(rsp); 1994 1995 WRITE_ONCE(rsp->gp_activity, jiffies); 1996 raw_spin_lock_irq(&rnp->lock); 1997 smp_mb__after_unlock_lock(); 1998 gp_duration = jiffies - rsp->gp_start; 1999 if (gp_duration > rsp->gp_max) 2000 rsp->gp_max = gp_duration; 2001 2002 /* 2003 * We know the grace period is complete, but to everyone else 2004 * it appears to still be ongoing. But it is also the case 2005 * that to everyone else it looks like there is nothing that 2006 * they can do to advance the grace period. It is therefore 2007 * safe for us to drop the lock in order to mark the grace 2008 * period as completed in all of the rcu_node structures. 2009 */ 2010 raw_spin_unlock_irq(&rnp->lock); 2011 2012 /* 2013 * Propagate new ->completed value to rcu_node structures so 2014 * that other CPUs don't have to wait until the start of the next 2015 * grace period to process their callbacks. This also avoids 2016 * some nasty RCU grace-period initialization races by forcing 2017 * the end of the current grace period to be completely recorded in 2018 * all of the rcu_node structures before the beginning of the next 2019 * grace period is recorded in any of the rcu_node structures. 2020 */ 2021 rcu_for_each_node_breadth_first(rsp, rnp) { 2022 raw_spin_lock_irq(&rnp->lock); 2023 smp_mb__after_unlock_lock(); 2024 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); 2025 WARN_ON_ONCE(rnp->qsmask); 2026 WRITE_ONCE(rnp->completed, rsp->gpnum); 2027 rdp = this_cpu_ptr(rsp->rda); 2028 if (rnp == rdp->mynode) 2029 needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; 2030 /* smp_mb() provided by prior unlock-lock pair. */ 2031 nocb += rcu_future_gp_cleanup(rsp, rnp); 2032 raw_spin_unlock_irq(&rnp->lock); 2033 cond_resched_rcu_qs(); 2034 WRITE_ONCE(rsp->gp_activity, jiffies); 2035 rcu_gp_slow(rsp, gp_cleanup_delay); 2036 } 2037 rnp = rcu_get_root(rsp); 2038 raw_spin_lock_irq(&rnp->lock); 2039 smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ 2040 rcu_nocb_gp_set(rnp, nocb); 2041 2042 /* Declare grace period done. */ 2043 WRITE_ONCE(rsp->completed, rsp->gpnum); 2044 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); 2045 rsp->gp_state = RCU_GP_IDLE; 2046 rdp = this_cpu_ptr(rsp->rda); 2047 /* Advance CBs to reduce false positives below. 
*/ 2048 needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; 2049 if (needgp || cpu_needs_another_gp(rsp, rdp)) { 2050 WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); 2051 trace_rcu_grace_period(rsp->name, 2052 READ_ONCE(rsp->gpnum), 2053 TPS("newreq")); 2054 } 2055 raw_spin_unlock_irq(&rnp->lock); 2056 } 2057 2058 /* 2059 * Body of kthread that handles grace periods. 2060 */ 2061 static int __noreturn rcu_gp_kthread(void *arg) 2062 { 2063 bool first_gp_fqs; 2064 int gf; 2065 unsigned long j; 2066 int ret; 2067 struct rcu_state *rsp = arg; 2068 struct rcu_node *rnp = rcu_get_root(rsp); 2069 2070 rcu_bind_gp_kthread(); 2071 for (;;) { 2072 2073 /* Handle grace-period start. */ 2074 for (;;) { 2075 trace_rcu_grace_period(rsp->name, 2076 READ_ONCE(rsp->gpnum), 2077 TPS("reqwait")); 2078 rsp->gp_state = RCU_GP_WAIT_GPS; 2079 wait_event_interruptible(rsp->gp_wq, 2080 READ_ONCE(rsp->gp_flags) & 2081 RCU_GP_FLAG_INIT); 2082 rsp->gp_state = RCU_GP_DONE_GPS; 2083 /* Locking provides needed memory barrier. */ 2084 if (rcu_gp_init(rsp)) 2085 break; 2086 cond_resched_rcu_qs(); 2087 WRITE_ONCE(rsp->gp_activity, jiffies); 2088 WARN_ON(signal_pending(current)); 2089 trace_rcu_grace_period(rsp->name, 2090 READ_ONCE(rsp->gpnum), 2091 TPS("reqwaitsig")); 2092 } 2093 2094 /* Handle quiescent-state forcing. */ 2095 first_gp_fqs = true; 2096 j = jiffies_till_first_fqs; 2097 if (j > HZ) { 2098 j = HZ; 2099 jiffies_till_first_fqs = HZ; 2100 } 2101 ret = 0; 2102 for (;;) { 2103 if (!ret) 2104 rsp->jiffies_force_qs = jiffies + j; 2105 trace_rcu_grace_period(rsp->name, 2106 READ_ONCE(rsp->gpnum), 2107 TPS("fqswait")); 2108 rsp->gp_state = RCU_GP_WAIT_FQS; 2109 ret = wait_event_interruptible_timeout(rsp->gp_wq, 2110 rcu_gp_fqs_check_wake(rsp, &gf), j); 2111 rsp->gp_state = RCU_GP_DOING_FQS; 2112 /* Locking provides needed memory barriers. */ 2113 /* If grace period done, leave loop. */ 2114 if (!READ_ONCE(rnp->qsmask) && 2115 !rcu_preempt_blocked_readers_cgp(rnp)) 2116 break; 2117 /* If time for quiescent-state forcing, do it. */ 2118 if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || 2119 (gf & RCU_GP_FLAG_FQS)) { 2120 trace_rcu_grace_period(rsp->name, 2121 READ_ONCE(rsp->gpnum), 2122 TPS("fqsstart")); 2123 rcu_gp_fqs(rsp, first_gp_fqs); 2124 first_gp_fqs = false; 2125 trace_rcu_grace_period(rsp->name, 2126 READ_ONCE(rsp->gpnum), 2127 TPS("fqsend")); 2128 cond_resched_rcu_qs(); 2129 WRITE_ONCE(rsp->gp_activity, jiffies); 2130 } else { 2131 /* Deal with stray signal. */ 2132 cond_resched_rcu_qs(); 2133 WRITE_ONCE(rsp->gp_activity, jiffies); 2134 WARN_ON(signal_pending(current)); 2135 trace_rcu_grace_period(rsp->name, 2136 READ_ONCE(rsp->gpnum), 2137 TPS("fqswaitsig")); 2138 } 2139 j = jiffies_till_next_fqs; 2140 if (j > HZ) { 2141 j = HZ; 2142 jiffies_till_next_fqs = HZ; 2143 } else if (j < 1) { 2144 j = 1; 2145 jiffies_till_next_fqs = 1; 2146 } 2147 } 2148 2149 /* Handle grace-period end. */ 2150 rsp->gp_state = RCU_GP_CLEANUP; 2151 rcu_gp_cleanup(rsp); 2152 rsp->gp_state = RCU_GP_CLEANED; 2153 } 2154 } 2155 2156 /* 2157 * Start a new RCU grace period if warranted, re-initializing the hierarchy 2158 * in preparation for detecting the next grace period. The caller must hold 2159 * the root node's ->lock and hard irqs must be disabled. 2160 * 2161 * Note that it is legal for a dying CPU (which is marked as offline) to 2162 * invoke this function. This can happen when the dying CPU reports its 2163 * quiescent state. 2164 * 2165 * Returns true if the grace-period kthread must be awakened. 
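 *
 * A minimal sketch of the deferred-wakeup pattern (compare
 * __call_rcu_core() later in this file; "flags" and "needwake" are the
 * caller's locals, and rcu_start_gp() below lands here):
 *
 *	raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
 *	smp_mb__after_unlock_lock();
 *	needwake = rcu_start_gp(rsp);
 *	raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
 *	if (needwake)
 *		rcu_gp_kthread_wake(rsp);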
2166 */ 2167 static bool 2168 rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 2169 struct rcu_data *rdp) 2170 { 2171 if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { 2172 /* 2173 * Either we have not yet spawned the grace-period 2174 * task, this CPU does not need another grace period, 2175 * or a grace period is already in progress. 2176 * Either way, don't start a new grace period. 2177 */ 2178 return false; 2179 } 2180 WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); 2181 trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), 2182 TPS("newreq")); 2183 2184 /* 2185 * We can't do wakeups while holding the rnp->lock, as that 2186 * could cause possible deadlocks with the rq->lock. Defer 2187 * the wakeup to our caller. 2188 */ 2189 return true; 2190 } 2191 2192 /* 2193 * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's 2194 * callbacks. Note that rcu_start_gp_advanced() cannot do this because it 2195 * is invoked indirectly from rcu_advance_cbs(), which would result in 2196 * endless recursion -- or would do so if it wasn't for the self-deadlock 2197 * that is encountered beforehand. 2198 * 2199 * Returns true if the grace-period kthread needs to be awakened. 2200 */ 2201 static bool rcu_start_gp(struct rcu_state *rsp) 2202 { 2203 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 2204 struct rcu_node *rnp = rcu_get_root(rsp); 2205 bool ret = false; 2206 2207 /* 2208 * If there is no grace period in progress right now, any 2209 * callbacks we have up to this point will be satisfied by the 2210 * next grace period. Also, advancing the callbacks reduces the 2211 * probability of false positives from cpu_needs_another_gp() 2212 * resulting in pointless grace periods. So, advance callbacks 2213 * then start the grace period! 2214 */ 2215 ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; 2216 ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; 2217 return ret; 2218 } 2219 2220 /* 2221 * Report a full set of quiescent states to the specified rcu_state 2222 * data structure. This involves cleaning up after the prior grace 2223 * period and letting rcu_start_gp() start up the next grace period 2224 * if one is needed. Note that the caller must hold rnp->lock, which 2225 * is released before return. 2226 */ 2227 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 2228 __releases(rcu_get_root(rsp)->lock) 2229 { 2230 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 2231 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2232 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 2233 rcu_gp_kthread_wake(rsp); 2234 } 2235 2236 /* 2237 * Similar to rcu_report_qs_rdp(), for which it is a helper function. 2238 * Allows quiescent states for a group of CPUs to be reported at one go 2239 * to the specified rcu_node structure, though all the CPUs in the group 2240 * must be represented by the same rcu_node structure (which need not be a 2241 * leaf rcu_node structure, though it often will be). The gps parameter 2242 * is the grace-period snapshot, which means that the quiescent states 2243 * are valid only if rnp->gpnum is equal to gps. That structure's lock 2244 * must be held upon entry, and it is released before return. 2245 */ 2246 static void 2247 rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, 2248 struct rcu_node *rnp, unsigned long gps, unsigned long flags) 2249 __releases(rnp->lock) 2250 { 2251 unsigned long oldmask = 0; 2252 struct rcu_node *rnp_c; 2253 2254 /* Walk up the rcu_node hierarchy. 
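 *
 * For illustration, with a two-level tree: clearing the last ->qsmask
 * bit in a leaf rcu_node structure sets "mask" to that leaf's ->grpmask
 * and moves the loop below to the root; once the root's ->qsmask is also
 * zero (and no readers are blocking the grace period), the loop exits
 * holding the root's ->lock, and rcu_report_qs_rsp() then wakes the
 * grace-period kthread to clean up.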
*/ 2255 for (;;) { 2256 if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { 2257 2258 /* 2259 * Our bit has already been cleared, or the 2260 * relevant grace period is already over, so done. 2261 */ 2262 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2263 return; 2264 } 2265 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ 2266 rnp->qsmask &= ~mask; 2267 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, 2268 mask, rnp->qsmask, rnp->level, 2269 rnp->grplo, rnp->grphi, 2270 !!rnp->gp_tasks); 2271 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 2272 2273 /* Other bits still set at this level, so done. */ 2274 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2275 return; 2276 } 2277 mask = rnp->grpmask; 2278 if (rnp->parent == NULL) { 2279 2280 /* No more levels. Exit loop holding root lock. */ 2281 2282 break; 2283 } 2284 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2285 rnp_c = rnp; 2286 rnp = rnp->parent; 2287 raw_spin_lock_irqsave(&rnp->lock, flags); 2288 smp_mb__after_unlock_lock(); 2289 oldmask = rnp_c->qsmask; 2290 } 2291 2292 /* 2293 * Get here if we are the last CPU to pass through a quiescent 2294 * state for this grace period. Invoke rcu_report_qs_rsp() 2295 * to clean up and start the next grace period if one is needed. 2296 */ 2297 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ 2298 } 2299 2300 /* 2301 * Record a quiescent state for all tasks that were previously queued 2302 * on the specified rcu_node structure and that were blocking the current 2303 * RCU grace period. The caller must hold the specified rnp->lock with 2304 * irqs disabled, and this lock is released upon return, but irqs remain 2305 * disabled. 2306 */ 2307 static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, 2308 struct rcu_node *rnp, unsigned long flags) 2309 __releases(rnp->lock) 2310 { 2311 unsigned long gps; 2312 unsigned long mask; 2313 struct rcu_node *rnp_p; 2314 2315 if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || 2316 rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 2317 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2318 return; /* Still need more quiescent states! */ 2319 } 2320 2321 rnp_p = rnp->parent; 2322 if (rnp_p == NULL) { 2323 /* 2324 * Only one rcu_node structure in the tree, so don't 2325 * try to report up to its nonexistent parent! 2326 */ 2327 rcu_report_qs_rsp(rsp, flags); 2328 return; 2329 } 2330 2331 /* Report up the rest of the hierarchy, tracking current ->gpnum. */ 2332 gps = rnp->gpnum; 2333 mask = rnp->grpmask; 2334 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2335 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ 2336 smp_mb__after_unlock_lock(); 2337 rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); 2338 } 2339 2340 /* 2341 * Record a quiescent state for the specified CPU to that CPU's rcu_data 2342 * structure. This must be either called from the specified CPU, or 2343 * called when the specified CPU is known to be offline (and when it is 2344 * also known that no other CPU is concurrently trying to help the offline 2345 * CPU). The lastcomp argument is used to make sure we are still in the 2346 * grace period of interest. We don't want to end the current grace period 2347 * based on quiescent states detected in an earlier grace period! 
2348 */ 2349 static void 2350 rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) 2351 { 2352 unsigned long flags; 2353 unsigned long mask; 2354 bool needwake; 2355 struct rcu_node *rnp; 2356 2357 rnp = rdp->mynode; 2358 raw_spin_lock_irqsave(&rnp->lock, flags); 2359 smp_mb__after_unlock_lock(); 2360 if ((rdp->cpu_no_qs.b.norm && 2361 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || 2362 rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || 2363 rdp->gpwrap) { 2364 2365 /* 2366 * The grace period in which this quiescent state was 2367 * recorded has ended, so don't report it upwards. 2368 * We will instead need a new quiescent state that lies 2369 * within the current grace period. 2370 */ 2371 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ 2372 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 2373 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2374 return; 2375 } 2376 mask = rdp->grpmask; 2377 if ((rnp->qsmask & mask) == 0) { 2378 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2379 } else { 2380 rdp->core_needs_qs = 0; 2381 2382 /* 2383 * This GP can't end until cpu checks in, so all of our 2384 * callbacks can be processed during the next GP. 2385 */ 2386 needwake = rcu_accelerate_cbs(rsp, rnp, rdp); 2387 2388 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); 2389 /* ^^^ Released rnp->lock */ 2390 if (needwake) 2391 rcu_gp_kthread_wake(rsp); 2392 } 2393 } 2394 2395 /* 2396 * Check to see if there is a new grace period of which this CPU 2397 * is not yet aware, and if so, set up local rcu_data state for it. 2398 * Otherwise, see if this CPU has just passed through its first 2399 * quiescent state for this grace period, and record that fact if so. 2400 */ 2401 static void 2402 rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) 2403 { 2404 /* Check for grace-period ends and beginnings. */ 2405 note_gp_changes(rsp, rdp); 2406 2407 /* 2408 * Does this CPU still need to do its part for current grace period? 2409 * If no, return and let the other CPUs do their part as well. 2410 */ 2411 if (!rdp->core_needs_qs) 2412 return; 2413 2414 /* 2415 * Was there a quiescent state since the beginning of the grace 2416 * period? If no, then exit and wait for the next call. 2417 */ 2418 if (rdp->cpu_no_qs.b.norm && 2419 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) 2420 return; 2421 2422 /* 2423 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 2424 * judge of that). 2425 */ 2426 rcu_report_qs_rdp(rdp->cpu, rsp, rdp); 2427 } 2428 2429 /* 2430 * Send the specified CPU's RCU callbacks to the orphanage. The 2431 * specified CPU must be offline, and the caller must hold the 2432 * ->orphan_lock. 2433 */ 2434 static void 2435 rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 2436 struct rcu_node *rnp, struct rcu_data *rdp) 2437 { 2438 /* No-CBs CPUs do not have orphanable callbacks. */ 2439 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) 2440 return; 2441 2442 /* 2443 * Orphan the callbacks. First adjust the counts. This is safe 2444 * because _rcu_barrier() excludes CPU-hotplug operations, so it 2445 * cannot be running now. Thus no memory barrier is required. 
2446 */ 2447 if (rdp->nxtlist != NULL) { 2448 rsp->qlen_lazy += rdp->qlen_lazy; 2449 rsp->qlen += rdp->qlen; 2450 rdp->n_cbs_orphaned += rdp->qlen; 2451 rdp->qlen_lazy = 0; 2452 WRITE_ONCE(rdp->qlen, 0); 2453 } 2454 2455 /* 2456 * Next, move those callbacks still needing a grace period to 2457 * the orphanage, where some other CPU will pick them up. 2458 * Some of the callbacks might have gone partway through a grace 2459 * period, but that is too bad. They get to start over because we 2460 * cannot assume that grace periods are synchronized across CPUs. 2461 * We don't bother updating the ->nxttail[] array yet, instead 2462 * we just reset the whole thing later on. 2463 */ 2464 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { 2465 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; 2466 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; 2467 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 2468 } 2469 2470 /* 2471 * Then move the ready-to-invoke callbacks to the orphanage, 2472 * where some other CPU will pick them up. These will not be 2473 * required to pass though another grace period: They are done. 2474 */ 2475 if (rdp->nxtlist != NULL) { 2476 *rsp->orphan_donetail = rdp->nxtlist; 2477 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; 2478 } 2479 2480 /* 2481 * Finally, initialize the rcu_data structure's list to empty and 2482 * disallow further callbacks on this CPU. 2483 */ 2484 init_callback_list(rdp); 2485 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2486 } 2487 2488 /* 2489 * Adopt the RCU callbacks from the specified rcu_state structure's 2490 * orphanage. The caller must hold the ->orphan_lock. 2491 */ 2492 static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) 2493 { 2494 int i; 2495 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 2496 2497 /* No-CBs CPUs are handled specially. */ 2498 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || 2499 rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) 2500 return; 2501 2502 /* Do the accounting first. */ 2503 rdp->qlen_lazy += rsp->qlen_lazy; 2504 rdp->qlen += rsp->qlen; 2505 rdp->n_cbs_adopted += rsp->qlen; 2506 if (rsp->qlen_lazy != rsp->qlen) 2507 rcu_idle_count_callbacks_posted(); 2508 rsp->qlen_lazy = 0; 2509 rsp->qlen = 0; 2510 2511 /* 2512 * We do not need a memory barrier here because the only way we 2513 * can get here if there is an rcu_barrier() in flight is if 2514 * we are the task doing the rcu_barrier(). 2515 */ 2516 2517 /* First adopt the ready-to-invoke callbacks. */ 2518 if (rsp->orphan_donelist != NULL) { 2519 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; 2520 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; 2521 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) 2522 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) 2523 rdp->nxttail[i] = rsp->orphan_donetail; 2524 rsp->orphan_donelist = NULL; 2525 rsp->orphan_donetail = &rsp->orphan_donelist; 2526 } 2527 2528 /* And then adopt the callbacks that still need a grace period. */ 2529 if (rsp->orphan_nxtlist != NULL) { 2530 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; 2531 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; 2532 rsp->orphan_nxtlist = NULL; 2533 rsp->orphan_nxttail = &rsp->orphan_nxtlist; 2534 } 2535 } 2536 2537 /* 2538 * Trace the fact that this CPU is going offline. 
2539 */ 2540 static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 2541 { 2542 RCU_TRACE(unsigned long mask); 2543 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); 2544 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); 2545 2546 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 2547 return; 2548 2549 RCU_TRACE(mask = rdp->grpmask); 2550 trace_rcu_grace_period(rsp->name, 2551 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 2552 TPS("cpuofl")); 2553 } 2554 2555 /* 2556 * All CPUs for the specified rcu_node structure have gone offline, 2557 * and all tasks that were preempted within an RCU read-side critical 2558 * section while running on one of those CPUs have since exited their RCU 2559 * read-side critical section. Some other CPU is reporting this fact with 2560 * the specified rcu_node structure's ->lock held and interrupts disabled. 2561 * This function therefore goes up the tree of rcu_node structures, 2562 * clearing the corresponding bits in the ->qsmaskinit fields. Note that 2563 * the leaf rcu_node structure's ->qsmaskinit field has already been 2564 * updated 2565 * 2566 * This function does check that the specified rcu_node structure has 2567 * all CPUs offline and no blocked tasks, so it is OK to invoke it 2568 * prematurely. That said, invoking it after the fact will cost you 2569 * a needless lock acquisition. So once it has done its work, don't 2570 * invoke it again. 2571 */ 2572 static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) 2573 { 2574 long mask; 2575 struct rcu_node *rnp = rnp_leaf; 2576 2577 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || 2578 rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) 2579 return; 2580 for (;;) { 2581 mask = rnp->grpmask; 2582 rnp = rnp->parent; 2583 if (!rnp) 2584 break; 2585 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2586 smp_mb__after_unlock_lock(); /* GP memory ordering. */ 2587 rnp->qsmaskinit &= ~mask; 2588 rnp->qsmask &= ~mask; 2589 if (rnp->qsmaskinit) { 2590 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2591 return; 2592 } 2593 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2594 } 2595 } 2596 2597 /* 2598 * The CPU is exiting the idle loop into the arch_cpu_idle_dead() 2599 * function. We now remove it from the rcu_node tree's ->qsmaskinit 2600 * bit masks. 2601 */ 2602 static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) 2603 { 2604 unsigned long flags; 2605 unsigned long mask; 2606 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2607 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 2608 2609 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 2610 return; 2611 2612 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ 2613 mask = rdp->grpmask; 2614 raw_spin_lock_irqsave(&rnp->lock, flags); 2615 smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ 2616 rnp->qsmaskinitnext &= ~mask; 2617 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2618 } 2619 2620 /* 2621 * The CPU has been completely removed, and some other CPU is reporting 2622 * this fact from process context. Do the remainder of the cleanup, 2623 * including orphaning the outgoing CPU's RCU callbacks, and also 2624 * adopting them. There can only be one CPU hotplug operation at a time, 2625 * so no other CPU can be attempting to update rcu_cpu_kthread_task. 2626 */ 2627 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2628 { 2629 unsigned long flags; 2630 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2631 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. 
*/ 2632 2633 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 2634 return; 2635 2636 /* Adjust any no-longer-needed kthreads. */ 2637 rcu_boost_kthread_setaffinity(rnp, -1); 2638 2639 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2640 raw_spin_lock_irqsave(&rsp->orphan_lock, flags); 2641 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2642 rcu_adopt_orphan_cbs(rsp, flags); 2643 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); 2644 2645 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2646 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2647 cpu, rdp->qlen, rdp->nxtlist); 2648 } 2649 2650 /* 2651 * Invoke any RCU callbacks that have made it to the end of their grace 2652 * period. Thottle as specified by rdp->blimit. 2653 */ 2654 static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) 2655 { 2656 unsigned long flags; 2657 struct rcu_head *next, *list, **tail; 2658 long bl, count, count_lazy; 2659 int i; 2660 2661 /* If no callbacks are ready, just return. */ 2662 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 2663 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); 2664 trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), 2665 need_resched(), is_idle_task(current), 2666 rcu_is_callbacks_kthread()); 2667 return; 2668 } 2669 2670 /* 2671 * Extract the list of ready callbacks, disabling to prevent 2672 * races with call_rcu() from interrupt handlers. 2673 */ 2674 local_irq_save(flags); 2675 WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); 2676 bl = rdp->blimit; 2677 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); 2678 list = rdp->nxtlist; 2679 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 2680 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 2681 tail = rdp->nxttail[RCU_DONE_TAIL]; 2682 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) 2683 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) 2684 rdp->nxttail[i] = &rdp->nxtlist; 2685 local_irq_restore(flags); 2686 2687 /* Invoke callbacks. */ 2688 count = count_lazy = 0; 2689 while (list) { 2690 next = list->next; 2691 prefetch(next); 2692 debug_rcu_head_unqueue(list); 2693 if (__rcu_reclaim(rsp->name, list)) 2694 count_lazy++; 2695 list = next; 2696 /* Stop only if limit reached and CPU has something to do. */ 2697 if (++count >= bl && 2698 (need_resched() || 2699 (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) 2700 break; 2701 } 2702 2703 local_irq_save(flags); 2704 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), 2705 is_idle_task(current), 2706 rcu_is_callbacks_kthread()); 2707 2708 /* Update count, and requeue any remaining callbacks. */ 2709 if (list != NULL) { 2710 *tail = rdp->nxtlist; 2711 rdp->nxtlist = list; 2712 for (i = 0; i < RCU_NEXT_SIZE; i++) 2713 if (&rdp->nxtlist == rdp->nxttail[i]) 2714 rdp->nxttail[i] = tail; 2715 else 2716 break; 2717 } 2718 smp_mb(); /* List handling before counting for rcu_barrier(). */ 2719 rdp->qlen_lazy -= count_lazy; 2720 WRITE_ONCE(rdp->qlen, rdp->qlen - count); 2721 rdp->n_cbs_invoked += count; 2722 2723 /* Reinstate batch limit if we have worked down the excess. */ 2724 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 2725 rdp->blimit = blimit; 2726 2727 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. 
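 *
 * Illustrative walk-through of the batch-limit hysteresis: when a burst
 * of call_rcu() invocations pushes ->qlen more than qhimark above
 * ->qlen_last_fqs_check, __call_rcu_core() (later in this file) may set
 * ->blimit to LONG_MAX so that this function runs unthrottled. Once
 * ->qlen drains to qlowmark or below, the code above restores the normal
 * ->blimit, and the checks below re-arm ->qlen_last_fqs_check so that a
 * future burst is detected again.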
*/ 2728 if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { 2729 rdp->qlen_last_fqs_check = 0; 2730 rdp->n_force_qs_snap = rsp->n_force_qs; 2731 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 2732 rdp->qlen_last_fqs_check = rdp->qlen; 2733 WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); 2734 2735 local_irq_restore(flags); 2736 2737 /* Re-invoke RCU core processing if there are callbacks remaining. */ 2738 if (cpu_has_callbacks_ready_to_invoke(rdp)) 2739 invoke_rcu_core(); 2740 } 2741 2742 /* 2743 * Check to see if this CPU is in a non-context-switch quiescent state 2744 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 2745 * Also schedule RCU core processing. 2746 * 2747 * This function must be called from hardirq context. It is normally 2748 * invoked from the scheduling-clock interrupt. If rcu_pending returns 2749 * false, there is no point in invoking rcu_check_callbacks(). 2750 */ 2751 void rcu_check_callbacks(int user) 2752 { 2753 trace_rcu_utilization(TPS("Start scheduler-tick")); 2754 increment_cpu_stall_ticks(); 2755 if (user || rcu_is_cpu_rrupt_from_idle()) { 2756 2757 /* 2758 * Get here if this CPU took its interrupt from user 2759 * mode or from the idle loop, and if this is not a 2760 * nested interrupt. In this case, the CPU is in 2761 * a quiescent state, so note it. 2762 * 2763 * No memory barrier is required here because both 2764 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local 2765 * variables that other CPUs neither access nor modify, 2766 * at least not while the corresponding CPU is online. 2767 */ 2768 2769 rcu_sched_qs(); 2770 rcu_bh_qs(); 2771 2772 } else if (!in_softirq()) { 2773 2774 /* 2775 * Get here if this CPU did not take its interrupt from 2776 * softirq, in other words, if it is not interrupting 2777 * a rcu_bh read-side critical section. This is an _bh 2778 * critical section, so note it. 2779 */ 2780 2781 rcu_bh_qs(); 2782 } 2783 rcu_preempt_check_callbacks(); 2784 if (rcu_pending()) 2785 invoke_rcu_core(); 2786 if (user) 2787 rcu_note_voluntary_context_switch(current); 2788 trace_rcu_utilization(TPS("End scheduler-tick")); 2789 } 2790 2791 /* 2792 * Scan the leaf rcu_node structures, processing dyntick state for any that 2793 * have not yet encountered a quiescent state, using the function specified. 2794 * Also initiate boosting for any threads blocked on the root rcu_node. 2795 * 2796 * The caller must have suppressed start of new grace periods. 2797 */ 2798 static void force_qs_rnp(struct rcu_state *rsp, 2799 int (*f)(struct rcu_data *rsp, bool *isidle, 2800 unsigned long *maxj), 2801 bool *isidle, unsigned long *maxj) 2802 { 2803 unsigned long bit; 2804 int cpu; 2805 unsigned long flags; 2806 unsigned long mask; 2807 struct rcu_node *rnp; 2808 2809 rcu_for_each_leaf_node(rsp, rnp) { 2810 cond_resched_rcu_qs(); 2811 mask = 0; 2812 raw_spin_lock_irqsave(&rnp->lock, flags); 2813 smp_mb__after_unlock_lock(); 2814 if (rnp->qsmask == 0) { 2815 if (rcu_state_p == &rcu_sched_state || 2816 rsp != rcu_state_p || 2817 rcu_preempt_blocked_readers_cgp(rnp)) { 2818 /* 2819 * No point in scanning bits because they 2820 * are all zero. But we might need to 2821 * priority-boost blocked readers. 2822 */ 2823 rcu_initiate_boost(rnp, flags); 2824 /* rcu_initiate_boost() releases rnp->lock */ 2825 continue; 2826 } 2827 if (rnp->parent && 2828 (rnp->parent->qsmask & rnp->grpmask)) { 2829 /* 2830 * Race between grace-period 2831 * initialization and task exiting RCU 2832 * read-side critical section: Report. 
2833 */ 2834 rcu_report_unblock_qs_rnp(rsp, rnp, flags); 2835 /* rcu_report_unblock_qs_rnp() rlses ->lock */ 2836 continue; 2837 } 2838 } 2839 cpu = rnp->grplo; 2840 bit = 1; 2841 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2842 if ((rnp->qsmask & bit) != 0) { 2843 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) 2844 mask |= bit; 2845 } 2846 } 2847 if (mask != 0) { 2848 /* Idle/offline CPUs, report (releases rnp->lock. */ 2849 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); 2850 } else { 2851 /* Nothing to do here, so just drop the lock. */ 2852 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2853 } 2854 } 2855 } 2856 2857 /* 2858 * Force quiescent states on reluctant CPUs, and also detect which 2859 * CPUs are in dyntick-idle mode. 2860 */ 2861 static void force_quiescent_state(struct rcu_state *rsp) 2862 { 2863 unsigned long flags; 2864 bool ret; 2865 struct rcu_node *rnp; 2866 struct rcu_node *rnp_old = NULL; 2867 2868 /* Funnel through hierarchy to reduce memory contention. */ 2869 rnp = __this_cpu_read(rsp->rda->mynode); 2870 for (; rnp != NULL; rnp = rnp->parent) { 2871 ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || 2872 !raw_spin_trylock(&rnp->fqslock); 2873 if (rnp_old != NULL) 2874 raw_spin_unlock(&rnp_old->fqslock); 2875 if (ret) { 2876 rsp->n_force_qs_lh++; 2877 return; 2878 } 2879 rnp_old = rnp; 2880 } 2881 /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ 2882 2883 /* Reached the root of the rcu_node tree, acquire lock. */ 2884 raw_spin_lock_irqsave(&rnp_old->lock, flags); 2885 smp_mb__after_unlock_lock(); 2886 raw_spin_unlock(&rnp_old->fqslock); 2887 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2888 rsp->n_force_qs_lh++; 2889 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2890 return; /* Someone beat us to it. */ 2891 } 2892 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2893 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2894 rcu_gp_kthread_wake(rsp); 2895 } 2896 2897 /* 2898 * This does the RCU core processing work for the specified rcu_state 2899 * and rcu_data structures. This may be called only from the CPU to 2900 * whom the rdp belongs. 2901 */ 2902 static void 2903 __rcu_process_callbacks(struct rcu_state *rsp) 2904 { 2905 unsigned long flags; 2906 bool needwake; 2907 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 2908 2909 WARN_ON_ONCE(rdp->beenonline == 0); 2910 2911 /* Update RCU state based on any recent quiescent states. */ 2912 rcu_check_quiescent_state(rsp, rdp); 2913 2914 /* Does this CPU require a not-yet-started grace period? */ 2915 local_irq_save(flags); 2916 if (cpu_needs_another_gp(rsp, rdp)) { 2917 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ 2918 needwake = rcu_start_gp(rsp); 2919 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 2920 if (needwake) 2921 rcu_gp_kthread_wake(rsp); 2922 } else { 2923 local_irq_restore(flags); 2924 } 2925 2926 /* If there are callbacks ready, invoke them. */ 2927 if (cpu_has_callbacks_ready_to_invoke(rdp)) 2928 invoke_rcu_callbacks(rsp, rdp); 2929 2930 /* Do any needed deferred wakeups of rcuo kthreads. */ 2931 do_nocb_deferred_wakeup(rdp); 2932 } 2933 2934 /* 2935 * Do RCU core processing for the current CPU. 
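 *
 * Note: this function is the RCU_SOFTIRQ handler. invoke_rcu_core()
 * below raises that softirq, and the handler is presumably registered
 * at boot time (in rcu_init(), outside this excerpt) along the lines of:
 *
 *	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);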
2936 */ 2937 static void rcu_process_callbacks(struct softirq_action *unused) 2938 { 2939 struct rcu_state *rsp; 2940 2941 if (cpu_is_offline(smp_processor_id())) 2942 return; 2943 trace_rcu_utilization(TPS("Start RCU core")); 2944 for_each_rcu_flavor(rsp) 2945 __rcu_process_callbacks(rsp); 2946 trace_rcu_utilization(TPS("End RCU core")); 2947 } 2948 2949 /* 2950 * Schedule RCU callback invocation. If the specified type of RCU 2951 * does not support RCU priority boosting, just do a direct call, 2952 * otherwise wake up the per-CPU kernel kthread. Note that because we 2953 * are running on the current CPU with softirqs disabled, the 2954 * rcu_cpu_kthread_task cannot disappear out from under us. 2955 */ 2956 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 2957 { 2958 if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) 2959 return; 2960 if (likely(!rsp->boost)) { 2961 rcu_do_batch(rsp, rdp); 2962 return; 2963 } 2964 invoke_rcu_callbacks_kthread(); 2965 } 2966 2967 static void invoke_rcu_core(void) 2968 { 2969 if (cpu_online(smp_processor_id())) 2970 raise_softirq(RCU_SOFTIRQ); 2971 } 2972 2973 /* 2974 * Handle any core-RCU processing required by a call_rcu() invocation. 2975 */ 2976 static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, 2977 struct rcu_head *head, unsigned long flags) 2978 { 2979 bool needwake; 2980 2981 /* 2982 * If called from an extended quiescent state, invoke the RCU 2983 * core in order to force a re-evaluation of RCU's idleness. 2984 */ 2985 if (!rcu_is_watching()) 2986 invoke_rcu_core(); 2987 2988 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 2989 if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) 2990 return; 2991 2992 /* 2993 * Force the grace period if too many callbacks or too long waiting. 2994 * Enforce hysteresis, and don't invoke force_quiescent_state() 2995 * if some other CPU has recently done so. Also, don't bother 2996 * invoking force_quiescent_state() if the newly enqueued callback 2997 * is the only one waiting for a grace period to complete. 2998 */ 2999 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 3000 3001 /* Are we ignoring a completed grace period? */ 3002 note_gp_changes(rsp, rdp); 3003 3004 /* Start a new grace period if one not already started. */ 3005 if (!rcu_gp_in_progress(rsp)) { 3006 struct rcu_node *rnp_root = rcu_get_root(rsp); 3007 3008 raw_spin_lock(&rnp_root->lock); 3009 smp_mb__after_unlock_lock(); 3010 needwake = rcu_start_gp(rsp); 3011 raw_spin_unlock(&rnp_root->lock); 3012 if (needwake) 3013 rcu_gp_kthread_wake(rsp); 3014 } else { 3015 /* Give the grace period a kick. */ 3016 rdp->blimit = LONG_MAX; 3017 if (rsp->n_force_qs == rdp->n_force_qs_snap && 3018 *rdp->nxttail[RCU_DONE_TAIL] != head) 3019 force_quiescent_state(rsp); 3020 rdp->n_force_qs_snap = rsp->n_force_qs; 3021 rdp->qlen_last_fqs_check = rdp->qlen; 3022 } 3023 } 3024 } 3025 3026 /* 3027 * RCU callback function to leak a callback. 3028 */ 3029 static void rcu_leak_callback(struct rcu_head *rhp) 3030 { 3031 } 3032 3033 /* 3034 * Helper function for call_rcu() and friends. The cpu argument will 3035 * normally be -1, indicating "currently running CPU". It may specify 3036 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() 3037 * is expected to specify a CPU. 
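 *
 * A minimal sketch of typical use via the public wrappers below
 * ("struct foo", "foo_reclaim", and "fp" are made-up names):
 *
 *	struct foo {
 *		struct rcu_head rh;
 *		int data;
 *	};
 *
 *	static void foo_reclaim(struct rcu_head *rhp)
 *	{
 *		kfree(container_of(rhp, struct foo, rh));
 *	}
 *
 *	call_rcu_sched(&fp->rh, foo_reclaim);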
3038 */ 3039 static void 3040 __call_rcu(struct rcu_head *head, rcu_callback_t func, 3041 struct rcu_state *rsp, int cpu, bool lazy) 3042 { 3043 unsigned long flags; 3044 struct rcu_data *rdp; 3045 3046 WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */ 3047 if (debug_rcu_head_queue(head)) { 3048 /* Probable double call_rcu(), so leak the callback. */ 3049 WRITE_ONCE(head->func, rcu_leak_callback); 3050 WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); 3051 return; 3052 } 3053 head->func = func; 3054 head->next = NULL; 3055 3056 /* 3057 * Opportunistically note grace-period endings and beginnings. 3058 * Note that we might see a beginning right after we see an 3059 * end, but never vice versa, since this CPU has to pass through 3060 * a quiescent state betweentimes. 3061 */ 3062 local_irq_save(flags); 3063 rdp = this_cpu_ptr(rsp->rda); 3064 3065 /* Add the callback to our list. */ 3066 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { 3067 int offline; 3068 3069 if (cpu != -1) 3070 rdp = per_cpu_ptr(rsp->rda, cpu); 3071 if (likely(rdp->mynode)) { 3072 /* Post-boot, so this should be for a no-CBs CPU. */ 3073 offline = !__call_rcu_nocb(rdp, head, lazy, flags); 3074 WARN_ON_ONCE(offline); 3075 /* Offline CPU, _call_rcu() illegal, leak callback. */ 3076 local_irq_restore(flags); 3077 return; 3078 } 3079 /* 3080 * Very early boot, before rcu_init(). Initialize if needed 3081 * and then drop through to queue the callback. 3082 */ 3083 BUG_ON(cpu != -1); 3084 WARN_ON_ONCE(!rcu_is_watching()); 3085 if (!likely(rdp->nxtlist)) 3086 init_default_callback_list(rdp); 3087 } 3088 WRITE_ONCE(rdp->qlen, rdp->qlen + 1); 3089 if (lazy) 3090 rdp->qlen_lazy++; 3091 else 3092 rcu_idle_count_callbacks_posted(); 3093 smp_mb(); /* Count before adding callback for rcu_barrier(). */ 3094 *rdp->nxttail[RCU_NEXT_TAIL] = head; 3095 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 3096 3097 if (__is_kfree_rcu_offset((unsigned long)func)) 3098 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 3099 rdp->qlen_lazy, rdp->qlen); 3100 else 3101 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); 3102 3103 /* Go handle any RCU core processing required. */ 3104 __call_rcu_core(rsp, rdp, head, flags); 3105 local_irq_restore(flags); 3106 } 3107 3108 /* 3109 * Queue an RCU-sched callback for invocation after a grace period. 3110 */ 3111 void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) 3112 { 3113 __call_rcu(head, func, &rcu_sched_state, -1, 0); 3114 } 3115 EXPORT_SYMBOL_GPL(call_rcu_sched); 3116 3117 /* 3118 * Queue an RCU callback for invocation after a quicker grace period. 3119 */ 3120 void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) 3121 { 3122 __call_rcu(head, func, &rcu_bh_state, -1, 0); 3123 } 3124 EXPORT_SYMBOL_GPL(call_rcu_bh); 3125 3126 /* 3127 * Queue an RCU callback for lazy invocation after a grace period. 3128 * This will likely be later named something like "call_rcu_lazy()", 3129 * but this change will require some way of tagging the lazy RCU 3130 * callbacks in the list of pending callbacks. Until then, this 3131 * function may only be called from __kfree_rcu(). 
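 *
 * Callers normally reach this via the kfree_rcu() macro rather than
 * directly. For example, given a structure embedding a struct rcu_head
 * field named "rh" in an object pointed to by "fp" (made-up names):
 *
 *	kfree_rcu(fp, rh);
 *
 * which queues a lazy callback that kfree()s the enclosing structure
 * after a grace period.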
3132 */ 3133 void kfree_call_rcu(struct rcu_head *head, 3134 rcu_callback_t func) 3135 { 3136 __call_rcu(head, func, rcu_state_p, -1, 1); 3137 } 3138 EXPORT_SYMBOL_GPL(kfree_call_rcu); 3139 3140 /* 3141 * Because a context switch is a grace period for RCU-sched and RCU-bh, 3142 * any blocking grace-period wait automatically implies a grace period 3143 * if there is only one CPU online at any point time during execution 3144 * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to 3145 * occasionally incorrectly indicate that there are multiple CPUs online 3146 * when there was in fact only one the whole time, as this just adds 3147 * some overhead: RCU still operates correctly. 3148 */ 3149 static inline int rcu_blocking_is_gp(void) 3150 { 3151 int ret; 3152 3153 might_sleep(); /* Check for RCU read-side critical section. */ 3154 preempt_disable(); 3155 ret = num_online_cpus() <= 1; 3156 preempt_enable(); 3157 return ret; 3158 } 3159 3160 /** 3161 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 3162 * 3163 * Control will return to the caller some time after a full rcu-sched 3164 * grace period has elapsed, in other words after all currently executing 3165 * rcu-sched read-side critical sections have completed. These read-side 3166 * critical sections are delimited by rcu_read_lock_sched() and 3167 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), 3168 * local_irq_disable(), and so on may be used in place of 3169 * rcu_read_lock_sched(). 3170 * 3171 * This means that all preempt_disable code sequences, including NMI and 3172 * non-threaded hardware-interrupt handlers, in progress on entry will 3173 * have completed before this primitive returns. However, this does not 3174 * guarantee that softirq handlers will have completed, since in some 3175 * kernels, these handlers can run in process context, and can block. 3176 * 3177 * Note that this guarantee implies further memory-ordering guarantees. 3178 * On systems with more than one CPU, when synchronize_sched() returns, 3179 * each CPU is guaranteed to have executed a full memory barrier since the 3180 * end of its last RCU-sched read-side critical section whose beginning 3181 * preceded the call to synchronize_sched(). In addition, each CPU having 3182 * an RCU read-side critical section that extends beyond the return from 3183 * synchronize_sched() is guaranteed to have executed a full memory barrier 3184 * after the beginning of synchronize_sched() and before the beginning of 3185 * that RCU read-side critical section. Note that these guarantees include 3186 * CPUs that are offline, idle, or executing in user mode, as well as CPUs 3187 * that are executing in the kernel. 3188 * 3189 * Furthermore, if CPU A invoked synchronize_sched(), which returned 3190 * to its caller on CPU B, then both CPU A and CPU B are guaranteed 3191 * to have executed a full memory barrier during the execution of 3192 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but 3193 * again only if the system has more than one CPU). 3194 * 3195 * This primitive provides the guarantees made by the (now removed) 3196 * synchronize_kernel() API. In contrast, synchronize_rcu() only 3197 * guarantees that rcu_read_lock() sections will have completed. 3198 * In "classic RCU", these two guarantees happen to be one and 3199 * the same, but can differ in realtime RCU implementations. 
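 *
 * A minimal sketch of typical update-side use (made-up names; "gp" is
 * an RCU-protected pointer and "mylock" serializes updaters):
 *
 *	spin_lock(&mylock);
 *	oldp = gp;
 *	rcu_assign_pointer(gp, newp);
 *	spin_unlock(&mylock);
 *	synchronize_sched();
 *	kfree(oldp);
 *
 * The synchronize_sched() call waits for pre-existing readers, which
 * access gp under rcu_read_lock_sched() (or preempt_disable()) and
 * rcu_dereference_sched().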
3200 */ 3201 void synchronize_sched(void) 3202 { 3203 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || 3204 lock_is_held(&rcu_lock_map) || 3205 lock_is_held(&rcu_sched_lock_map), 3206 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 3207 if (rcu_blocking_is_gp()) 3208 return; 3209 if (rcu_gp_is_expedited()) 3210 synchronize_sched_expedited(); 3211 else 3212 wait_rcu_gp(call_rcu_sched); 3213 } 3214 EXPORT_SYMBOL_GPL(synchronize_sched); 3215 3216 /** 3217 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. 3218 * 3219 * Control will return to the caller some time after a full rcu_bh grace 3220 * period has elapsed, in other words after all currently executing rcu_bh 3221 * read-side critical sections have completed. RCU read-side critical 3222 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 3223 * and may be nested. 3224 * 3225 * See the description of synchronize_sched() for more detailed information 3226 * on memory ordering guarantees. 3227 */ 3228 void synchronize_rcu_bh(void) 3229 { 3230 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || 3231 lock_is_held(&rcu_lock_map) || 3232 lock_is_held(&rcu_sched_lock_map), 3233 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 3234 if (rcu_blocking_is_gp()) 3235 return; 3236 if (rcu_gp_is_expedited()) 3237 synchronize_rcu_bh_expedited(); 3238 else 3239 wait_rcu_gp(call_rcu_bh); 3240 } 3241 EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 3242 3243 /** 3244 * get_state_synchronize_rcu - Snapshot current RCU state 3245 * 3246 * Returns a cookie that is used by a later call to cond_synchronize_rcu() 3247 * to determine whether or not a full grace period has elapsed in the 3248 * meantime. 3249 */ 3250 unsigned long get_state_synchronize_rcu(void) 3251 { 3252 /* 3253 * Any prior manipulation of RCU-protected data must happen 3254 * before the load from ->gpnum. 3255 */ 3256 smp_mb(); /* ^^^ */ 3257 3258 /* 3259 * Make sure this load happens before the purportedly 3260 * time-consuming work between get_state_synchronize_rcu() 3261 * and cond_synchronize_rcu(). 3262 */ 3263 return smp_load_acquire(&rcu_state_p->gpnum); 3264 } 3265 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); 3266 3267 /** 3268 * cond_synchronize_rcu - Conditionally wait for an RCU grace period 3269 * 3270 * @oldstate: return value from earlier call to get_state_synchronize_rcu() 3271 * 3272 * If a full RCU grace period has elapsed since the earlier call to 3273 * get_state_synchronize_rcu(), just return. Otherwise, invoke 3274 * synchronize_rcu() to wait for a full grace period. 3275 * 3276 * Yes, this function does not take counter wrap into account. But 3277 * counter wrap is harmless. If the counter wraps, we have waited for 3278 * more than 2 billion grace periods (and way more on a 64-bit system!), 3279 * so waiting for one additional grace period should be just fine. 3280 */ 3281 void cond_synchronize_rcu(unsigned long oldstate) 3282 { 3283 unsigned long newstate; 3284 3285 /* 3286 * Ensure that this load happens before any RCU-destructive 3287 * actions the caller might carry out after we return. 
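 *
 * For illustration, the intended usage pattern is (with
 * do_something_lengthy() a made-up placeholder for unrelated work):
 *
 *	cookie = get_state_synchronize_rcu();
 *	do_something_lengthy();
 *	cond_synchronize_rcu(cookie);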
3288 */ 3289 newstate = smp_load_acquire(&rcu_state_p->completed); 3290 if (ULONG_CMP_GE(oldstate, newstate)) 3291 synchronize_rcu(); 3292 } 3293 EXPORT_SYMBOL_GPL(cond_synchronize_rcu); 3294 3295 /** 3296 * get_state_synchronize_sched - Snapshot current RCU-sched state 3297 * 3298 * Returns a cookie that is used by a later call to cond_synchronize_sched() 3299 * to determine whether or not a full grace period has elapsed in the 3300 * meantime. 3301 */ 3302 unsigned long get_state_synchronize_sched(void) 3303 { 3304 /* 3305 * Any prior manipulation of RCU-protected data must happen 3306 * before the load from ->gpnum. 3307 */ 3308 smp_mb(); /* ^^^ */ 3309 3310 /* 3311 * Make sure this load happens before the purportedly 3312 * time-consuming work between get_state_synchronize_sched() 3313 * and cond_synchronize_sched(). 3314 */ 3315 return smp_load_acquire(&rcu_sched_state.gpnum); 3316 } 3317 EXPORT_SYMBOL_GPL(get_state_synchronize_sched); 3318 3319 /** 3320 * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period 3321 * 3322 * @oldstate: return value from earlier call to get_state_synchronize_sched() 3323 * 3324 * If a full RCU-sched grace period has elapsed since the earlier call to 3325 * get_state_synchronize_sched(), just return. Otherwise, invoke 3326 * synchronize_sched() to wait for a full grace period. 3327 * 3328 * Yes, this function does not take counter wrap into account. But 3329 * counter wrap is harmless. If the counter wraps, we have waited for 3330 * more than 2 billion grace periods (and way more on a 64-bit system!), 3331 * so waiting for one additional grace period should be just fine. 3332 */ 3333 void cond_synchronize_sched(unsigned long oldstate) 3334 { 3335 unsigned long newstate; 3336 3337 /* 3338 * Ensure that this load happens before any RCU-destructive 3339 * actions the caller might carry out after we return. 3340 */ 3341 newstate = smp_load_acquire(&rcu_sched_state.completed); 3342 if (ULONG_CMP_GE(oldstate, newstate)) 3343 synchronize_sched(); 3344 } 3345 EXPORT_SYMBOL_GPL(cond_synchronize_sched); 3346 3347 /* Adjust sequence number for start of update-side operation. */ 3348 static void rcu_seq_start(unsigned long *sp) 3349 { 3350 WRITE_ONCE(*sp, *sp + 1); 3351 smp_mb(); /* Ensure update-side operation after counter increment. */ 3352 WARN_ON_ONCE(!(*sp & 0x1)); 3353 } 3354 3355 /* Adjust sequence number for end of update-side operation. */ 3356 static void rcu_seq_end(unsigned long *sp) 3357 { 3358 smp_mb(); /* Ensure update-side operation before counter increment. */ 3359 WRITE_ONCE(*sp, *sp + 1); 3360 WARN_ON_ONCE(*sp & 0x1); 3361 } 3362 3363 /* Take a snapshot of the update side's sequence number. */ 3364 static unsigned long rcu_seq_snap(unsigned long *sp) 3365 { 3366 unsigned long s; 3367 3368 smp_mb(); /* Caller's modifications seen first by other CPUs. */ 3369 s = (READ_ONCE(*sp) + 3) & ~0x1; 3370 smp_mb(); /* Above access must not bleed into critical section. */ 3371 return s; 3372 } 3373 3374 /* 3375 * Given a snapshot from rcu_seq_snap(), determine whether or not a 3376 * full update-side operation has occurred. 3377 */ 3378 static bool rcu_seq_done(unsigned long *sp, unsigned long s) 3379 { 3380 return ULONG_CMP_GE(READ_ONCE(*sp), s); 3381 } 3382 3383 /* Wrapper functions for expedited grace periods. 
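 *
 * Illustration of the sequence-counter scheme above (made-up numbers):
 * the counter is even when no update-side operation is in progress and
 * odd while one is running. Starting from 4, rcu_seq_start() makes it 5
 * and rcu_seq_end() makes it 6. A snapshot taken while the counter is 4
 * yields (4 + 3) & ~0x1 == 6, so rcu_seq_done() returns true once the
 * next operation completes. A snapshot taken while the counter is 5
 * yields 8, correctly requiring the in-progress operation to finish and
 * a full subsequent operation to complete.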
*/ 3384 static void rcu_exp_gp_seq_start(struct rcu_state *rsp) 3385 { 3386 rcu_seq_start(&rsp->expedited_sequence); 3387 } 3388 static void rcu_exp_gp_seq_end(struct rcu_state *rsp) 3389 { 3390 rcu_seq_end(&rsp->expedited_sequence); 3391 smp_mb(); /* Ensure that consecutive grace periods serialize. */ 3392 } 3393 static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) 3394 { 3395 return rcu_seq_snap(&rsp->expedited_sequence); 3396 } 3397 static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) 3398 { 3399 return rcu_seq_done(&rsp->expedited_sequence, s); 3400 } 3401 3402 /* 3403 * Reset the ->expmaskinit values in the rcu_node tree to reflect any 3404 * recent CPU-online activity. Note that these masks are not cleared 3405 * when CPUs go offline, so they reflect the union of all CPUs that have 3406 * ever been online. This means that this function normally takes its 3407 * no-work-to-do fastpath. 3408 */ 3409 static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) 3410 { 3411 bool done; 3412 unsigned long flags; 3413 unsigned long mask; 3414 unsigned long oldmask; 3415 int ncpus = READ_ONCE(rsp->ncpus); 3416 struct rcu_node *rnp; 3417 struct rcu_node *rnp_up; 3418 3419 /* If no new CPUs onlined since last time, nothing to do. */ 3420 if (likely(ncpus == rsp->ncpus_snap)) 3421 return; 3422 rsp->ncpus_snap = ncpus; 3423 3424 /* 3425 * Each pass through the following loop propagates newly onlined 3426 * CPUs for the current rcu_node structure up the rcu_node tree. 3427 */ 3428 rcu_for_each_leaf_node(rsp, rnp) { 3429 raw_spin_lock_irqsave(&rnp->lock, flags); 3430 smp_mb__after_unlock_lock(); 3431 if (rnp->expmaskinit == rnp->expmaskinitnext) { 3432 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3433 continue; /* No new CPUs, nothing to do. */ 3434 } 3435 3436 /* Update this node's mask, track old value for propagation. */ 3437 oldmask = rnp->expmaskinit; 3438 rnp->expmaskinit = rnp->expmaskinitnext; 3439 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3440 3441 /* If it was already nonzero, nothing to propagate. */ 3442 if (oldmask) 3443 continue; 3444 3445 /* Propagate the new CPU up the tree. */ 3446 mask = rnp->grpmask; 3447 rnp_up = rnp->parent; 3448 done = false; 3449 while (rnp_up) { 3450 raw_spin_lock_irqsave(&rnp_up->lock, flags); 3451 smp_mb__after_unlock_lock(); 3452 if (rnp_up->expmaskinit) 3453 done = true; 3454 rnp_up->expmaskinit |= mask; 3455 raw_spin_unlock_irqrestore(&rnp_up->lock, flags); 3456 if (done) 3457 break; 3458 mask = rnp_up->grpmask; 3459 rnp_up = rnp_up->parent; 3460 } 3461 } 3462 } 3463 3464 /* 3465 * Reset the ->expmask values in the rcu_node tree in preparation for 3466 * a new expedited grace period. 3467 */ 3468 static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) 3469 { 3470 unsigned long flags; 3471 struct rcu_node *rnp; 3472 3473 sync_exp_reset_tree_hotplug(rsp); 3474 rcu_for_each_node_breadth_first(rsp, rnp) { 3475 raw_spin_lock_irqsave(&rnp->lock, flags); 3476 smp_mb__after_unlock_lock(); 3477 WARN_ON_ONCE(rnp->expmask); 3478 rnp->expmask = rnp->expmaskinit; 3479 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3480 } 3481 } 3482 3483 /* 3484 * Return non-zero if there is no RCU expedited grace period in progress 3485 * for the specified rcu_node structure, in other words, if all CPUs and 3486 * tasks covered by the specified rcu_node structure have done their bit 3487 * for the current expedited grace period. Works only for preemptible 3488 * RCU -- other RCU implementations use other means.
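 *
 * Concretely, "done their bit" means that no blocked reader is still
 * queued for this expedited grace period (->exp_tasks is NULL) and no
 * CPU covered by this rcu_node still owes an expedited quiescent state
 * (->expmask is zero).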
3489 * 3490 * Caller must hold the root rcu_node's exp_funnel_mutex. 3491 */ 3492 static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) 3493 { 3494 return rnp->exp_tasks == NULL && 3495 READ_ONCE(rnp->expmask) == 0; 3496 } 3497 3498 /* 3499 * Report the exit from RCU read-side critical section for the last task 3500 * that queued itself during or before the current expedited preemptible-RCU 3501 * grace period. This event is reported either to the rcu_node structure on 3502 * which the task was queued or to one of that rcu_node structure's ancestors, 3503 * recursively up the tree. (Calm down, calm down, we do the recursion 3504 * iteratively!) 3505 * 3506 * Caller must hold the root rcu_node's exp_funnel_mutex and the 3507 * specified rcu_node structure's ->lock. 3508 */ 3509 static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 3510 bool wake, unsigned long flags) 3511 __releases(rnp->lock) 3512 { 3513 unsigned long mask; 3514 3515 for (;;) { 3516 if (!sync_rcu_preempt_exp_done(rnp)) { 3517 if (!rnp->expmask) 3518 rcu_initiate_boost(rnp, flags); 3519 else 3520 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3521 break; 3522 } 3523 if (rnp->parent == NULL) { 3524 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3525 if (wake) { 3526 smp_mb(); /* EGP done before wake_up(). */ 3527 wake_up(&rsp->expedited_wq); 3528 } 3529 break; 3530 } 3531 mask = rnp->grpmask; 3532 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 3533 rnp = rnp->parent; 3534 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 3535 smp_mb__after_unlock_lock(); 3536 WARN_ON_ONCE(!(rnp->expmask & mask)); 3537 rnp->expmask &= ~mask; 3538 } 3539 } 3540 3541 /* 3542 * Report expedited quiescent state for specified node. This is a 3543 * lock-acquisition wrapper function for __rcu_report_exp_rnp(). 3544 * 3545 * Caller must hold the root rcu_node's exp_funnel_mutex. 3546 */ 3547 static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, 3548 struct rcu_node *rnp, bool wake) 3549 { 3550 unsigned long flags; 3551 3552 raw_spin_lock_irqsave(&rnp->lock, flags); 3553 smp_mb__after_unlock_lock(); 3554 __rcu_report_exp_rnp(rsp, rnp, wake, flags); 3555 } 3556 3557 /* 3558 * Report expedited quiescent state for multiple CPUs, all covered by the 3559 * specified leaf rcu_node structure. Caller must hold the root 3560 * rcu_node's exp_funnel_mutex. 3561 */ 3562 static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, 3563 unsigned long mask, bool wake) 3564 { 3565 unsigned long flags; 3566 3567 raw_spin_lock_irqsave(&rnp->lock, flags); 3568 smp_mb__after_unlock_lock(); 3569 if (!(rnp->expmask & mask)) { 3570 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3571 return; 3572 } 3573 rnp->expmask &= ~mask; 3574 __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ 3575 } 3576 3577 /* 3578 * Report expedited quiescent state for specified rcu_data (CPU). 3579 * Caller must hold the root rcu_node's exp_funnel_mutex. 3580 */ 3581 static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, 3582 bool wake) 3583 { 3584 rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); 3585 } 3586 3587 /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. 
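 *
 * Returns true, after releasing whichever ->exp_funnel_mutex the caller
 * passed in (rnp's or rdp's), if a full expedited grace period has
 * completed since the rcu_exp_gp_seq_snap() snapshot "s", in which case
 * the caller can skip doing the work itself.  The stat argument counts
 * which of the early-exit paths was taken.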
*/ 3588 static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, 3589 struct rcu_data *rdp, 3590 atomic_long_t *stat, unsigned long s) 3591 { 3592 if (rcu_exp_gp_seq_done(rsp, s)) { 3593 if (rnp) 3594 mutex_unlock(&rnp->exp_funnel_mutex); 3595 else if (rdp) 3596 mutex_unlock(&rdp->exp_funnel_mutex); 3597 /* Ensure test happens before caller kfree(). */ 3598 smp_mb__before_atomic(); /* ^^^ */ 3599 atomic_long_inc(stat); 3600 return true; 3601 } 3602 return false; 3603 } 3604 3605 /* 3606 * Funnel-lock acquisition for expedited grace periods. Returns a 3607 * pointer to the root rcu_node structure, or NULL if some other 3608 * task did the expedited grace period for us. 3609 */ 3610 static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) 3611 { 3612 struct rcu_data *rdp; 3613 struct rcu_node *rnp0; 3614 struct rcu_node *rnp1 = NULL; 3615 3616 /* 3617 * First try directly acquiring the root lock in order to reduce 3618 * latency in the common case where expedited grace periods are 3619 * rare. We check mutex_is_locked() to avoid pathological levels of 3620 * memory contention on ->exp_funnel_mutex in the heavy-load case. 3621 */ 3622 rnp0 = rcu_get_root(rsp); 3623 if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { 3624 if (mutex_trylock(&rnp0->exp_funnel_mutex)) { 3625 if (sync_exp_work_done(rsp, rnp0, NULL, 3626 &rsp->expedited_workdone0, s)) 3627 return NULL; 3628 return rnp0; 3629 } 3630 } 3631 3632 /* 3633 * Each pass through the following loop works its way 3634 * up the rcu_node tree, returning if others have done the 3635 * work or otherwise falls through holding the root rnp's 3636 * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure 3637 * can be inexact, as it is just promoting locality and is not 3638 * strictly needed for correctness. 3639 */ 3640 rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); 3641 if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) 3642 return NULL; 3643 mutex_lock(&rdp->exp_funnel_mutex); 3644 rnp0 = rdp->mynode; 3645 for (; rnp0 != NULL; rnp0 = rnp0->parent) { 3646 if (sync_exp_work_done(rsp, rnp1, rdp, 3647 &rsp->expedited_workdone2, s)) 3648 return NULL; 3649 mutex_lock(&rnp0->exp_funnel_mutex); 3650 if (rnp1) 3651 mutex_unlock(&rnp1->exp_funnel_mutex); 3652 else 3653 mutex_unlock(&rdp->exp_funnel_mutex); 3654 rnp1 = rnp0; 3655 } 3656 if (sync_exp_work_done(rsp, rnp1, rdp, 3657 &rsp->expedited_workdone3, s)) 3658 return NULL; 3659 return rnp1; 3660 } 3661 3662 /* Invoked on each online non-idle CPU for expedited quiescent state. */ 3663 static void sync_sched_exp_handler(void *data) 3664 { 3665 struct rcu_data *rdp; 3666 struct rcu_node *rnp; 3667 struct rcu_state *rsp = data; 3668 3669 rdp = this_cpu_ptr(rsp->rda); 3670 rnp = rdp->mynode; 3671 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || 3672 __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) 3673 return; 3674 __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); 3675 resched_cpu(smp_processor_id()); 3676 } 3677 3678 /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. 
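 *
 * The newly onlined CPU might have its bit set in its leaf rcu_node's
 * ->expmask from an expedited grace period that began while it was
 * coming online; if so, re-send sync_sched_exp_handler() to that CPU so
 * that it reports the needed quiescent state.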
*/ 3679 static void sync_sched_exp_online_cleanup(int cpu) 3680 { 3681 struct rcu_data *rdp; 3682 int ret; 3683 struct rcu_node *rnp; 3684 struct rcu_state *rsp = &rcu_sched_state; 3685 3686 rdp = per_cpu_ptr(rsp->rda, cpu); 3687 rnp = rdp->mynode; 3688 if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) 3689 return; 3690 ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); 3691 WARN_ON_ONCE(ret); 3692 } 3693 3694 /* 3695 * Select the nodes that the upcoming expedited grace period needs 3696 * to wait for. 3697 */ 3698 static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, 3699 smp_call_func_t func) 3700 { 3701 int cpu; 3702 unsigned long flags; 3703 unsigned long mask; 3704 unsigned long mask_ofl_test; 3705 unsigned long mask_ofl_ipi; 3706 int ret; 3707 struct rcu_node *rnp; 3708 3709 sync_exp_reset_tree(rsp); 3710 rcu_for_each_leaf_node(rsp, rnp) { 3711 raw_spin_lock_irqsave(&rnp->lock, flags); 3712 smp_mb__after_unlock_lock(); 3713 3714 /* Each pass checks a CPU for identity, offline, and idle. */ 3715 mask_ofl_test = 0; 3716 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { 3717 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3718 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 3719 3720 if (raw_smp_processor_id() == cpu || 3721 !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) 3722 mask_ofl_test |= rdp->grpmask; 3723 } 3724 mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; 3725 3726 /* 3727 * Need to wait for any blocked tasks as well. Note that 3728 * additional blocking tasks will also block the expedited 3729 * GP until such time as the ->expmask bits are cleared. 3730 */ 3731 if (rcu_preempt_has_tasks(rnp)) 3732 rnp->exp_tasks = rnp->blkd_tasks.next; 3733 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3734 3735 /* IPI the remaining CPUs for expedited quiescent state. */ 3736 mask = 1; 3737 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { 3738 if (!(mask_ofl_ipi & mask)) 3739 continue; 3740 retry_ipi: 3741 ret = smp_call_function_single(cpu, func, rsp, 0); 3742 if (!ret) { 3743 mask_ofl_ipi &= ~mask; 3744 } else { 3745 /* Failed, raced with offline. */ 3746 raw_spin_lock_irqsave(&rnp->lock, flags); 3747 if (cpu_online(cpu) && 3748 (rnp->expmask & mask)) { 3749 raw_spin_unlock_irqrestore(&rnp->lock, 3750 flags); 3751 schedule_timeout_uninterruptible(1); 3752 if (cpu_online(cpu) && 3753 (rnp->expmask & mask)) 3754 goto retry_ipi; 3755 raw_spin_lock_irqsave(&rnp->lock, 3756 flags); 3757 } 3758 if (!(rnp->expmask & mask)) 3759 mask_ofl_ipi &= ~mask; 3760 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3761 } 3762 } 3763 /* Report quiescent states for those that went offline. */ 3764 mask_ofl_test |= mask_ofl_ipi; 3765 if (mask_ofl_test) 3766 rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); 3767 } 3768 } 3769 3770 static void synchronize_sched_expedited_wait(struct rcu_state *rsp) 3771 { 3772 int cpu; 3773 unsigned long jiffies_stall; 3774 unsigned long jiffies_start; 3775 unsigned long mask; 3776 struct rcu_node *rnp; 3777 struct rcu_node *rnp_root = rcu_get_root(rsp); 3778 int ret; 3779 3780 jiffies_stall = rcu_jiffies_till_stall_check(); 3781 jiffies_start = jiffies; 3782 3783 for (;;) { 3784 ret = wait_event_interruptible_timeout( 3785 rsp->expedited_wq, 3786 sync_rcu_preempt_exp_done(rnp_root), 3787 jiffies_stall); 3788 if (ret > 0) 3789 return; 3790 if (ret < 0) { 3791 /* Hit a signal, disable CPU stall warnings. 
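 *
 * The wait_event_interruptible_timeout() above returns a positive value
 * if the expedited grace period completed within the stall timeout,
 * zero if it timed out (fall through and print a stall warning), and a
 * negative value if a signal was received; in the signal case, switch
 * to an uninterruptible wait and skip the stall warnings entirely.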
*/ 3792 wait_event(rsp->expedited_wq, 3793 sync_rcu_preempt_exp_done(rnp_root)); 3794 return; 3795 } 3796 pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", 3797 rsp->name); 3798 rcu_for_each_leaf_node(rsp, rnp) { 3799 (void)rcu_print_task_exp_stall(rnp); 3800 mask = 1; 3801 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { 3802 struct rcu_data *rdp; 3803 3804 if (!(rnp->expmask & mask)) 3805 continue; 3806 rdp = per_cpu_ptr(rsp->rda, cpu); 3807 pr_cont(" %d-%c%c%c", cpu, 3808 "O."[cpu_online(cpu)], 3809 "o."[!!(rdp->grpmask & rnp->expmaskinit)], 3810 "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); 3811 } 3812 mask <<= 1; 3813 } 3814 pr_cont(" } %lu jiffies s: %lu\n", 3815 jiffies - jiffies_start, rsp->expedited_sequence); 3816 rcu_for_each_leaf_node(rsp, rnp) { 3817 mask = 1; 3818 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { 3819 if (!(rnp->expmask & mask)) 3820 continue; 3821 dump_cpu_task(cpu); 3822 } 3823 } 3824 jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; 3825 } 3826 } 3827 3828 /** 3829 * synchronize_sched_expedited - Brute-force RCU-sched grace period 3830 * 3831 * Wait for an RCU-sched grace period to elapse, but use a "big hammer" 3832 * approach to force the grace period to end quickly. This consumes 3833 * significant time on all CPUs and is unfriendly to real-time workloads, 3834 * so is thus not recommended for any sort of common-case code. In fact, 3835 * if you are using synchronize_sched_expedited() in a loop, please 3836 * restructure your code to batch your updates, and then use a single 3837 * synchronize_sched() instead. 3838 * 3839 * This implementation can be thought of as an application of sequence 3840 * locking to expedited grace periods, but using the sequence counter to 3841 * determine when someone else has already done the work instead of for 3842 * retrying readers. 3843 */ 3844 void synchronize_sched_expedited(void) 3845 { 3846 unsigned long s; 3847 struct rcu_node *rnp; 3848 struct rcu_state *rsp = &rcu_sched_state; 3849 3850 /* Take a snapshot of the sequence number. */ 3851 s = rcu_exp_gp_seq_snap(rsp); 3852 3853 rnp = exp_funnel_lock(rsp, s); 3854 if (rnp == NULL) 3855 return; /* Someone else did our work for us. */ 3856 3857 rcu_exp_gp_seq_start(rsp); 3858 sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); 3859 synchronize_sched_expedited_wait(rsp); 3860 3861 rcu_exp_gp_seq_end(rsp); 3862 mutex_unlock(&rnp->exp_funnel_mutex); 3863 } 3864 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 3865 3866 /* 3867 * Check to see if there is any immediate RCU-related work to be done 3868 * by the current CPU, for the specified type of RCU, returning 1 if so. 3869 * The checks are in order of increasing expense: checks that can be 3870 * carried out against CPU-local state are performed first. However, 3871 * we must check for CPU stalls first, else we might not get a chance. 3872 */ 3873 static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) 3874 { 3875 struct rcu_node *rnp = rdp->mynode; 3876 3877 rdp->n_rcu_pending++; 3878 3879 /* Check for CPU stalls, if enabled. */ 3880 check_cpu_stall(rsp, rdp); 3881 3882 /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ 3883 if (rcu_nohz_full_cpu(rsp)) 3884 return 0; 3885 3886 /* Is the RCU core waiting for a quiescent state from this CPU? 
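 *
 * The first branch below merely counts the case where the core wants a
 * quiescent state but this CPU has not yet passed through one; the
 * second branch returns 1 because this CPU has a quiescent state ready
 * to report (it has either noted one or rcu_qs_ctr has advanced).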
*/ 3887 if (rcu_scheduler_fully_active && 3888 rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && 3889 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { 3890 rdp->n_rp_core_needs_qs++; 3891 } else if (rdp->core_needs_qs && 3892 (!rdp->cpu_no_qs.b.norm || 3893 rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { 3894 rdp->n_rp_report_qs++; 3895 return 1; 3896 } 3897 3898 /* Does this CPU have callbacks ready to invoke? */ 3899 if (cpu_has_callbacks_ready_to_invoke(rdp)) { 3900 rdp->n_rp_cb_ready++; 3901 return 1; 3902 } 3903 3904 /* Has RCU gone idle with this CPU needing another grace period? */ 3905 if (cpu_needs_another_gp(rsp, rdp)) { 3906 rdp->n_rp_cpu_needs_gp++; 3907 return 1; 3908 } 3909 3910 /* Has another RCU grace period completed? */ 3911 if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ 3912 rdp->n_rp_gp_completed++; 3913 return 1; 3914 } 3915 3916 /* Has a new RCU grace period started? */ 3917 if (READ_ONCE(rnp->gpnum) != rdp->gpnum || 3918 unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */ 3919 rdp->n_rp_gp_started++; 3920 return 1; 3921 } 3922 3923 /* Does this CPU need a deferred NOCB wakeup? */ 3924 if (rcu_nocb_need_deferred_wakeup(rdp)) { 3925 rdp->n_rp_nocb_defer_wakeup++; 3926 return 1; 3927 } 3928 3929 /* nothing to do */ 3930 rdp->n_rp_need_nothing++; 3931 return 0; 3932 } 3933 3934 /* 3935 * Check to see if there is any immediate RCU-related work to be done 3936 * by the current CPU, returning 1 if so. This function is part of the 3937 * RCU implementation; it is -not- an exported member of the RCU API. 3938 */ 3939 static int rcu_pending(void) 3940 { 3941 struct rcu_state *rsp; 3942 3943 for_each_rcu_flavor(rsp) 3944 if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda))) 3945 return 1; 3946 return 0; 3947 } 3948 3949 /* 3950 * Return true if the specified CPU has any callback. If all_lazy is 3951 * non-NULL, store an indication of whether all callbacks are lazy. 3952 * (If there are no callbacks, all of them are deemed to be lazy.) 3953 */ 3954 static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) 3955 { 3956 bool al = true; 3957 bool hc = false; 3958 struct rcu_data *rdp; 3959 struct rcu_state *rsp; 3960 3961 for_each_rcu_flavor(rsp) { 3962 rdp = this_cpu_ptr(rsp->rda); 3963 if (!rdp->nxtlist) 3964 continue; 3965 hc = true; 3966 if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { 3967 al = false; 3968 break; 3969 } 3970 } 3971 if (all_lazy) 3972 *all_lazy = al; 3973 return hc; 3974 } 3975 3976 /* 3977 * Helper function for _rcu_barrier() tracing. If tracing is disabled, 3978 * the compiler is expected to optimize this away. 3979 */ 3980 static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, 3981 int cpu, unsigned long done) 3982 { 3983 trace_rcu_barrier(rsp->name, s, cpu, 3984 atomic_read(&rsp->barrier_cpu_count), done); 3985 } 3986 3987 /* 3988 * RCU callback function for _rcu_barrier(). If we are last, wake 3989 * up the task executing _rcu_barrier(). 3990 */ 3991 static void rcu_barrier_callback(struct rcu_head *rhp) 3992 { 3993 struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); 3994 struct rcu_state *rsp = rdp->rsp; 3995 3996 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { 3997 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); 3998 complete(&rsp->barrier_completion); 3999 } else { 4000 _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); 4001 } 4002 } 4003 4004 /* 4005 * Called with preemption disabled, and from cross-cpu IRQ context. 
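 * It is invoked via smp_call_function_single() from _rcu_barrier() and
 * queues rcu_barrier_callback() behind this CPU's already-posted
 * callbacks, so the barrier cannot complete until those callbacks have
 * been invoked.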
4006 */ 4007 static void rcu_barrier_func(void *type) 4008 { 4009 struct rcu_state *rsp = type; 4010 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 4011 4012 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); 4013 atomic_inc(&rsp->barrier_cpu_count); 4014 rsp->call(&rdp->barrier_head, rcu_barrier_callback); 4015 } 4016 4017 /* 4018 * Orchestrate the specified type of RCU barrier, waiting for all 4019 * RCU callbacks of the specified type to complete. 4020 */ 4021 static void _rcu_barrier(struct rcu_state *rsp) 4022 { 4023 int cpu; 4024 struct rcu_data *rdp; 4025 unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); 4026 4027 _rcu_barrier_trace(rsp, "Begin", -1, s); 4028 4029 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 4030 mutex_lock(&rsp->barrier_mutex); 4031 4032 /* Did someone else do our work for us? */ 4033 if (rcu_seq_done(&rsp->barrier_sequence, s)) { 4034 _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); 4035 smp_mb(); /* caller's subsequent code after above check. */ 4036 mutex_unlock(&rsp->barrier_mutex); 4037 return; 4038 } 4039 4040 /* Mark the start of the barrier operation. */ 4041 rcu_seq_start(&rsp->barrier_sequence); 4042 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); 4043 4044 /* 4045 * Initialize the count to one rather than to zero in order to 4046 * avoid a too-soon return to zero in case of a short grace period 4047 * (or preemption of this task). Exclude CPU-hotplug operations 4048 * to ensure that no offline CPU has callbacks queued. 4049 */ 4050 init_completion(&rsp->barrier_completion); 4051 atomic_set(&rsp->barrier_cpu_count, 1); 4052 get_online_cpus(); 4053 4054 /* 4055 * Force each CPU with callbacks to register a new callback. 4056 * When that callback is invoked, we will know that all of the 4057 * corresponding CPU's preceding callbacks have been invoked. 4058 */ 4059 for_each_possible_cpu(cpu) { 4060 if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) 4061 continue; 4062 rdp = per_cpu_ptr(rsp->rda, cpu); 4063 if (rcu_is_nocb_cpu(cpu)) { 4064 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { 4065 _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, 4066 rsp->barrier_sequence); 4067 } else { 4068 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 4069 rsp->barrier_sequence); 4070 smp_mb__before_atomic(); 4071 atomic_inc(&rsp->barrier_cpu_count); 4072 __call_rcu(&rdp->barrier_head, 4073 rcu_barrier_callback, rsp, cpu, 0); 4074 } 4075 } else if (READ_ONCE(rdp->qlen)) { 4076 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 4077 rsp->barrier_sequence); 4078 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 4079 } else { 4080 _rcu_barrier_trace(rsp, "OnlineNQ", cpu, 4081 rsp->barrier_sequence); 4082 } 4083 } 4084 put_online_cpus(); 4085 4086 /* 4087 * Now that we have an rcu_barrier_callback() callback on each 4088 * CPU, and thus each counted, remove the initial count. 4089 */ 4090 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) 4091 complete(&rsp->barrier_completion); 4092 4093 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ 4094 wait_for_completion(&rsp->barrier_completion); 4095 4096 /* Mark the end of the barrier operation. */ 4097 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); 4098 rcu_seq_end(&rsp->barrier_sequence); 4099 4100 /* Other rcu_barrier() invocations can now safely proceed. */ 4101 mutex_unlock(&rsp->barrier_mutex); 4102 } 4103 4104 /** 4105 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. 
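 *
 * A hedged usage sketch with hypothetical names (not part of this
 * file): a module whose teardown path posts call_rcu_bh() callbacks
 * that touch module data must wait for them before freeing that data:
 *
 *	static void __exit example_exit(void)
 *	{
 *		example_remove_all();
 *		rcu_barrier_bh();
 *		kmem_cache_destroy(example_cache);
 *	}
 *
 * where example_remove_all() is assumed to post call_rcu_bh() callbacks
 * that free entries from example_cache.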
*/ 4107 void rcu_barrier_bh(void) 4108 { 4109 _rcu_barrier(&rcu_bh_state); 4110 } 4111 EXPORT_SYMBOL_GPL(rcu_barrier_bh); 4112 4113 /** 4114 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. 4115 */ 4116 void rcu_barrier_sched(void) 4117 { 4118 _rcu_barrier(&rcu_sched_state); 4119 } 4120 EXPORT_SYMBOL_GPL(rcu_barrier_sched); 4121 4122 /* 4123 * Propagate ->qsmaskinit bits up the rcu_node tree to account for the 4124 * first CPU in a given leaf rcu_node structure coming online. The caller 4125 * must hold the corresponding leaf rcu_node ->lock with interrupts 4126 * disabled. 4127 */ 4128 static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) 4129 { 4130 long mask; 4131 struct rcu_node *rnp = rnp_leaf; 4132 4133 for (;;) { 4134 mask = rnp->grpmask; 4135 rnp = rnp->parent; 4136 if (rnp == NULL) 4137 return; 4138 raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ 4139 rnp->qsmaskinit |= mask; 4140 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ 4141 } 4142 } 4143 4144 /* 4145 * Do boot-time initialization of a CPU's per-CPU RCU data. 4146 */ 4147 static void __init 4148 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) 4149 { 4150 unsigned long flags; 4151 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 4152 struct rcu_node *rnp = rcu_get_root(rsp); 4153 4154 /* Set up local state, ensuring consistent view of global state. */ 4155 raw_spin_lock_irqsave(&rnp->lock, flags); 4156 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 4157 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 4158 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 4159 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 4160 rdp->cpu = cpu; 4161 rdp->rsp = rsp; 4162 mutex_init(&rdp->exp_funnel_mutex); 4163 rcu_boot_init_nocb_percpu_data(rdp); 4164 raw_spin_unlock_irqrestore(&rnp->lock, flags); 4165 } 4166 4167 /* 4168 * Initialize a CPU's per-CPU RCU data. Note that only one online or 4169 * offline event can be happening at a given time. Note also that we 4170 * can accept some slop in the rsp->completed access due to the fact 4171 * that this CPU cannot possibly have any RCU callbacks in flight yet. 4172 */ 4173 static void 4174 rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 4175 { 4176 unsigned long flags; 4177 unsigned long mask; 4178 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 4179 struct rcu_node *rnp = rcu_get_root(rsp); 4180 4181 /* Set up local state, ensuring consistent view of global state. */ 4182 raw_spin_lock_irqsave(&rnp->lock, flags); 4183 rdp->qlen_last_fqs_check = 0; 4184 rdp->n_force_qs_snap = rsp->n_force_qs; 4185 rdp->blimit = blimit; 4186 if (!rdp->nxtlist) 4187 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 4188 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 4189 rcu_sysidle_init_percpu_data(rdp->dynticks); 4190 atomic_set(&rdp->dynticks->dynticks, 4191 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 4192 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 4193 4194 /* 4195 * Add CPU to leaf rcu_node pending-online bitmask. Any needed 4196 * propagation up the rcu_node tree will happen at the beginning 4197 * of the next grace period. 4198 */ 4199 rnp = rdp->mynode; 4200 mask = rdp->grpmask; 4201 raw_spin_lock(&rnp->lock); /* irqs already disabled.
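 * Interrupts were turned off by the raw_spin_lock_irqsave() on the root
 * rcu_node above, remained off across the raw_spin_unlock() of that
 * lock, and are restored by the raw_spin_unlock_irqrestore() at the end
 * of this function.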
*/ 4202 smp_mb__after_unlock_lock(); 4203 rnp->qsmaskinitnext |= mask; 4204 rnp->expmaskinitnext |= mask; 4205 if (!rdp->beenonline) 4206 WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); 4207 rdp->beenonline = true; /* We have now been online. */ 4208 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ 4209 rdp->completed = rnp->completed; 4210 rdp->cpu_no_qs.b.norm = true; 4211 rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); 4212 rdp->core_needs_qs = false; 4213 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 4214 raw_spin_unlock_irqrestore(&rnp->lock, flags); 4215 } 4216 4217 static void rcu_prepare_cpu(int cpu) 4218 { 4219 struct rcu_state *rsp; 4220 4221 for_each_rcu_flavor(rsp) 4222 rcu_init_percpu_data(cpu, rsp); 4223 } 4224 4225 /* 4226 * Handle CPU online/offline notification events. 4227 */ 4228 int rcu_cpu_notify(struct notifier_block *self, 4229 unsigned long action, void *hcpu) 4230 { 4231 long cpu = (long)hcpu; 4232 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); 4233 struct rcu_node *rnp = rdp->mynode; 4234 struct rcu_state *rsp; 4235 4236 switch (action) { 4237 case CPU_UP_PREPARE: 4238 case CPU_UP_PREPARE_FROZEN: 4239 rcu_prepare_cpu(cpu); 4240 rcu_prepare_kthreads(cpu); 4241 rcu_spawn_all_nocb_kthreads(cpu); 4242 break; 4243 case CPU_ONLINE: 4244 case CPU_DOWN_FAILED: 4245 sync_sched_exp_online_cleanup(cpu); 4246 rcu_boost_kthread_setaffinity(rnp, -1); 4247 break; 4248 case CPU_DOWN_PREPARE: 4249 rcu_boost_kthread_setaffinity(rnp, cpu); 4250 break; 4251 case CPU_DYING: 4252 case CPU_DYING_FROZEN: 4253 for_each_rcu_flavor(rsp) 4254 rcu_cleanup_dying_cpu(rsp); 4255 break; 4256 case CPU_DYING_IDLE: 4257 /* QS for any half-done expedited RCU-sched GP. */ 4258 preempt_disable(); 4259 rcu_report_exp_rdp(&rcu_sched_state, 4260 this_cpu_ptr(rcu_sched_state.rda), true); 4261 preempt_enable(); 4262 4263 for_each_rcu_flavor(rsp) { 4264 rcu_cleanup_dying_idle_cpu(cpu, rsp); 4265 } 4266 break; 4267 case CPU_DEAD: 4268 case CPU_DEAD_FROZEN: 4269 case CPU_UP_CANCELED: 4270 case CPU_UP_CANCELED_FROZEN: 4271 for_each_rcu_flavor(rsp) { 4272 rcu_cleanup_dead_cpu(cpu, rsp); 4273 do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); 4274 } 4275 break; 4276 default: 4277 break; 4278 } 4279 return NOTIFY_OK; 4280 } 4281 4282 static int rcu_pm_notify(struct notifier_block *self, 4283 unsigned long action, void *hcpu) 4284 { 4285 switch (action) { 4286 case PM_HIBERNATION_PREPARE: 4287 case PM_SUSPEND_PREPARE: 4288 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ 4289 rcu_expedite_gp(); 4290 break; 4291 case PM_POST_HIBERNATION: 4292 case PM_POST_SUSPEND: 4293 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ 4294 rcu_unexpedite_gp(); 4295 break; 4296 default: 4297 break; 4298 } 4299 return NOTIFY_OK; 4300 } 4301 4302 /* 4303 * Spawn the kthreads that handle each RCU flavor's grace periods. 4304 */ 4305 static int __init rcu_spawn_gp_kthread(void) 4306 { 4307 unsigned long flags; 4308 int kthread_prio_in = kthread_prio; 4309 struct rcu_node *rnp; 4310 struct rcu_state *rsp; 4311 struct sched_param sp; 4312 struct task_struct *t; 4313 4314 /* Force priority into range. 
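 * kthread_prio is clamped to 0..99; zero leaves the grace-period
 * kthreads at their default scheduling policy (the SCHED_FIFO
 * sched_setscheduler_nocheck() call below is skipped), and
 * CONFIG_RCU_BOOST kernels force a minimum of 1 so that these kthreads
 * run at real-time priority.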
*/ 4315 if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) 4316 kthread_prio = 1; 4317 else if (kthread_prio < 0) 4318 kthread_prio = 0; 4319 else if (kthread_prio > 99) 4320 kthread_prio = 99; 4321 if (kthread_prio != kthread_prio_in) 4322 pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", 4323 kthread_prio, kthread_prio_in); 4324 4325 rcu_scheduler_fully_active = 1; 4326 for_each_rcu_flavor(rsp) { 4327 t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); 4328 BUG_ON(IS_ERR(t)); 4329 rnp = rcu_get_root(rsp); 4330 raw_spin_lock_irqsave(&rnp->lock, flags); 4331 rsp->gp_kthread = t; 4332 if (kthread_prio) { 4333 sp.sched_priority = kthread_prio; 4334 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 4335 } 4336 wake_up_process(t); 4337 raw_spin_unlock_irqrestore(&rnp->lock, flags); 4338 } 4339 rcu_spawn_nocb_kthreads(); 4340 rcu_spawn_boost_kthreads(); 4341 return 0; 4342 } 4343 early_initcall(rcu_spawn_gp_kthread); 4344 4345 /* 4346 * This function is invoked towards the end of the scheduler's initialization 4347 * process. Before this is called, the idle task might contain 4348 * RCU read-side critical sections (during which time, this idle 4349 * task is booting the system). After this function is called, the 4350 * idle tasks are prohibited from containing RCU read-side critical 4351 * sections. This function also enables RCU lockdep checking. 4352 */ 4353 void rcu_scheduler_starting(void) 4354 { 4355 WARN_ON(num_online_cpus() != 1); 4356 WARN_ON(nr_context_switches() > 0); 4357 rcu_scheduler_active = 1; 4358 } 4359 4360 /* 4361 * Compute the per-level fanout, either using the exact fanout specified 4362 * or balancing the tree, depending on the rcu_fanout_exact boot parameter. 4363 */ 4364 static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) 4365 { 4366 int i; 4367 4368 if (rcu_fanout_exact) { 4369 levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; 4370 for (i = rcu_num_lvls - 2; i >= 0; i--) 4371 levelspread[i] = RCU_FANOUT; 4372 } else { 4373 int ccur; 4374 int cprv; 4375 4376 cprv = nr_cpu_ids; 4377 for (i = rcu_num_lvls - 1; i >= 0; i--) { 4378 ccur = levelcnt[i]; 4379 levelspread[i] = (cprv + ccur - 1) / ccur; 4380 cprv = ccur; 4381 } 4382 } 4383 } 4384 4385 /* 4386 * Helper function for rcu_init() that initializes one rcu_state structure. 4387 */ 4388 static void __init rcu_init_one(struct rcu_state *rsp, 4389 struct rcu_data __percpu *rda) 4390 { 4391 static const char * const buf[] = RCU_NODE_NAME_INIT; 4392 static const char * const fqs[] = RCU_FQS_NAME_INIT; 4393 static const char * const exp[] = RCU_EXP_NAME_INIT; 4394 static u8 fl_mask = 0x1; 4395 4396 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ 4397 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 4398 int cpustride = 1; 4399 int i; 4400 int j; 4401 struct rcu_node *rnp; 4402 4403 BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ 4404 4405 /* Silence gcc 4.8 false positive about array index out of range. */ 4406 if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) 4407 panic("rcu_init_one: rcu_num_lvls out of range"); 4408 4409 /* Initialize the level-tracking arrays. */ 4410 4411 for (i = 0; i < rcu_num_lvls; i++) 4412 levelcnt[i] = num_rcu_lvl[i]; 4413 for (i = 1; i < rcu_num_lvls; i++) 4414 rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; 4415 rcu_init_levelspread(levelspread, levelcnt); 4416 rsp->flavor_mask = fl_mask; 4417 fl_mask <<= 1; 4418 4419 /* Initialize the elements themselves, starting from the leaves. 
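 * Level rcu_num_lvls - 1 holds the leaves and level 0 holds the root;
 * each node's ->grplo/->grphi record the range of CPU numbers it
 * covers, and its ->parent is computed arithmetically into the next
 * level up as rsp->level[i - 1] + j / levelspread[i - 1].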
*/ 4420 4421 for (i = rcu_num_lvls - 1; i >= 0; i--) { 4422 cpustride *= levelspread[i]; 4423 rnp = rsp->level[i]; 4424 for (j = 0; j < levelcnt[i]; j++, rnp++) { 4425 raw_spin_lock_init(&rnp->lock); 4426 lockdep_set_class_and_name(&rnp->lock, 4427 &rcu_node_class[i], buf[i]); 4428 raw_spin_lock_init(&rnp->fqslock); 4429 lockdep_set_class_and_name(&rnp->fqslock, 4430 &rcu_fqs_class[i], fqs[i]); 4431 rnp->gpnum = rsp->gpnum; 4432 rnp->completed = rsp->completed; 4433 rnp->qsmask = 0; 4434 rnp->qsmaskinit = 0; 4435 rnp->grplo = j * cpustride; 4436 rnp->grphi = (j + 1) * cpustride - 1; 4437 if (rnp->grphi >= nr_cpu_ids) 4438 rnp->grphi = nr_cpu_ids - 1; 4439 if (i == 0) { 4440 rnp->grpnum = 0; 4441 rnp->grpmask = 0; 4442 rnp->parent = NULL; 4443 } else { 4444 rnp->grpnum = j % levelspread[i - 1]; 4445 rnp->grpmask = 1UL << rnp->grpnum; 4446 rnp->parent = rsp->level[i - 1] + 4447 j / levelspread[i - 1]; 4448 } 4449 rnp->level = i; 4450 INIT_LIST_HEAD(&rnp->blkd_tasks); 4451 rcu_init_one_nocb(rnp); 4452 mutex_init(&rnp->exp_funnel_mutex); 4453 lockdep_set_class_and_name(&rnp->exp_funnel_mutex, 4454 &rcu_exp_class[i], exp[i]); 4455 } 4456 } 4457 4458 init_waitqueue_head(&rsp->gp_wq); 4459 init_waitqueue_head(&rsp->expedited_wq); 4460 rnp = rsp->level[rcu_num_lvls - 1]; 4461 for_each_possible_cpu(i) { 4462 while (i > rnp->grphi) 4463 rnp++; 4464 per_cpu_ptr(rsp->rda, i)->mynode = rnp; 4465 rcu_boot_init_percpu_data(i, rsp); 4466 } 4467 list_add(&rsp->flavors, &rcu_struct_flavors); 4468 } 4469 4470 /* 4471 * Compute the rcu_node tree geometry from kernel parameters. This cannot 4472 * replace the definitions in tree.h because those are needed to size 4473 * the ->node array in the rcu_state structure. 4474 */ 4475 static void __init rcu_init_geometry(void) 4476 { 4477 ulong d; 4478 int i; 4479 int rcu_capacity[RCU_NUM_LVLS]; 4480 4481 /* 4482 * Initialize any unspecified boot parameters. 4483 * The default values of jiffies_till_first_fqs and 4484 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS 4485 * value, which is a function of HZ, plus one for each 4486 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. 4487 */ 4488 d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; 4489 if (jiffies_till_first_fqs == ULONG_MAX) 4490 jiffies_till_first_fqs = d; 4491 if (jiffies_till_next_fqs == ULONG_MAX) 4492 jiffies_till_next_fqs = d; 4493 4494 /* If the compile-time values are accurate, just leave. */ 4495 if (rcu_fanout_leaf == RCU_FANOUT_LEAF && 4496 nr_cpu_ids == NR_CPUS) 4497 return; 4498 pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", 4499 rcu_fanout_leaf, nr_cpu_ids); 4500 4501 /* 4502 * The boot-time rcu_fanout_leaf parameter must be at least two 4503 * and cannot exceed the number of bits in the rcu_node masks. 4504 * Complain and fall back to the compile-time values if this 4505 * limit is exceeded. 4506 */ 4507 if (rcu_fanout_leaf < 2 || 4508 rcu_fanout_leaf > sizeof(unsigned long) * 8) { 4509 rcu_fanout_leaf = RCU_FANOUT_LEAF; 4510 WARN_ON(1); 4511 return; 4512 } 4513 4514 /* 4515 * Compute number of nodes that can be handled by an rcu_node tree 4516 * with the given number of levels. 4517 */ 4518 rcu_capacity[0] = rcu_fanout_leaf; 4519 for (i = 1; i < RCU_NUM_LVLS; i++) 4520 rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; 4521 4522 /* 4523 * The tree must be able to accommodate the configured number of CPUs. 4524 * If this limit is exceeded, fall back to the compile-time values.
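 *
 * Worked example with purely illustrative values: if rcu_fanout_leaf
 * is 16, RCU_FANOUT is 64 and nr_cpu_ids is 100, then rcu_capacity[]
 * begins {16, 1024, ...}, so the loops below compute rcu_num_lvls = 2,
 * num_rcu_lvl[] = {DIV_ROUND_UP(100, 1024), DIV_ROUND_UP(100, 16)}
 * = {1, 7}, and rcu_num_nodes = 8: one root rcu_node with seven leaf
 * children, each covering at most 16 CPUs.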
4525 */ 4526 if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) { 4527 rcu_fanout_leaf = RCU_FANOUT_LEAF; 4528 WARN_ON(1); 4529 return; 4530 } 4531 4532 /* Calculate the number of levels in the tree. */ 4533 for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { 4534 } 4535 rcu_num_lvls = i + 1; 4536 4537 /* Calculate the number of rcu_nodes at each level of the tree. */ 4538 for (i = 0; i < rcu_num_lvls; i++) { 4539 int cap = rcu_capacity[(rcu_num_lvls - 1) - i]; 4540 num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap); 4541 } 4542 4543 /* Calculate the total number of rcu_node structures. */ 4544 rcu_num_nodes = 0; 4545 for (i = 0; i < rcu_num_lvls; i++) 4546 rcu_num_nodes += num_rcu_lvl[i]; 4547 } 4548 4549 /* 4550 * Dump out the structure of the rcu_node combining tree associated 4551 * with the rcu_state structure referenced by rsp. 4552 */ 4553 static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) 4554 { 4555 int level = 0; 4556 struct rcu_node *rnp; 4557 4558 pr_info("rcu_node tree layout dump\n"); 4559 pr_info(" "); 4560 rcu_for_each_node_breadth_first(rsp, rnp) { 4561 if (rnp->level != level) { 4562 pr_cont("\n"); 4563 pr_info(" "); 4564 level = rnp->level; 4565 } 4566 pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum); 4567 } 4568 pr_cont("\n"); 4569 } 4570 4571 void __init rcu_init(void) 4572 { 4573 int cpu; 4574 4575 rcu_early_boot_tests(); 4576 4577 rcu_bootup_announce(); 4578 rcu_init_geometry(); 4579 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 4580 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 4581 if (dump_tree) 4582 rcu_dump_rcu_node_tree(&rcu_sched_state); 4583 __rcu_init_preempt(); 4584 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 4585 4586 /* 4587 * We don't need protection against CPU-hotplug here because 4588 * this is called early in boot, before either interrupts 4589 * or the scheduler are operational. 4590 */ 4591 cpu_notifier(rcu_cpu_notify, 0); 4592 pm_notifier(rcu_pm_notify, 0); 4593 for_each_online_cpu(cpu) 4594 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 4595 } 4596 4597 #include "tree_plugin.h" 4598