1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 4 * 5 * Copyright IBM Corporation, 2008 6 * 7 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 8 * Manfred Spraul <manfred@colorfullife.com> 9 * Paul E. McKenney <paulmck@linux.ibm.com> 10 * 11 * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> 12 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 13 * 14 * For detailed explanation of Read-Copy Update mechanism see - 15 * Documentation/RCU 16 */ 17 18 #define pr_fmt(fmt) "rcu: " fmt 19 20 #include <linux/types.h> 21 #include <linux/kernel.h> 22 #include <linux/init.h> 23 #include <linux/spinlock.h> 24 #include <linux/smp.h> 25 #include <linux/rcupdate_wait.h> 26 #include <linux/interrupt.h> 27 #include <linux/sched.h> 28 #include <linux/sched/debug.h> 29 #include <linux/nmi.h> 30 #include <linux/atomic.h> 31 #include <linux/bitops.h> 32 #include <linux/export.h> 33 #include <linux/completion.h> 34 #include <linux/moduleparam.h> 35 #include <linux/percpu.h> 36 #include <linux/notifier.h> 37 #include <linux/cpu.h> 38 #include <linux/mutex.h> 39 #include <linux/time.h> 40 #include <linux/kernel_stat.h> 41 #include <linux/wait.h> 42 #include <linux/kthread.h> 43 #include <uapi/linux/sched/types.h> 44 #include <linux/prefetch.h> 45 #include <linux/delay.h> 46 #include <linux/random.h> 47 #include <linux/trace_events.h> 48 #include <linux/suspend.h> 49 #include <linux/ftrace.h> 50 #include <linux/tick.h> 51 #include <linux/sysrq.h> 52 #include <linux/kprobes.h> 53 #include <linux/gfp.h> 54 #include <linux/oom.h> 55 #include <linux/smpboot.h> 56 #include <linux/jiffies.h> 57 #include <linux/slab.h> 58 #include <linux/sched/isolation.h> 59 #include <linux/sched/clock.h> 60 #include <linux/vmalloc.h> 61 #include <linux/mm.h> 62 #include "../time/tick-internal.h" 63 64 #include "tree.h" 65 #include "rcu.h" 66 67 #ifdef MODULE_PARAM_PREFIX 68 #undef MODULE_PARAM_PREFIX 69 #endif 70 #define MODULE_PARAM_PREFIX "rcutree." 71 72 #ifndef data_race 73 #define data_race(expr) \ 74 ({ \ 75 expr; \ 76 }) 77 #endif 78 #ifndef ASSERT_EXCLUSIVE_WRITER 79 #define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) 80 #endif 81 #ifndef ASSERT_EXCLUSIVE_ACCESS 82 #define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) 83 #endif 84 85 /* Data structures. */ 86 87 /* 88 * Steal a bit from the bottom of ->dynticks for idle entry/exit 89 * control. Initially this is for TLB flushing. 90 */ 91 #define RCU_DYNTICK_CTRL_MASK 0x1 92 #define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1) 93 94 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { 95 .dynticks_nesting = 1, 96 .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, 97 .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), 98 }; 99 static struct rcu_state rcu_state = { 100 .level = { &rcu_state.node[0] }, 101 .gp_state = RCU_GP_IDLE, 102 .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, 103 .barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex), 104 .name = RCU_NAME, 105 .abbr = RCU_ABBR, 106 .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex), 107 .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex), 108 .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock), 109 }; 110 111 /* Dump rcu_node combining tree at boot to verify correct setup. */ 112 static bool dump_tree; 113 module_param(dump_tree, bool, 0444); 114 /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. 
*/ 115 static bool use_softirq = true; 116 module_param(use_softirq, bool, 0444); 117 /* Control rcu_node-tree auto-balancing at boot time. */ 118 static bool rcu_fanout_exact; 119 module_param(rcu_fanout_exact, bool, 0444); 120 /* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */ 121 static int rcu_fanout_leaf = RCU_FANOUT_LEAF; 122 module_param(rcu_fanout_leaf, int, 0444); 123 int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; 124 /* Number of rcu_nodes at specified level. */ 125 int num_rcu_lvl[] = NUM_RCU_LVL_INIT; 126 int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 127 128 /* 129 * The rcu_scheduler_active variable is initialized to the value 130 * RCU_SCHEDULER_INACTIVE and transitions RCU_SCHEDULER_INIT just before the 131 * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE, 132 * RCU can assume that there is but one task, allowing RCU to (for example) 133 * optimize synchronize_rcu() to a simple barrier(). When this variable 134 * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required 135 * to detect real grace periods. This variable is also used to suppress 136 * boot-time false positives from lockdep-RCU error checking. Finally, it 137 * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU 138 * is fully initialized, including all of its kthreads having been spawned. 139 */ 140 int rcu_scheduler_active __read_mostly; 141 EXPORT_SYMBOL_GPL(rcu_scheduler_active); 142 143 /* 144 * The rcu_scheduler_fully_active variable transitions from zero to one 145 * during the early_initcall() processing, which is after the scheduler 146 * is capable of creating new tasks. So RCU processing (for example, 147 * creating tasks for RCU priority boosting) must be delayed until after 148 * rcu_scheduler_fully_active transitions from zero to one. We also 149 * currently delay invocation of any RCU callbacks until after this point. 150 * 151 * It might later prove better for people registering RCU callbacks during 152 * early boot to take responsibility for these callbacks, but one step at 153 * a time. 154 */ 155 static int rcu_scheduler_fully_active __read_mostly; 156 157 static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, 158 unsigned long gps, unsigned long flags); 159 static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); 160 static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); 161 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 162 static void invoke_rcu_core(void); 163 static void rcu_report_exp_rdp(struct rcu_data *rdp); 164 static void sync_sched_exp_online_cleanup(int cpu); 165 static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); 166 167 /* rcuc/rcub kthread realtime priority */ 168 static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; 169 module_param(kthread_prio, int, 0444); 170 171 /* Delay in jiffies for grace-period initialization delays, debug only. */ 172 173 static int gp_preinit_delay; 174 module_param(gp_preinit_delay, int, 0444); 175 static int gp_init_delay; 176 module_param(gp_init_delay, int, 0444); 177 static int gp_cleanup_delay; 178 module_param(gp_cleanup_delay, int, 0444); 179 180 /* 181 * This rcu parameter is runtime-read-only. It reflects 182 * a minimum allowed number of objects which can be cached 183 * per-CPU. Object size is equal to one page. This value 184 * can be changed at boot time. 
185 */ 186 static int rcu_min_cached_objs = 2; 187 module_param(rcu_min_cached_objs, int, 0444); 188 189 /* Retrieve RCU kthreads priority for rcutorture */ 190 int rcu_get_gp_kthreads_prio(void) 191 { 192 return kthread_prio; 193 } 194 EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); 195 196 /* 197 * Number of grace periods between delays, normalized by the duration of 198 * the delay. The longer the delay, the more the grace periods between 199 * each delay. The reason for this normalization is that it means that, 200 * for non-zero delays, the overall slowdown of grace periods is constant 201 * regardless of the duration of the delay. This arrangement balances 202 * the need for long delays to increase some race probabilities with the 203 * need for fast grace periods to increase other race probabilities. 204 */ 205 #define PER_RCU_NODE_PERIOD 3 /* Number of grace periods between delays. */ 206 207 /* 208 * Compute the mask of online CPUs for the specified rcu_node structure. 209 * This will not be stable unless the rcu_node structure's ->lock is 210 * held, but the bit corresponding to the current CPU will be stable 211 * in most contexts. 212 */ 213 static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) 214 { 215 return READ_ONCE(rnp->qsmaskinitnext); 216 } 217 218 /* 219 * Return true if an RCU grace period is in progress. The READ_ONCE()s 220 * permit this function to be invoked without holding the root rcu_node 221 * structure's ->lock, but of course results can be subject to change. 222 */ 223 static int rcu_gp_in_progress(void) 224 { 225 return rcu_seq_state(rcu_seq_current(&rcu_state.gp_seq)); 226 } 227 228 /* 229 * Return the number of callbacks queued on the specified CPU. 230 * Handles both the nocbs and normal cases. 231 */ 232 static long rcu_get_n_cbs_cpu(int cpu) 233 { 234 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 235 236 if (rcu_segcblist_is_enabled(&rdp->cblist)) 237 return rcu_segcblist_n_cbs(&rdp->cblist); 238 return 0; 239 } 240 241 void rcu_softirq_qs(void) 242 { 243 rcu_qs(); 244 rcu_preempt_deferred_qs(current); 245 } 246 247 /* 248 * Record entry into an extended quiescent state. This is only to be 249 * called when not already in an extended quiescent state, that is, 250 * RCU is watching prior to the call to this function and is no longer 251 * watching upon return. 252 */ 253 static noinstr void rcu_dynticks_eqs_enter(void) 254 { 255 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 256 int seq; 257 258 /* 259 * CPUs seeing atomic_add_return() must see prior RCU read-side 260 * critical sections, and we also must force ordering with the 261 * next idle sojourn. 262 */ 263 rcu_dynticks_task_trace_enter(); // Before ->dynticks update! 264 seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); 265 // RCU is no longer watching. Better be in extended quiescent state! 266 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 267 (seq & RCU_DYNTICK_CTRL_CTR)); 268 /* Better not have special action (TLB flush) pending! */ 269 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 270 (seq & RCU_DYNTICK_CTRL_MASK)); 271 } 272 273 /* 274 * Record exit from an extended quiescent state. This is only to be 275 * called from an extended quiescent state, that is, RCU is not watching 276 * prior to the call to this function and is watching upon return. 
277 */ 278 static noinstr void rcu_dynticks_eqs_exit(void) 279 { 280 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 281 int seq; 282 283 /* 284 * CPUs seeing atomic_add_return() must see prior idle sojourns, 285 * and we also must force ordering with the next RCU read-side 286 * critical section. 287 */ 288 seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); 289 // RCU is now watching. Better not be in an extended quiescent state! 290 rcu_dynticks_task_trace_exit(); // After ->dynticks update! 291 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 292 !(seq & RCU_DYNTICK_CTRL_CTR)); 293 if (seq & RCU_DYNTICK_CTRL_MASK) { 294 arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks); 295 smp_mb__after_atomic(); /* _exit after clearing mask. */ 296 } 297 } 298 299 /* 300 * Reset the current CPU's ->dynticks counter to indicate that the 301 * newly onlined CPU is no longer in an extended quiescent state. 302 * This will either leave the counter unchanged, or increment it 303 * to the next non-quiescent value. 304 * 305 * The non-atomic test/increment sequence works because the upper bits 306 * of the ->dynticks counter are manipulated only by the corresponding CPU, 307 * or when the corresponding CPU is offline. 308 */ 309 static void rcu_dynticks_eqs_online(void) 310 { 311 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 312 313 if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR) 314 return; 315 atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks); 316 } 317 318 /* 319 * Is the current CPU in an extended quiescent state? 320 * 321 * No ordering, as we are sampling CPU-local information. 322 */ 323 static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) 324 { 325 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 326 327 return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR); 328 } 329 330 /* 331 * Snapshot the ->dynticks counter with full ordering so as to allow 332 * stable comparison of this counter with past and future snapshots. 333 */ 334 static int rcu_dynticks_snap(struct rcu_data *rdp) 335 { 336 int snap = atomic_add_return(0, &rdp->dynticks); 337 338 return snap & ~RCU_DYNTICK_CTRL_MASK; 339 } 340 341 /* 342 * Return true if the snapshot returned from rcu_dynticks_snap() 343 * indicates that RCU is in an extended quiescent state. 344 */ 345 static bool rcu_dynticks_in_eqs(int snap) 346 { 347 return !(snap & RCU_DYNTICK_CTRL_CTR); 348 } 349 350 /* 351 * Return true if the CPU corresponding to the specified rcu_data 352 * structure has spent some time in an extended quiescent state since 353 * rcu_dynticks_snap() returned the specified snapshot. 354 */ 355 static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap) 356 { 357 return snap != rcu_dynticks_snap(rdp); 358 } 359 360 /* 361 * Return true if the referenced integer is zero while the specified 362 * CPU remains within a single extended quiescent state. 363 */ 364 bool rcu_dynticks_zero_in_eqs(int cpu, int *vp) 365 { 366 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 367 int snap; 368 369 // If not quiescent, force back to earlier extended quiescent state. 370 snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK | 371 RCU_DYNTICK_CTRL_CTR); 372 373 smp_rmb(); // Order ->dynticks and *vp reads. 374 if (READ_ONCE(*vp)) 375 return false; // Non-zero, so report failure; 376 smp_rmb(); // Order *vp read and ->dynticks re-read. 377 378 // If still in the same extended quiescent state, we are good! 
379 return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK); 380 } 381 382 /* 383 * Set the special (bottom) bit of the specified CPU so that it 384 * will take special action (such as flushing its TLB) on the 385 * next exit from an extended quiescent state. Returns true if 386 * the bit was successfully set, or false if the CPU was not in 387 * an extended quiescent state. 388 */ 389 bool rcu_eqs_special_set(int cpu) 390 { 391 int old; 392 int new; 393 int new_old; 394 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 395 396 new_old = atomic_read(&rdp->dynticks); 397 do { 398 old = new_old; 399 if (old & RCU_DYNTICK_CTRL_CTR) 400 return false; 401 new = old | RCU_DYNTICK_CTRL_MASK; 402 new_old = atomic_cmpxchg(&rdp->dynticks, old, new); 403 } while (new_old != old); 404 return true; 405 } 406 407 /* 408 * Let the RCU core know that this CPU has gone through the scheduler, 409 * which is a quiescent state. This is called when the need for a 410 * quiescent state is urgent, so we burn an atomic operation and full 411 * memory barriers to let the RCU core know about it, regardless of what 412 * this CPU might (or might not) do in the near future. 413 * 414 * We inform the RCU core by emulating a zero-duration dyntick-idle period. 415 * 416 * The caller must have disabled interrupts and must not be idle. 417 */ 418 void rcu_momentary_dyntick_idle(void) 419 { 420 int special; 421 422 raw_cpu_write(rcu_data.rcu_need_heavy_qs, false); 423 special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, 424 &this_cpu_ptr(&rcu_data)->dynticks); 425 /* It is illegal to call this from idle state. */ 426 WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); 427 rcu_preempt_deferred_qs(current); 428 } 429 EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle); 430 431 /** 432 * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle 433 * 434 * If the current CPU is idle and running at a first-level (not nested) 435 * interrupt, or directly, from idle, return true. 436 * 437 * The caller must have at least disabled IRQs. 438 */ 439 static int rcu_is_cpu_rrupt_from_idle(void) 440 { 441 long nesting; 442 443 /* 444 * Usually called from the tick; but also used from smp_function_call() 445 * for expedited grace periods. This latter can result in running from 446 * the idle task, instead of an actual IPI. 447 */ 448 lockdep_assert_irqs_disabled(); 449 450 /* Check for counter underflows */ 451 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, 452 "RCU dynticks_nesting counter underflow!"); 453 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0, 454 "RCU dynticks_nmi_nesting counter underflow/zero!"); 455 456 /* Are we at first interrupt nesting level? */ 457 nesting = __this_cpu_read(rcu_data.dynticks_nmi_nesting); 458 if (nesting > 1) 459 return false; 460 461 /* 462 * If we're not in an interrupt, we must be in the idle task! 463 */ 464 WARN_ON_ONCE(!nesting && !is_idle_task(current)); 465 466 /* Does CPU appear to be idle from an RCU standpoint? */ 467 return __this_cpu_read(rcu_data.dynticks_nesting) == 0; 468 } 469 470 #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ 471 #define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ 472 static long blimit = DEFAULT_RCU_BLIMIT; 473 #define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ 474 static long qhimark = DEFAULT_RCU_QHIMARK; 475 #define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. 
*/ 476 static long qlowmark = DEFAULT_RCU_QLOMARK; 477 #define DEFAULT_RCU_QOVLD_MULT 2 478 #define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK) 479 static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */ 480 static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */ 481 482 module_param(blimit, long, 0444); 483 module_param(qhimark, long, 0444); 484 module_param(qlowmark, long, 0444); 485 module_param(qovld, long, 0444); 486 487 static ulong jiffies_till_first_fqs = ULONG_MAX; 488 static ulong jiffies_till_next_fqs = ULONG_MAX; 489 static bool rcu_kick_kthreads; 490 static int rcu_divisor = 7; 491 module_param(rcu_divisor, int, 0644); 492 493 /* Force an exit from rcu_do_batch() after 3 milliseconds. */ 494 static long rcu_resched_ns = 3 * NSEC_PER_MSEC; 495 module_param(rcu_resched_ns, long, 0644); 496 497 /* 498 * How long the grace period must be before we start recruiting 499 * quiescent-state help from rcu_note_context_switch(). 500 */ 501 static ulong jiffies_till_sched_qs = ULONG_MAX; 502 module_param(jiffies_till_sched_qs, ulong, 0444); 503 static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */ 504 module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */ 505 506 /* 507 * Make sure that we give the grace-period kthread time to detect any 508 * idle CPUs before taking active measures to force quiescent states. 509 * However, don't go below 100 milliseconds, adjusted upwards for really 510 * large systems. 511 */ 512 static void adjust_jiffies_till_sched_qs(void) 513 { 514 unsigned long j; 515 516 /* If jiffies_till_sched_qs was specified, respect the request. */ 517 if (jiffies_till_sched_qs != ULONG_MAX) { 518 WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs); 519 return; 520 } 521 /* Otherwise, set to third fqs scan, but bound below on large system. */ 522 j = READ_ONCE(jiffies_till_first_fqs) + 523 2 * READ_ONCE(jiffies_till_next_fqs); 524 if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) 525 j = HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; 526 pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j); 527 WRITE_ONCE(jiffies_to_sched_qs, j); 528 } 529 530 static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp) 531 { 532 ulong j; 533 int ret = kstrtoul(val, 0, &j); 534 535 if (!ret) { 536 WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : j); 537 adjust_jiffies_till_sched_qs(); 538 } 539 return ret; 540 } 541 542 static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp) 543 { 544 ulong j; 545 int ret = kstrtoul(val, 0, &j); 546 547 if (!ret) { 548 WRITE_ONCE(*(ulong *)kp->arg, (j > HZ) ? HZ : (j ?: 1)); 549 adjust_jiffies_till_sched_qs(); 550 } 551 return ret; 552 } 553 554 static struct kernel_param_ops first_fqs_jiffies_ops = { 555 .set = param_set_first_fqs_jiffies, 556 .get = param_get_ulong, 557 }; 558 559 static struct kernel_param_ops next_fqs_jiffies_ops = { 560 .set = param_set_next_fqs_jiffies, 561 .get = param_get_ulong, 562 }; 563 564 module_param_cb(jiffies_till_first_fqs, &first_fqs_jiffies_ops, &jiffies_till_first_fqs, 0644); 565 module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next_fqs, 0644); 566 module_param(rcu_kick_kthreads, bool, 0644); 567 568 static void force_qs_rnp(int (*f)(struct rcu_data *rdp)); 569 static int rcu_pending(int user); 570 571 /* 572 * Return the number of RCU GPs completed thus far for debug & stats. 
573 */ 574 unsigned long rcu_get_gp_seq(void) 575 { 576 return READ_ONCE(rcu_state.gp_seq); 577 } 578 EXPORT_SYMBOL_GPL(rcu_get_gp_seq); 579 580 /* 581 * Return the number of RCU expedited batches completed thus far for 582 * debug & stats. Odd numbers mean that a batch is in progress, even 583 * numbers mean idle. The value returned will thus be roughly double 584 * the cumulative batches since boot. 585 */ 586 unsigned long rcu_exp_batches_completed(void) 587 { 588 return rcu_state.expedited_sequence; 589 } 590 EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); 591 592 /* 593 * Return the root node of the rcu_state structure. 594 */ 595 static struct rcu_node *rcu_get_root(void) 596 { 597 return &rcu_state.node[0]; 598 } 599 600 /* 601 * Send along grace-period-related data for rcutorture diagnostics. 602 */ 603 void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, 604 unsigned long *gp_seq) 605 { 606 switch (test_type) { 607 case RCU_FLAVOR: 608 *flags = READ_ONCE(rcu_state.gp_flags); 609 *gp_seq = rcu_seq_current(&rcu_state.gp_seq); 610 break; 611 default: 612 break; 613 } 614 } 615 EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); 616 617 /* 618 * Enter an RCU extended quiescent state, which can be either the 619 * idle loop or adaptive-tickless usermode execution. 620 * 621 * We crowbar the ->dynticks_nmi_nesting field to zero to allow for 622 * the possibility of usermode upcalls having messed up our count 623 * of interrupt nesting level during the prior busy period. 624 */ 625 static noinstr void rcu_eqs_enter(bool user) 626 { 627 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 628 629 WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE); 630 WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); 631 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 632 rdp->dynticks_nesting == 0); 633 if (rdp->dynticks_nesting != 1) { 634 // RCU will still be watching, so just do accounting and leave. 635 rdp->dynticks_nesting--; 636 return; 637 } 638 639 lockdep_assert_irqs_disabled(); 640 instrumentation_begin(); 641 trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); 642 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); 643 rdp = this_cpu_ptr(&rcu_data); 644 do_nocb_deferred_wakeup(rdp); 645 rcu_prepare_for_idle(); 646 rcu_preempt_deferred_qs(current); 647 648 // instrumentation for the noinstr rcu_dynticks_eqs_enter() 649 instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); 650 651 instrumentation_end(); 652 WRITE_ONCE(rdp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ 653 // RCU is watching here ... 654 rcu_dynticks_eqs_enter(); 655 // ... but is no longer watching here. 656 rcu_dynticks_task_enter(); 657 } 658 659 /** 660 * rcu_idle_enter - inform RCU that current CPU is entering idle 661 * 662 * Enter idle mode, in other words, -leave- the mode in which RCU 663 * read-side critical sections can occur. (Though RCU read-side 664 * critical sections can occur in irq handlers in idle, a possibility 665 * handled by irq_enter() and irq_exit().) 666 * 667 * If you add or remove a call to rcu_idle_enter(), be sure to test with 668 * CONFIG_RCU_EQS_DEBUG=y. 669 */ 670 void rcu_idle_enter(void) 671 { 672 lockdep_assert_irqs_disabled(); 673 rcu_eqs_enter(false); 674 } 675 676 #ifdef CONFIG_NO_HZ_FULL 677 /** 678 * rcu_user_enter - inform RCU that we are resuming userspace. 679 * 680 * Enter RCU idle mode right before resuming userspace. No use of RCU 681 * is permitted between this call and rcu_user_exit(). 
This way the 682 * CPU doesn't need to maintain the tick for RCU maintenance purposes 683 * when the CPU runs in userspace. 684 * 685 * If you add or remove a call to rcu_user_enter(), be sure to test with 686 * CONFIG_RCU_EQS_DEBUG=y. 687 */ 688 noinstr void rcu_user_enter(void) 689 { 690 lockdep_assert_irqs_disabled(); 691 rcu_eqs_enter(true); 692 } 693 #endif /* CONFIG_NO_HZ_FULL */ 694 695 /** 696 * rcu_nmi_exit - inform RCU of exit from NMI context 697 * 698 * If we are returning from the outermost NMI handler that interrupted an 699 * RCU-idle period, update rdp->dynticks and rdp->dynticks_nmi_nesting 700 * to let the RCU grace-period handling know that the CPU is back to 701 * being RCU-idle. 702 * 703 * If you add or remove a call to rcu_nmi_exit(), be sure to test 704 * with CONFIG_RCU_EQS_DEBUG=y. 705 */ 706 noinstr void rcu_nmi_exit(void) 707 { 708 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 709 710 instrumentation_begin(); 711 /* 712 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. 713 * (We are exiting an NMI handler, so RCU better be paying attention 714 * to us!) 715 */ 716 WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0); 717 WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); 718 719 /* 720 * If the nesting level is not 1, the CPU wasn't RCU-idle, so 721 * leave it in non-RCU-idle state. 722 */ 723 if (rdp->dynticks_nmi_nesting != 1) { 724 trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, 725 atomic_read(&rdp->dynticks)); 726 WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ 727 rdp->dynticks_nmi_nesting - 2); 728 instrumentation_end(); 729 return; 730 } 731 732 /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ 733 trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); 734 WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ 735 736 if (!in_nmi()) 737 rcu_prepare_for_idle(); 738 739 // instrumentation for the noinstr rcu_dynticks_eqs_enter() 740 instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); 741 instrumentation_end(); 742 743 // RCU is watching here ... 744 rcu_dynticks_eqs_enter(); 745 // ... but is no longer watching here. 746 747 if (!in_nmi()) 748 rcu_dynticks_task_enter(); 749 } 750 751 /** 752 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 753 * 754 * Exit from an interrupt handler, which might possibly result in entering 755 * idle mode, in other words, leaving the mode in which read-side critical 756 * sections can occur. The caller must have disabled interrupts. 757 * 758 * This code assumes that the idle loop never does anything that might 759 * result in unbalanced calls to irq_enter() and irq_exit(). If your 760 * architecture's idle loop violates this assumption, RCU will give you what 761 * you deserve, good and hard. But very infrequently and irreproducibly. 762 * 763 * Use things like work queues to work around this limitation. 764 * 765 * You have been warned. 766 * 767 * If you add or remove a call to rcu_irq_exit(), be sure to test with 768 * CONFIG_RCU_EQS_DEBUG=y. 769 */ 770 void noinstr rcu_irq_exit(void) 771 { 772 lockdep_assert_irqs_disabled(); 773 rcu_nmi_exit(); 774 } 775 776 /** 777 * rcu_irq_exit_preempt - Inform RCU that current CPU is exiting irq 778 * towards in kernel preemption 779 * 780 * Same as rcu_irq_exit() but has a sanity check that scheduling is safe 781 * from RCU point of view. Invoked from return from interrupt before kernel 782 * preemption. 
783 */ 784 void rcu_irq_exit_preempt(void) 785 { 786 lockdep_assert_irqs_disabled(); 787 rcu_nmi_exit(); 788 789 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, 790 "RCU dynticks_nesting counter underflow/zero!"); 791 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 792 DYNTICK_IRQ_NONIDLE, 793 "Bad RCU dynticks_nmi_nesting counter\n"); 794 RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), 795 "RCU in extended quiescent state!"); 796 } 797 798 #ifdef CONFIG_PROVE_RCU 799 /** 800 * rcu_irq_exit_check_preempt - Validate that scheduling is possible 801 */ 802 void rcu_irq_exit_check_preempt(void) 803 { 804 lockdep_assert_irqs_disabled(); 805 806 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) <= 0, 807 "RCU dynticks_nesting counter underflow/zero!"); 808 RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 809 DYNTICK_IRQ_NONIDLE, 810 "Bad RCU dynticks_nmi_nesting counter\n"); 811 RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), 812 "RCU in extended quiescent state!"); 813 } 814 #endif /* #ifdef CONFIG_PROVE_RCU */ 815 816 /* 817 * Wrapper for rcu_irq_exit() where interrupts are enabled. 818 * 819 * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test 820 * with CONFIG_RCU_EQS_DEBUG=y. 821 */ 822 void rcu_irq_exit_irqson(void) 823 { 824 unsigned long flags; 825 826 local_irq_save(flags); 827 rcu_irq_exit(); 828 local_irq_restore(flags); 829 } 830 831 /* 832 * Exit an RCU extended quiescent state, which can be either the 833 * idle loop or adaptive-tickless usermode execution. 834 * 835 * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to 836 * allow for the possibility of usermode upcalls messing up our count of 837 * interrupt nesting level during the busy period that is just now starting. 838 */ 839 static void noinstr rcu_eqs_exit(bool user) 840 { 841 struct rcu_data *rdp; 842 long oldval; 843 844 lockdep_assert_irqs_disabled(); 845 rdp = this_cpu_ptr(&rcu_data); 846 oldval = rdp->dynticks_nesting; 847 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); 848 if (oldval) { 849 // RCU was already watching, so just do accounting and leave. 850 rdp->dynticks_nesting++; 851 return; 852 } 853 rcu_dynticks_task_exit(); 854 // RCU is not watching here ... 855 rcu_dynticks_eqs_exit(); 856 // ... but is watching here. 857 instrumentation_begin(); 858 859 // instrumentation for the noinstr rcu_dynticks_eqs_exit() 860 instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); 861 862 rcu_cleanup_after_idle(); 863 trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); 864 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); 865 WRITE_ONCE(rdp->dynticks_nesting, 1); 866 WARN_ON_ONCE(rdp->dynticks_nmi_nesting); 867 WRITE_ONCE(rdp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); 868 instrumentation_end(); 869 } 870 871 /** 872 * rcu_idle_exit - inform RCU that current CPU is leaving idle 873 * 874 * Exit idle mode, in other words, -enter- the mode in which RCU 875 * read-side critical sections can occur. 876 * 877 * If you add or remove a call to rcu_idle_exit(), be sure to test with 878 * CONFIG_RCU_EQS_DEBUG=y. 879 */ 880 void rcu_idle_exit(void) 881 { 882 unsigned long flags; 883 884 local_irq_save(flags); 885 rcu_eqs_exit(false); 886 local_irq_restore(flags); 887 } 888 889 #ifdef CONFIG_NO_HZ_FULL 890 /** 891 * rcu_user_exit - inform RCU that we are exiting userspace. 
892 * 893 * Exit RCU idle mode while entering the kernel because it can 894 * run a RCU read side critical section anytime. 895 * 896 * If you add or remove a call to rcu_user_exit(), be sure to test with 897 * CONFIG_RCU_EQS_DEBUG=y. 898 */ 899 void noinstr rcu_user_exit(void) 900 { 901 rcu_eqs_exit(1); 902 } 903 904 /** 905 * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it. 906 * 907 * The scheduler tick is not normally enabled when CPUs enter the kernel 908 * from nohz_full userspace execution. After all, nohz_full userspace 909 * execution is an RCU quiescent state and the time executing in the kernel 910 * is quite short. Except of course when it isn't. And it is not hard to 911 * cause a large system to spend tens of seconds or even minutes looping 912 * in the kernel, which can cause a number of problems, include RCU CPU 913 * stall warnings. 914 * 915 * Therefore, if a nohz_full CPU fails to report a quiescent state 916 * in a timely manner, the RCU grace-period kthread sets that CPU's 917 * ->rcu_urgent_qs flag with the expectation that the next interrupt or 918 * exception will invoke this function, which will turn on the scheduler 919 * tick, which will enable RCU to detect that CPU's quiescent states, 920 * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels. 921 * The tick will be disabled once a quiescent state is reported for 922 * this CPU. 923 * 924 * Of course, in carefully tuned systems, there might never be an 925 * interrupt or exception. In that case, the RCU grace-period kthread 926 * will eventually cause one to happen. However, in less carefully 927 * controlled environments, this function allows RCU to get what it 928 * needs without creating otherwise useless interruptions. 929 */ 930 void __rcu_irq_enter_check_tick(void) 931 { 932 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 933 934 // Enabling the tick is unsafe in NMI handlers. 935 if (WARN_ON_ONCE(in_nmi())) 936 return; 937 938 RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), 939 "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); 940 941 if (!tick_nohz_full_cpu(rdp->cpu) || 942 !READ_ONCE(rdp->rcu_urgent_qs) || 943 READ_ONCE(rdp->rcu_forced_tick)) { 944 // RCU doesn't need nohz_full help from this CPU, or it is 945 // already getting that help. 946 return; 947 } 948 949 // We get here only when not in an extended quiescent state and 950 // from interrupts (as opposed to NMIs). Therefore, (1) RCU is 951 // already watching and (2) The fact that we are in an interrupt 952 // handler and that the rcu_node lock is an irq-disabled lock 953 // prevents self-deadlock. So we can safely recheck under the lock. 954 // Note that the nohz_full state currently cannot change. 955 raw_spin_lock_rcu_node(rdp->mynode); 956 if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { 957 // A nohz_full CPU is in the kernel and RCU needs a 958 // quiescent state. Turn on the tick! 959 WRITE_ONCE(rdp->rcu_forced_tick, true); 960 tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); 961 } 962 raw_spin_unlock_rcu_node(rdp->mynode); 963 } 964 #endif /* CONFIG_NO_HZ_FULL */ 965 966 /** 967 * rcu_nmi_enter - inform RCU of entry to NMI context 968 * 969 * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and 970 * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know 971 * that the CPU is active. This implementation permits nested NMIs, as 972 * long as the nesting level does not overflow an int. (You will probably 973 * run out of stack space first.) 
974 * 975 * If you add or remove a call to rcu_nmi_enter(), be sure to test 976 * with CONFIG_RCU_EQS_DEBUG=y. 977 */ 978 noinstr void rcu_nmi_enter(void) 979 { 980 long incby = 2; 981 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 982 983 /* Complain about underflow. */ 984 WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0); 985 986 /* 987 * If idle from RCU viewpoint, atomically increment ->dynticks 988 * to mark non-idle and increment ->dynticks_nmi_nesting by one. 989 * Otherwise, increment ->dynticks_nmi_nesting by two. This means 990 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed 991 * to be in the outermost NMI handler that interrupted an RCU-idle 992 * period (observation due to Andy Lutomirski). 993 */ 994 if (rcu_dynticks_curr_cpu_in_eqs()) { 995 996 if (!in_nmi()) 997 rcu_dynticks_task_exit(); 998 999 // RCU is not watching here ... 1000 rcu_dynticks_eqs_exit(); 1001 // ... but is watching here. 1002 1003 if (!in_nmi()) { 1004 instrumentation_begin(); 1005 rcu_cleanup_after_idle(); 1006 instrumentation_end(); 1007 } 1008 1009 instrumentation_begin(); 1010 // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() 1011 instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks)); 1012 // instrumentation for the noinstr rcu_dynticks_eqs_exit() 1013 instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); 1014 1015 incby = 1; 1016 } else if (!in_nmi()) { 1017 instrumentation_begin(); 1018 rcu_irq_enter_check_tick(); 1019 instrumentation_end(); 1020 } else { 1021 instrumentation_begin(); 1022 } 1023 1024 trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), 1025 rdp->dynticks_nmi_nesting, 1026 rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); 1027 instrumentation_end(); 1028 WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ 1029 rdp->dynticks_nmi_nesting + incby); 1030 barrier(); 1031 } 1032 1033 /** 1034 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle 1035 * 1036 * Enter an interrupt handler, which might possibly result in exiting 1037 * idle mode, in other words, entering the mode in which read-side critical 1038 * sections can occur. The caller must have disabled interrupts. 1039 * 1040 * Note that the Linux kernel is fully capable of entering an interrupt 1041 * handler that it never exits, for example when doing upcalls to user mode! 1042 * This code assumes that the idle loop never does upcalls to user mode. 1043 * If your architecture's idle loop does do upcalls to user mode (or does 1044 * anything else that results in unbalanced calls to the irq_enter() and 1045 * irq_exit() functions), RCU will give you what you deserve, good and hard. 1046 * But very infrequently and irreproducibly. 1047 * 1048 * Use things like work queues to work around this limitation. 1049 * 1050 * You have been warned. 1051 * 1052 * If you add or remove a call to rcu_irq_enter(), be sure to test with 1053 * CONFIG_RCU_EQS_DEBUG=y. 1054 */ 1055 noinstr void rcu_irq_enter(void) 1056 { 1057 lockdep_assert_irqs_disabled(); 1058 rcu_nmi_enter(); 1059 } 1060 1061 /* 1062 * Wrapper for rcu_irq_enter() where interrupts are enabled. 1063 * 1064 * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test 1065 * with CONFIG_RCU_EQS_DEBUG=y. 
1066 */ 1067 void rcu_irq_enter_irqson(void) 1068 { 1069 unsigned long flags; 1070 1071 local_irq_save(flags); 1072 rcu_irq_enter(); 1073 local_irq_restore(flags); 1074 } 1075 1076 /* 1077 * If any sort of urgency was applied to the current CPU (for example, 1078 * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order 1079 * to get to a quiescent state, disable it. 1080 */ 1081 static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) 1082 { 1083 raw_lockdep_assert_held_rcu_node(rdp->mynode); 1084 WRITE_ONCE(rdp->rcu_urgent_qs, false); 1085 WRITE_ONCE(rdp->rcu_need_heavy_qs, false); 1086 if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { 1087 tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU); 1088 WRITE_ONCE(rdp->rcu_forced_tick, false); 1089 } 1090 } 1091 1092 noinstr bool __rcu_is_watching(void) 1093 { 1094 return !rcu_dynticks_curr_cpu_in_eqs(); 1095 } 1096 1097 /** 1098 * rcu_is_watching - see if RCU thinks that the current CPU is not idle 1099 * 1100 * Return true if RCU is watching the running CPU, which means that this 1101 * CPU can safely enter RCU read-side critical sections. In other words, 1102 * if the current CPU is not in its idle loop or is in an interrupt or 1103 * NMI handler, return true. 1104 */ 1105 bool rcu_is_watching(void) 1106 { 1107 bool ret; 1108 1109 preempt_disable_notrace(); 1110 ret = !rcu_dynticks_curr_cpu_in_eqs(); 1111 preempt_enable_notrace(); 1112 return ret; 1113 } 1114 EXPORT_SYMBOL_GPL(rcu_is_watching); 1115 1116 /* 1117 * If a holdout task is actually running, request an urgent quiescent 1118 * state from its CPU. This is unsynchronized, so migrations can cause 1119 * the request to go to the wrong CPU. Which is OK, all that will happen 1120 * is that the CPU's next context switch will be a bit slower and next 1121 * time around this task will generate another request. 1122 */ 1123 void rcu_request_urgent_qs_task(struct task_struct *t) 1124 { 1125 int cpu; 1126 1127 barrier(); 1128 cpu = task_cpu(t); 1129 if (!task_curr(t)) 1130 return; /* This task is not running on that CPU. */ 1131 smp_store_release(per_cpu_ptr(&rcu_data.rcu_urgent_qs, cpu), true); 1132 } 1133 1134 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 1135 1136 /* 1137 * Is the current CPU online as far as RCU is concerned? 1138 * 1139 * Disable preemption to avoid false positives that could otherwise 1140 * happen due to the current CPU number being sampled, this task being 1141 * preempted, its old CPU being taken offline, resuming on some other CPU, 1142 * then determining that its old CPU is now offline. 1143 * 1144 * Disable checking if in an NMI handler because we cannot safely 1145 * report errors from NMI handlers anyway. In addition, it is OK to use 1146 * RCU on an offline processor during initial boot, hence the check for 1147 * rcu_scheduler_fully_active. 
1148 */ 1149 bool rcu_lockdep_current_cpu_online(void) 1150 { 1151 struct rcu_data *rdp; 1152 struct rcu_node *rnp; 1153 bool ret = false; 1154 1155 if (in_nmi() || !rcu_scheduler_fully_active) 1156 return true; 1157 preempt_disable_notrace(); 1158 rdp = this_cpu_ptr(&rcu_data); 1159 rnp = rdp->mynode; 1160 if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) 1161 ret = true; 1162 preempt_enable_notrace(); 1163 return ret; 1164 } 1165 EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); 1166 1167 #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ 1168 1169 /* 1170 * We are reporting a quiescent state on behalf of some other CPU, so 1171 * it is our responsibility to check for and handle potential overflow 1172 * of the rcu_node ->gp_seq counter with respect to the rcu_data counters. 1173 * After all, the CPU might be in deep idle state, and thus executing no 1174 * code whatsoever. 1175 */ 1176 static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) 1177 { 1178 raw_lockdep_assert_held_rcu_node(rnp); 1179 if (ULONG_CMP_LT(rcu_seq_current(&rdp->gp_seq) + ULONG_MAX / 4, 1180 rnp->gp_seq)) 1181 WRITE_ONCE(rdp->gpwrap, true); 1182 if (ULONG_CMP_LT(rdp->rcu_iw_gp_seq + ULONG_MAX / 4, rnp->gp_seq)) 1183 rdp->rcu_iw_gp_seq = rnp->gp_seq + ULONG_MAX / 4; 1184 } 1185 1186 /* 1187 * Snapshot the specified CPU's dynticks counter so that we can later 1188 * credit them with an implicit quiescent state. Return 1 if this CPU 1189 * is in dynticks idle mode, which is an extended quiescent state. 1190 */ 1191 static int dyntick_save_progress_counter(struct rcu_data *rdp) 1192 { 1193 rdp->dynticks_snap = rcu_dynticks_snap(rdp); 1194 if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { 1195 trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); 1196 rcu_gpnum_ovf(rdp->mynode, rdp); 1197 return 1; 1198 } 1199 return 0; 1200 } 1201 1202 /* 1203 * Return true if the specified CPU has passed through a quiescent 1204 * state by virtue of being in or having passed through an dynticks 1205 * idle state since the last call to dyntick_save_progress_counter() 1206 * for this same CPU, or by virtue of having been offline. 1207 */ 1208 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 1209 { 1210 unsigned long jtsq; 1211 bool *rnhqp; 1212 bool *ruqp; 1213 struct rcu_node *rnp = rdp->mynode; 1214 1215 /* 1216 * If the CPU passed through or entered a dynticks idle phase with 1217 * no active irq/NMI handlers, then we can safely pretend that the CPU 1218 * already acknowledged the request to pass through a quiescent 1219 * state. Either way, that CPU cannot possibly be in an RCU 1220 * read-side critical section that started before the beginning 1221 * of the current RCU grace period. 1222 */ 1223 if (rcu_dynticks_in_eqs_since(rdp, rdp->dynticks_snap)) { 1224 trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti")); 1225 rcu_gpnum_ovf(rnp, rdp); 1226 return 1; 1227 } 1228 1229 /* If waiting too long on an offline CPU, complain. */ 1230 if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && 1231 time_after(jiffies, rcu_state.gp_start + HZ)) { 1232 bool onl; 1233 struct rcu_node *rnp1; 1234 1235 WARN_ON(1); /* Offline CPUs are supposed to report QS! 
*/ 1236 pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", 1237 __func__, rnp->grplo, rnp->grphi, rnp->level, 1238 (long)rnp->gp_seq, (long)rnp->completedqs); 1239 for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) 1240 pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n", 1241 __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask); 1242 onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); 1243 pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n", 1244 __func__, rdp->cpu, ".o"[onl], 1245 (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, 1246 (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); 1247 return 1; /* Break things loose after complaining. */ 1248 } 1249 1250 /* 1251 * A CPU running for an extended time within the kernel can 1252 * delay RCU grace periods: (1) At age jiffies_to_sched_qs, 1253 * set .rcu_urgent_qs, (2) At age 2*jiffies_to_sched_qs, set 1254 * both .rcu_need_heavy_qs and .rcu_urgent_qs. Note that the 1255 * unsynchronized assignments to the per-CPU rcu_need_heavy_qs 1256 * variable are safe because the assignments are repeated if this 1257 * CPU failed to pass through a quiescent state. This code 1258 * also checks .jiffies_resched in case jiffies_to_sched_qs 1259 * is set way high. 1260 */ 1261 jtsq = READ_ONCE(jiffies_to_sched_qs); 1262 ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu); 1263 rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu); 1264 if (!READ_ONCE(*rnhqp) && 1265 (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || 1266 time_after(jiffies, rcu_state.jiffies_resched) || 1267 rcu_state.cbovld)) { 1268 WRITE_ONCE(*rnhqp, true); 1269 /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ 1270 smp_store_release(ruqp, true); 1271 } else if (time_after(jiffies, rcu_state.gp_start + jtsq)) { 1272 WRITE_ONCE(*ruqp, true); 1273 } 1274 1275 /* 1276 * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq! 1277 * The above code handles this, but only for straight cond_resched(). 1278 * And some in-kernel loops check need_resched() before calling 1279 * cond_resched(), which defeats the above code for CPUs that are 1280 * running in-kernel with scheduling-clock interrupts disabled. 1281 * So hit them over the head with the resched_cpu() hammer! 1282 */ 1283 if (tick_nohz_full_cpu(rdp->cpu) && 1284 (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) || 1285 rcu_state.cbovld)) { 1286 WRITE_ONCE(*ruqp, true); 1287 resched_cpu(rdp->cpu); 1288 WRITE_ONCE(rdp->last_fqs_resched, jiffies); 1289 } 1290 1291 /* 1292 * If more than halfway to RCU CPU stall-warning time, invoke 1293 * resched_cpu() more frequently to try to loosen things up a bit. 1294 * Also check to see if the CPU is getting hammered with interrupts, 1295 * but only once per grace period, just to keep the IPIs down to 1296 * a dull roar. 
1297 */ 1298 if (time_after(jiffies, rcu_state.jiffies_resched)) { 1299 if (time_after(jiffies, 1300 READ_ONCE(rdp->last_fqs_resched) + jtsq)) { 1301 resched_cpu(rdp->cpu); 1302 WRITE_ONCE(rdp->last_fqs_resched, jiffies); 1303 } 1304 if (IS_ENABLED(CONFIG_IRQ_WORK) && 1305 !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && 1306 (rnp->ffmask & rdp->grpmask)) { 1307 init_irq_work(&rdp->rcu_iw, rcu_iw_handler); 1308 atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ); 1309 rdp->rcu_iw_pending = true; 1310 rdp->rcu_iw_gp_seq = rnp->gp_seq; 1311 irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); 1312 } 1313 } 1314 1315 return 0; 1316 } 1317 1318 /* Trace-event wrapper function for trace_rcu_future_grace_period. */ 1319 static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, 1320 unsigned long gp_seq_req, const char *s) 1321 { 1322 trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), 1323 gp_seq_req, rnp->level, 1324 rnp->grplo, rnp->grphi, s); 1325 } 1326 1327 /* 1328 * rcu_start_this_gp - Request the start of a particular grace period 1329 * @rnp_start: The leaf node of the CPU from which to start. 1330 * @rdp: The rcu_data corresponding to the CPU from which to start. 1331 * @gp_seq_req: The gp_seq of the grace period to start. 1332 * 1333 * Start the specified grace period, as needed to handle newly arrived 1334 * callbacks. The required future grace periods are recorded in each 1335 * rcu_node structure's ->gp_seq_needed field. Returns true if there 1336 * is reason to awaken the grace-period kthread. 1337 * 1338 * The caller must hold the specified rcu_node structure's ->lock, which 1339 * is why the caller is responsible for waking the grace-period kthread. 1340 * 1341 * Returns true if the GP thread needs to be awakened else false. 1342 */ 1343 static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, 1344 unsigned long gp_seq_req) 1345 { 1346 bool ret = false; 1347 struct rcu_node *rnp; 1348 1349 /* 1350 * Use funnel locking to either acquire the root rcu_node 1351 * structure's lock or bail out if the need for this grace period 1352 * has already been recorded -- or if that grace period has in 1353 * fact already started. If there is already a grace period in 1354 * progress in a non-leaf node, no recording is needed because the 1355 * end of the grace period will scan the leaf rcu_node structures. 1356 * Note that rnp_start->lock must not be released. 1357 */ 1358 raw_lockdep_assert_held_rcu_node(rnp_start); 1359 trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, TPS("Startleaf")); 1360 for (rnp = rnp_start; 1; rnp = rnp->parent) { 1361 if (rnp != rnp_start) 1362 raw_spin_lock_rcu_node(rnp); 1363 if (ULONG_CMP_GE(rnp->gp_seq_needed, gp_seq_req) || 1364 rcu_seq_started(&rnp->gp_seq, gp_seq_req) || 1365 (rnp != rnp_start && 1366 rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))) { 1367 trace_rcu_this_gp(rnp, rdp, gp_seq_req, 1368 TPS("Prestarted")); 1369 goto unlock_out; 1370 } 1371 WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req); 1372 if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) { 1373 /* 1374 * We just marked the leaf or internal node, and a 1375 * grace period is in progress, which means that 1376 * rcu_gp_cleanup() will see the marking. Bail to 1377 * reduce contention. 1378 */ 1379 trace_rcu_this_gp(rnp_start, rdp, gp_seq_req, 1380 TPS("Startedleaf")); 1381 goto unlock_out; 1382 } 1383 if (rnp != rnp_start && rnp->parent != NULL) 1384 raw_spin_unlock_rcu_node(rnp); 1385 if (!rnp->parent) 1386 break; /* At root, and perhaps also leaf. 
*/ 1387 } 1388 1389 /* If GP already in progress, just leave, otherwise start one. */ 1390 if (rcu_gp_in_progress()) { 1391 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedleafroot")); 1392 goto unlock_out; 1393 } 1394 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot")); 1395 WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT); 1396 WRITE_ONCE(rcu_state.gp_req_activity, jiffies); 1397 if (!READ_ONCE(rcu_state.gp_kthread)) { 1398 trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); 1399 goto unlock_out; 1400 } 1401 trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq")); 1402 ret = true; /* Caller must wake GP kthread. */ 1403 unlock_out: 1404 /* Push furthest requested GP to leaf node and rcu_data structure. */ 1405 if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) { 1406 WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed); 1407 WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); 1408 } 1409 if (rnp != rnp_start) 1410 raw_spin_unlock_rcu_node(rnp); 1411 return ret; 1412 } 1413 1414 /* 1415 * Clean up any old requests for the just-ended grace period. Also return 1416 * whether any additional grace periods have been requested. 1417 */ 1418 static bool rcu_future_gp_cleanup(struct rcu_node *rnp) 1419 { 1420 bool needmore; 1421 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 1422 1423 needmore = ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed); 1424 if (!needmore) 1425 rnp->gp_seq_needed = rnp->gp_seq; /* Avoid counter wrap. */ 1426 trace_rcu_this_gp(rnp, rdp, rnp->gp_seq, 1427 needmore ? TPS("CleanupMore") : TPS("Cleanup")); 1428 return needmore; 1429 } 1430 1431 /* 1432 * Awaken the grace-period kthread. Don't do a self-awaken (unless in an 1433 * interrupt or softirq handler, in which case we just might immediately 1434 * sleep upon return, resulting in a grace-period hang), and don't bother 1435 * awakening when there is nothing for the grace-period kthread to do 1436 * (as in several CPUs raced to awaken, we lost), and finally don't try 1437 * to awaken a kthread that has not yet been created. If all those checks 1438 * are passed, track some debug information and awaken. 1439 * 1440 * So why do the self-wakeup when in an interrupt or softirq handler 1441 * in the grace-period kthread's context? Because the kthread might have 1442 * been interrupted just as it was going to sleep, and just after the final 1443 * pre-sleep check of the awaken condition. In this case, a wakeup really 1444 * is required, and is therefore supplied. 1445 */ 1446 static void rcu_gp_kthread_wake(void) 1447 { 1448 struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); 1449 1450 if ((current == t && !in_irq() && !in_serving_softirq()) || 1451 !READ_ONCE(rcu_state.gp_flags) || !t) 1452 return; 1453 WRITE_ONCE(rcu_state.gp_wake_time, jiffies); 1454 WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); 1455 swake_up_one(&rcu_state.gp_wq); 1456 } 1457 1458 /* 1459 * If there is room, assign a ->gp_seq number to any callbacks on this 1460 * CPU that have not already been assigned. Also accelerate any callbacks 1461 * that were previously assigned a ->gp_seq number that has since proven 1462 * to be too conservative, which can happen if callbacks get assigned a 1463 * ->gp_seq number while RCU is idle, but with reference to a non-root 1464 * rcu_node structure. This function is idempotent, so it does not hurt 1465 * to call it repeatedly. Returns an flag saying that we should awaken 1466 * the RCU grace-period kthread. 
1467 * 1468 * The caller must hold rnp->lock with interrupts disabled. 1469 */ 1470 static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) 1471 { 1472 unsigned long gp_seq_req; 1473 bool ret = false; 1474 1475 rcu_lockdep_assert_cblist_protected(rdp); 1476 raw_lockdep_assert_held_rcu_node(rnp); 1477 1478 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ 1479 if (!rcu_segcblist_pend_cbs(&rdp->cblist)) 1480 return false; 1481 1482 /* 1483 * Callbacks are often registered with incomplete grace-period 1484 * information. Something about the fact that getting exact 1485 * information requires acquiring a global lock... RCU therefore 1486 * makes a conservative estimate of the grace period number at which 1487 * a given callback will become ready to invoke. The following 1488 * code checks this estimate and improves it when possible, thus 1489 * accelerating callback invocation to an earlier grace-period 1490 * number. 1491 */ 1492 gp_seq_req = rcu_seq_snap(&rcu_state.gp_seq); 1493 if (rcu_segcblist_accelerate(&rdp->cblist, gp_seq_req)) 1494 ret = rcu_start_this_gp(rnp, rdp, gp_seq_req); 1495 1496 /* Trace depending on how much we were able to accelerate. */ 1497 if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) 1498 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB")); 1499 else 1500 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB")); 1501 return ret; 1502 } 1503 1504 /* 1505 * Similar to rcu_accelerate_cbs(), but does not require that the leaf 1506 * rcu_node structure's ->lock be held. It consults the cached value 1507 * of ->gp_seq_needed in the rcu_data structure, and if that indicates 1508 * that a new grace-period request be made, invokes rcu_accelerate_cbs() 1509 * while holding the leaf rcu_node structure's ->lock. 1510 */ 1511 static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, 1512 struct rcu_data *rdp) 1513 { 1514 unsigned long c; 1515 bool needwake; 1516 1517 rcu_lockdep_assert_cblist_protected(rdp); 1518 c = rcu_seq_snap(&rcu_state.gp_seq); 1519 if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) { 1520 /* Old request still live, so mark recent callbacks. */ 1521 (void)rcu_segcblist_accelerate(&rdp->cblist, c); 1522 return; 1523 } 1524 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 1525 needwake = rcu_accelerate_cbs(rnp, rdp); 1526 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ 1527 if (needwake) 1528 rcu_gp_kthread_wake(); 1529 } 1530 1531 /* 1532 * Move any callbacks whose grace period has completed to the 1533 * RCU_DONE_TAIL sublist, then compact the remaining sublists and 1534 * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL 1535 * sublist. This function is idempotent, so it does not hurt to 1536 * invoke it repeatedly. As long as it is not invoked -too- often... 1537 * Returns true if the RCU grace-period kthread needs to be awakened. 1538 * 1539 * The caller must hold rnp->lock with interrupts disabled. 1540 */ 1541 static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) 1542 { 1543 rcu_lockdep_assert_cblist_protected(rdp); 1544 raw_lockdep_assert_held_rcu_node(rnp); 1545 1546 /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ 1547 if (!rcu_segcblist_pend_cbs(&rdp->cblist)) 1548 return false; 1549 1550 /* 1551 * Find all callbacks whose ->gp_seq numbers indicate that they 1552 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. 
1553 */ 1554 rcu_segcblist_advance(&rdp->cblist, rnp->gp_seq); 1555 1556 /* Classify any remaining callbacks. */ 1557 return rcu_accelerate_cbs(rnp, rdp); 1558 } 1559 1560 /* 1561 * Move and classify callbacks, but only if doing so won't require 1562 * that the RCU grace-period kthread be awakened. 1563 */ 1564 static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, 1565 struct rcu_data *rdp) 1566 { 1567 rcu_lockdep_assert_cblist_protected(rdp); 1568 if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || 1569 !raw_spin_trylock_rcu_node(rnp)) 1570 return; 1571 WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp)); 1572 raw_spin_unlock_rcu_node(rnp); 1573 } 1574 1575 /* 1576 * Update CPU-local rcu_data state to record the beginnings and ends of 1577 * grace periods. The caller must hold the ->lock of the leaf rcu_node 1578 * structure corresponding to the current CPU, and must have irqs disabled. 1579 * Returns true if the grace-period kthread needs to be awakened. 1580 */ 1581 static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) 1582 { 1583 bool ret = false; 1584 bool need_qs; 1585 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && 1586 rcu_segcblist_is_offloaded(&rdp->cblist); 1587 1588 raw_lockdep_assert_held_rcu_node(rnp); 1589 1590 if (rdp->gp_seq == rnp->gp_seq) 1591 return false; /* Nothing to do. */ 1592 1593 /* Handle the ends of any preceding grace periods first. */ 1594 if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) || 1595 unlikely(READ_ONCE(rdp->gpwrap))) { 1596 if (!offloaded) 1597 ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */ 1598 rdp->core_needs_qs = false; 1599 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend")); 1600 } else { 1601 if (!offloaded) 1602 ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */ 1603 if (rdp->core_needs_qs) 1604 rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); 1605 } 1606 1607 /* Now handle the beginnings of any new-to-this-CPU grace periods. */ 1608 if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) || 1609 unlikely(READ_ONCE(rdp->gpwrap))) { 1610 /* 1611 * If the current grace period is waiting for this CPU, 1612 * set up to detect a quiescent state, otherwise don't 1613 * go looking for one. 1614 */ 1615 trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart")); 1616 need_qs = !!(rnp->qsmask & rdp->grpmask); 1617 rdp->cpu_no_qs.b.norm = need_qs; 1618 rdp->core_needs_qs = need_qs; 1619 zero_cpu_stall_ticks(rdp); 1620 } 1621 rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ 1622 if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) 1623 WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed); 1624 WRITE_ONCE(rdp->gpwrap, false); 1625 rcu_gpnum_ovf(rnp, rdp); 1626 return ret; 1627 } 1628 1629 static void note_gp_changes(struct rcu_data *rdp) 1630 { 1631 unsigned long flags; 1632 bool needwake; 1633 struct rcu_node *rnp; 1634 1635 local_irq_save(flags); 1636 rnp = rdp->mynode; 1637 if ((rdp->gp_seq == rcu_seq_current(&rnp->gp_seq) && 1638 !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ 1639 !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. 
*/ 1640 local_irq_restore(flags); 1641 return; 1642 } 1643 needwake = __note_gp_changes(rnp, rdp); 1644 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1645 if (needwake) 1646 rcu_gp_kthread_wake(); 1647 } 1648 1649 static void rcu_gp_slow(int delay) 1650 { 1651 if (delay > 0 && 1652 !(rcu_seq_ctr(rcu_state.gp_seq) % 1653 (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) 1654 schedule_timeout_idle(delay); 1655 } 1656 1657 static unsigned long sleep_duration; 1658 1659 /* Allow rcutorture to stall the grace-period kthread. */ 1660 void rcu_gp_set_torture_wait(int duration) 1661 { 1662 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0) 1663 WRITE_ONCE(sleep_duration, duration); 1664 } 1665 EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait); 1666 1667 /* Actually implement the aforementioned wait. */ 1668 static void rcu_gp_torture_wait(void) 1669 { 1670 unsigned long duration; 1671 1672 if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST)) 1673 return; 1674 duration = xchg(&sleep_duration, 0UL); 1675 if (duration > 0) { 1676 pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); 1677 schedule_timeout_idle(duration); 1678 pr_alert("%s: Wait complete\n", __func__); 1679 } 1680 } 1681 1682 /* 1683 * Initialize a new grace period. Return false if no grace period required. 1684 */ 1685 static bool rcu_gp_init(void) 1686 { 1687 unsigned long flags; 1688 unsigned long oldmask; 1689 unsigned long mask; 1690 struct rcu_data *rdp; 1691 struct rcu_node *rnp = rcu_get_root(); 1692 1693 WRITE_ONCE(rcu_state.gp_activity, jiffies); 1694 raw_spin_lock_irq_rcu_node(rnp); 1695 if (!READ_ONCE(rcu_state.gp_flags)) { 1696 /* Spurious wakeup, tell caller to go back to sleep. */ 1697 raw_spin_unlock_irq_rcu_node(rnp); 1698 return false; 1699 } 1700 WRITE_ONCE(rcu_state.gp_flags, 0); /* Clear all flags: New GP. */ 1701 1702 if (WARN_ON_ONCE(rcu_gp_in_progress())) { 1703 /* 1704 * Grace period already in progress, don't start another. 1705 * Not supposed to be able to happen. 1706 */ 1707 raw_spin_unlock_irq_rcu_node(rnp); 1708 return false; 1709 } 1710 1711 /* Advance to a new grace period and initialize state. */ 1712 record_gp_stall_check_time(); 1713 /* Record GP times before starting GP, hence rcu_seq_start(). */ 1714 rcu_seq_start(&rcu_state.gp_seq); 1715 ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); 1716 trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); 1717 raw_spin_unlock_irq_rcu_node(rnp); 1718 1719 /* 1720 * Apply per-leaf buffered online and offline operations to the 1721 * rcu_node tree. Note that this new grace period need not wait 1722 * for subsequent online CPUs, and that quiescent-state forcing 1723 * will handle subsequent offline CPUs. 1724 */ 1725 rcu_state.gp_state = RCU_GP_ONOFF; 1726 rcu_for_each_leaf_node(rnp) { 1727 raw_spin_lock(&rcu_state.ofl_lock); 1728 raw_spin_lock_irq_rcu_node(rnp); 1729 if (rnp->qsmaskinit == rnp->qsmaskinitnext && 1730 !rnp->wait_blkd_tasks) { 1731 /* Nothing to do on this leaf rcu_node structure. */ 1732 raw_spin_unlock_irq_rcu_node(rnp); 1733 raw_spin_unlock(&rcu_state.ofl_lock); 1734 continue; 1735 } 1736 1737 /* Record old state, apply changes to ->qsmaskinit field. */ 1738 oldmask = rnp->qsmaskinit; 1739 rnp->qsmaskinit = rnp->qsmaskinitnext; 1740 1741 /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ 1742 if (!oldmask != !rnp->qsmaskinit) { 1743 if (!oldmask) { /* First online CPU for rcu_node. */ 1744 if (!rnp->wait_blkd_tasks) /* Ever offline? 
*/ 1745 rcu_init_new_rnp(rnp); 1746 } else if (rcu_preempt_has_tasks(rnp)) { 1747 rnp->wait_blkd_tasks = true; /* blocked tasks */ 1748 } else { /* Last offline CPU and can propagate. */ 1749 rcu_cleanup_dead_rnp(rnp); 1750 } 1751 } 1752 1753 /* 1754 * If all waited-on tasks from prior grace period are 1755 * done, and if all this rcu_node structure's CPUs are 1756 * still offline, propagate up the rcu_node tree and 1757 * clear ->wait_blkd_tasks. Otherwise, if one of this 1758 * rcu_node structure's CPUs has since come back online, 1759 * simply clear ->wait_blkd_tasks. 1760 */ 1761 if (rnp->wait_blkd_tasks && 1762 (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) { 1763 rnp->wait_blkd_tasks = false; 1764 if (!rnp->qsmaskinit) 1765 rcu_cleanup_dead_rnp(rnp); 1766 } 1767 1768 raw_spin_unlock_irq_rcu_node(rnp); 1769 raw_spin_unlock(&rcu_state.ofl_lock); 1770 } 1771 rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */ 1772 1773 /* 1774 * Set the quiescent-state-needed bits in all the rcu_node 1775 * structures for all currently online CPUs in breadth-first 1776 * order, starting from the root rcu_node structure, relying on the 1777 * layout of the tree within the rcu_state.node[] array. Note that 1778 * other CPUs will access only the leaves of the hierarchy, thus 1779 * seeing that no grace period is in progress, at least until the 1780 * corresponding leaf node has been initialized. 1781 * 1782 * The grace period cannot complete until the initialization 1783 * process finishes, because this kthread handles both. 1784 */ 1785 rcu_state.gp_state = RCU_GP_INIT; 1786 rcu_for_each_node_breadth_first(rnp) { 1787 rcu_gp_slow(gp_init_delay); 1788 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1789 rdp = this_cpu_ptr(&rcu_data); 1790 rcu_preempt_check_blocked_tasks(rnp); 1791 rnp->qsmask = rnp->qsmaskinit; 1792 WRITE_ONCE(rnp->gp_seq, rcu_state.gp_seq); 1793 if (rnp == rdp->mynode) 1794 (void)__note_gp_changes(rnp, rdp); 1795 rcu_preempt_boost_start_gp(rnp); 1796 trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq, 1797 rnp->level, rnp->grplo, 1798 rnp->grphi, rnp->qsmask); 1799 /* Quiescent states for tasks on any now-offline CPUs. */ 1800 mask = rnp->qsmask & ~rnp->qsmaskinitnext; 1801 rnp->rcu_gp_init_mask = mask; 1802 if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) 1803 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); 1804 else 1805 raw_spin_unlock_irq_rcu_node(rnp); 1806 cond_resched_tasks_rcu_qs(); 1807 WRITE_ONCE(rcu_state.gp_activity, jiffies); 1808 } 1809 1810 return true; 1811 } 1812 1813 /* 1814 * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state 1815 * time. 1816 */ 1817 static bool rcu_gp_fqs_check_wake(int *gfp) 1818 { 1819 struct rcu_node *rnp = rcu_get_root(); 1820 1821 // If under overload conditions, force an immediate FQS scan. 1822 if (*gfp & RCU_GP_FLAG_OVLD) 1823 return true; 1824 1825 // Someone like call_rcu() requested a force-quiescent-state scan. 1826 *gfp = READ_ONCE(rcu_state.gp_flags); 1827 if (*gfp & RCU_GP_FLAG_FQS) 1828 return true; 1829 1830 // The current grace period has completed. 1831 if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) 1832 return true; 1833 1834 return false; 1835 } 1836 1837 /* 1838 * Do one round of quiescent-state forcing. 1839 */ 1840 static void rcu_gp_fqs(bool first_time) 1841 { 1842 struct rcu_node *rnp = rcu_get_root(); 1843 1844 WRITE_ONCE(rcu_state.gp_activity, jiffies); 1845 rcu_state.n_force_qs++; 1846 if (first_time) { 1847 /* Collect dyntick-idle snapshots. 
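                 * (dyntick_save_progress_counter() records each CPU's current
                 * ->dynticks value so that the next scan can tell whether that
                 * CPU has since passed through an idle quiescent state.)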
*/ 1848 force_qs_rnp(dyntick_save_progress_counter); 1849 } else { 1850 /* Handle dyntick-idle and offline CPUs. */ 1851 force_qs_rnp(rcu_implicit_dynticks_qs); 1852 } 1853 /* Clear flag to prevent immediate re-entry. */ 1854 if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) { 1855 raw_spin_lock_irq_rcu_node(rnp); 1856 WRITE_ONCE(rcu_state.gp_flags, 1857 READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS); 1858 raw_spin_unlock_irq_rcu_node(rnp); 1859 } 1860 } 1861 1862 /* 1863 * Loop doing repeated quiescent-state forcing until the grace period ends. 1864 */ 1865 static void rcu_gp_fqs_loop(void) 1866 { 1867 bool first_gp_fqs; 1868 int gf = 0; 1869 unsigned long j; 1870 int ret; 1871 struct rcu_node *rnp = rcu_get_root(); 1872 1873 first_gp_fqs = true; 1874 j = READ_ONCE(jiffies_till_first_fqs); 1875 if (rcu_state.cbovld) 1876 gf = RCU_GP_FLAG_OVLD; 1877 ret = 0; 1878 for (;;) { 1879 if (!ret) { 1880 rcu_state.jiffies_force_qs = jiffies + j; 1881 WRITE_ONCE(rcu_state.jiffies_kick_kthreads, 1882 jiffies + (j ? 3 * j : 2)); 1883 } 1884 trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, 1885 TPS("fqswait")); 1886 rcu_state.gp_state = RCU_GP_WAIT_FQS; 1887 ret = swait_event_idle_timeout_exclusive( 1888 rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); 1889 rcu_gp_torture_wait(); 1890 rcu_state.gp_state = RCU_GP_DOING_FQS; 1891 /* Locking provides needed memory barriers. */ 1892 /* If grace period done, leave loop. */ 1893 if (!READ_ONCE(rnp->qsmask) && 1894 !rcu_preempt_blocked_readers_cgp(rnp)) 1895 break; 1896 /* If time for quiescent-state forcing, do it. */ 1897 if (!time_after(rcu_state.jiffies_force_qs, jiffies) || 1898 (gf & RCU_GP_FLAG_FQS)) { 1899 trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, 1900 TPS("fqsstart")); 1901 rcu_gp_fqs(first_gp_fqs); 1902 gf = 0; 1903 if (first_gp_fqs) { 1904 first_gp_fqs = false; 1905 gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : 0; 1906 } 1907 trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, 1908 TPS("fqsend")); 1909 cond_resched_tasks_rcu_qs(); 1910 WRITE_ONCE(rcu_state.gp_activity, jiffies); 1911 ret = 0; /* Force full wait till next FQS. */ 1912 j = READ_ONCE(jiffies_till_next_fqs); 1913 } else { 1914 /* Deal with stray signal. */ 1915 cond_resched_tasks_rcu_qs(); 1916 WRITE_ONCE(rcu_state.gp_activity, jiffies); 1917 WARN_ON(signal_pending(current)); 1918 trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, 1919 TPS("fqswaitsig")); 1920 ret = 1; /* Keep old FQS timing. */ 1921 j = jiffies; 1922 if (time_after(jiffies, rcu_state.jiffies_force_qs)) 1923 j = 1; 1924 else 1925 j = rcu_state.jiffies_force_qs - j; 1926 gf = 0; 1927 } 1928 } 1929 } 1930 1931 /* 1932 * Clean up after the old grace period. 1933 */ 1934 static void rcu_gp_cleanup(void) 1935 { 1936 int cpu; 1937 bool needgp = false; 1938 unsigned long gp_duration; 1939 unsigned long new_gp_seq; 1940 bool offloaded; 1941 struct rcu_data *rdp; 1942 struct rcu_node *rnp = rcu_get_root(); 1943 struct swait_queue_head *sq; 1944 1945 WRITE_ONCE(rcu_state.gp_activity, jiffies); 1946 raw_spin_lock_irq_rcu_node(rnp); 1947 rcu_state.gp_end = jiffies; 1948 gp_duration = rcu_state.gp_end - rcu_state.gp_start; 1949 if (gp_duration > rcu_state.gp_max) 1950 rcu_state.gp_max = gp_duration; 1951 1952 /* 1953 * We know the grace period is complete, but to everyone else 1954 * it appears to still be ongoing. But it is also the case 1955 * that to everyone else it looks like there is nothing that 1956 * they can do to advance the grace period. 
It is therefore 1957 * safe for us to drop the lock in order to mark the grace 1958 * period as completed in all of the rcu_node structures. 1959 */ 1960 raw_spin_unlock_irq_rcu_node(rnp); 1961 1962 /* 1963 * Propagate new ->gp_seq value to rcu_node structures so that 1964 * other CPUs don't have to wait until the start of the next grace 1965 * period to process their callbacks. This also avoids some nasty 1966 * RCU grace-period initialization races by forcing the end of 1967 * the current grace period to be completely recorded in all of 1968 * the rcu_node structures before the beginning of the next grace 1969 * period is recorded in any of the rcu_node structures. 1970 */ 1971 new_gp_seq = rcu_state.gp_seq; 1972 rcu_seq_end(&new_gp_seq); 1973 rcu_for_each_node_breadth_first(rnp) { 1974 raw_spin_lock_irq_rcu_node(rnp); 1975 if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp))) 1976 dump_blkd_tasks(rnp, 10); 1977 WARN_ON_ONCE(rnp->qsmask); 1978 WRITE_ONCE(rnp->gp_seq, new_gp_seq); 1979 rdp = this_cpu_ptr(&rcu_data); 1980 if (rnp == rdp->mynode) 1981 needgp = __note_gp_changes(rnp, rdp) || needgp; 1982 /* smp_mb() provided by prior unlock-lock pair. */ 1983 needgp = rcu_future_gp_cleanup(rnp) || needgp; 1984 // Reset overload indication for CPUs no longer overloaded 1985 if (rcu_is_leaf_node(rnp)) 1986 for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) { 1987 rdp = per_cpu_ptr(&rcu_data, cpu); 1988 check_cb_ovld_locked(rdp, rnp); 1989 } 1990 sq = rcu_nocb_gp_get(rnp); 1991 raw_spin_unlock_irq_rcu_node(rnp); 1992 rcu_nocb_gp_cleanup(sq); 1993 cond_resched_tasks_rcu_qs(); 1994 WRITE_ONCE(rcu_state.gp_activity, jiffies); 1995 rcu_gp_slow(gp_cleanup_delay); 1996 } 1997 rnp = rcu_get_root(); 1998 raw_spin_lock_irq_rcu_node(rnp); /* GP before ->gp_seq update. */ 1999 2000 /* Declare grace period done, trace first to use old GP number. */ 2001 trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); 2002 rcu_seq_end(&rcu_state.gp_seq); 2003 ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); 2004 rcu_state.gp_state = RCU_GP_IDLE; 2005 /* Check for GP requests since above loop. */ 2006 rdp = this_cpu_ptr(&rcu_data); 2007 if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { 2008 trace_rcu_this_gp(rnp, rdp, rnp->gp_seq_needed, 2009 TPS("CleanupMore")); 2010 needgp = true; 2011 } 2012 /* Advance CBs to reduce false positives below. */ 2013 offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && 2014 rcu_segcblist_is_offloaded(&rdp->cblist); 2015 if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { 2016 WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); 2017 WRITE_ONCE(rcu_state.gp_req_activity, jiffies); 2018 trace_rcu_grace_period(rcu_state.name, 2019 rcu_state.gp_seq, 2020 TPS("newreq")); 2021 } else { 2022 WRITE_ONCE(rcu_state.gp_flags, 2023 rcu_state.gp_flags & RCU_GP_FLAG_INIT); 2024 } 2025 raw_spin_unlock_irq_rcu_node(rnp); 2026 } 2027 2028 /* 2029 * Body of kthread that handles grace periods. 2030 */ 2031 static int __noreturn rcu_gp_kthread(void *unused) 2032 { 2033 rcu_bind_gp_kthread(); 2034 for (;;) { 2035 2036 /* Handle grace-period start. */ 2037 for (;;) { 2038 trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, 2039 TPS("reqwait")); 2040 rcu_state.gp_state = RCU_GP_WAIT_GPS; 2041 swait_event_idle_exclusive(rcu_state.gp_wq, 2042 READ_ONCE(rcu_state.gp_flags) & 2043 RCU_GP_FLAG_INIT); 2044 rcu_gp_torture_wait(); 2045 rcu_state.gp_state = RCU_GP_DONE_GPS; 2046 /* Locking provides needed memory barrier. 
*/ 2047 if (rcu_gp_init()) 2048 break; 2049 cond_resched_tasks_rcu_qs(); 2050 WRITE_ONCE(rcu_state.gp_activity, jiffies); 2051 WARN_ON(signal_pending(current)); 2052 trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, 2053 TPS("reqwaitsig")); 2054 } 2055 2056 /* Handle quiescent-state forcing. */ 2057 rcu_gp_fqs_loop(); 2058 2059 /* Handle grace-period end. */ 2060 rcu_state.gp_state = RCU_GP_CLEANUP; 2061 rcu_gp_cleanup(); 2062 rcu_state.gp_state = RCU_GP_CLEANED; 2063 } 2064 } 2065 2066 /* 2067 * Report a full set of quiescent states to the rcu_state data structure. 2068 * Invoke rcu_gp_kthread_wake() to awaken the grace-period kthread if 2069 * another grace period is required. Whether we wake the grace-period 2070 * kthread or it awakens itself for the next round of quiescent-state 2071 * forcing, that kthread will clean up after the just-completed grace 2072 * period. Note that the caller must hold rnp->lock, which is released 2073 * before return. 2074 */ 2075 static void rcu_report_qs_rsp(unsigned long flags) 2076 __releases(rcu_get_root()->lock) 2077 { 2078 raw_lockdep_assert_held_rcu_node(rcu_get_root()); 2079 WARN_ON_ONCE(!rcu_gp_in_progress()); 2080 WRITE_ONCE(rcu_state.gp_flags, 2081 READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS); 2082 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags); 2083 rcu_gp_kthread_wake(); 2084 } 2085 2086 /* 2087 * Similar to rcu_report_qs_rdp(), for which it is a helper function. 2088 * Allows quiescent states for a group of CPUs to be reported at one go 2089 * to the specified rcu_node structure, though all the CPUs in the group 2090 * must be represented by the same rcu_node structure (which need not be a 2091 * leaf rcu_node structure, though it often will be). The gps parameter 2092 * is the grace-period snapshot, which means that the quiescent states 2093 * are valid only if rnp->gp_seq is equal to gps. That structure's lock 2094 * must be held upon entry, and it is released before return. 2095 * 2096 * As a special case, if mask is zero, the bit-already-cleared check is 2097 * disabled. This allows propagating quiescent state due to resumed tasks 2098 * during grace-period initialization. 2099 */ 2100 static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, 2101 unsigned long gps, unsigned long flags) 2102 __releases(rnp->lock) 2103 { 2104 unsigned long oldmask = 0; 2105 struct rcu_node *rnp_c; 2106 2107 raw_lockdep_assert_held_rcu_node(rnp); 2108 2109 /* Walk up the rcu_node hierarchy. */ 2110 for (;;) { 2111 if ((!(rnp->qsmask & mask) && mask) || rnp->gp_seq != gps) { 2112 2113 /* 2114 * Our bit has already been cleared, or the 2115 * relevant grace period is already over, so done. 2116 */ 2117 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2118 return; 2119 } 2120 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ 2121 WARN_ON_ONCE(!rcu_is_leaf_node(rnp) && 2122 rcu_preempt_blocked_readers_cgp(rnp)); 2123 WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask); 2124 trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq, 2125 mask, rnp->qsmask, rnp->level, 2126 rnp->grplo, rnp->grphi, 2127 !!rnp->gp_tasks); 2128 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 2129 2130 /* Other bits still set at this level, so done. */ 2131 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2132 return; 2133 } 2134 rnp->completedqs = rnp->gp_seq; 2135 mask = rnp->grpmask; 2136 if (rnp->parent == NULL) { 2137 2138 /* No more levels. Exit loop holding root lock. 
*/ 2139 2140 break; 2141 } 2142 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2143 rnp_c = rnp; 2144 rnp = rnp->parent; 2145 raw_spin_lock_irqsave_rcu_node(rnp, flags); 2146 oldmask = READ_ONCE(rnp_c->qsmask); 2147 } 2148 2149 /* 2150 * Get here if we are the last CPU to pass through a quiescent 2151 * state for this grace period. Invoke rcu_report_qs_rsp() 2152 * to clean up and start the next grace period if one is needed. 2153 */ 2154 rcu_report_qs_rsp(flags); /* releases rnp->lock. */ 2155 } 2156 2157 /* 2158 * Record a quiescent state for all tasks that were previously queued 2159 * on the specified rcu_node structure and that were blocking the current 2160 * RCU grace period. The caller must hold the corresponding rnp->lock with 2161 * irqs disabled, and this lock is released upon return, but irqs remain 2162 * disabled. 2163 */ 2164 static void __maybe_unused 2165 rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 2166 __releases(rnp->lock) 2167 { 2168 unsigned long gps; 2169 unsigned long mask; 2170 struct rcu_node *rnp_p; 2171 2172 raw_lockdep_assert_held_rcu_node(rnp); 2173 if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) || 2174 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || 2175 rnp->qsmask != 0) { 2176 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2177 return; /* Still need more quiescent states! */ 2178 } 2179 2180 rnp->completedqs = rnp->gp_seq; 2181 rnp_p = rnp->parent; 2182 if (rnp_p == NULL) { 2183 /* 2184 * Only one rcu_node structure in the tree, so don't 2185 * try to report up to its nonexistent parent! 2186 */ 2187 rcu_report_qs_rsp(flags); 2188 return; 2189 } 2190 2191 /* Report up the rest of the hierarchy, tracking current ->gp_seq. */ 2192 gps = rnp->gp_seq; 2193 mask = rnp->grpmask; 2194 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ 2195 raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ 2196 rcu_report_qs_rnp(mask, rnp_p, gps, flags); 2197 } 2198 2199 /* 2200 * Record a quiescent state for the specified CPU to that CPU's rcu_data 2201 * structure. This must be called from the specified CPU. 2202 */ 2203 static void 2204 rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) 2205 { 2206 unsigned long flags; 2207 unsigned long mask; 2208 bool needwake = false; 2209 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && 2210 rcu_segcblist_is_offloaded(&rdp->cblist); 2211 struct rcu_node *rnp; 2212 2213 rnp = rdp->mynode; 2214 raw_spin_lock_irqsave_rcu_node(rnp, flags); 2215 if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || 2216 rdp->gpwrap) { 2217 2218 /* 2219 * The grace period in which this quiescent state was 2220 * recorded has ended, so don't report it upwards. 2221 * We will instead need a new quiescent state that lies 2222 * within the current grace period. 2223 */ 2224 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ 2225 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2226 return; 2227 } 2228 mask = rdp->grpmask; 2229 if (rdp->cpu == smp_processor_id()) 2230 rdp->core_needs_qs = false; 2231 if ((rnp->qsmask & mask) == 0) { 2232 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2233 } else { 2234 /* 2235 * This GP can't end until cpu checks in, so all of our 2236 * callbacks can be processed during the next GP. 
2237 */ 2238 if (!offloaded) 2239 needwake = rcu_accelerate_cbs(rnp, rdp); 2240 2241 rcu_disable_urgency_upon_qs(rdp); 2242 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); 2243 /* ^^^ Released rnp->lock */ 2244 if (needwake) 2245 rcu_gp_kthread_wake(); 2246 } 2247 } 2248 2249 /* 2250 * Check to see if there is a new grace period of which this CPU 2251 * is not yet aware, and if so, set up local rcu_data state for it. 2252 * Otherwise, see if this CPU has just passed through its first 2253 * quiescent state for this grace period, and record that fact if so. 2254 */ 2255 static void 2256 rcu_check_quiescent_state(struct rcu_data *rdp) 2257 { 2258 /* Check for grace-period ends and beginnings. */ 2259 note_gp_changes(rdp); 2260 2261 /* 2262 * Does this CPU still need to do its part for current grace period? 2263 * If no, return and let the other CPUs do their part as well. 2264 */ 2265 if (!rdp->core_needs_qs) 2266 return; 2267 2268 /* 2269 * Was there a quiescent state since the beginning of the grace 2270 * period? If no, then exit and wait for the next call. 2271 */ 2272 if (rdp->cpu_no_qs.b.norm) 2273 return; 2274 2275 /* 2276 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 2277 * judge of that). 2278 */ 2279 rcu_report_qs_rdp(rdp->cpu, rdp); 2280 } 2281 2282 /* 2283 * Near the end of the offline process. Trace the fact that this CPU 2284 * is going offline. 2285 */ 2286 int rcutree_dying_cpu(unsigned int cpu) 2287 { 2288 bool blkd; 2289 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 2290 struct rcu_node *rnp = rdp->mynode; 2291 2292 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 2293 return 0; 2294 2295 blkd = !!(rnp->qsmask & rdp->grpmask); 2296 trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), 2297 blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); 2298 return 0; 2299 } 2300 2301 /* 2302 * All CPUs for the specified rcu_node structure have gone offline, 2303 * and all tasks that were preempted within an RCU read-side critical 2304 * section while running on one of those CPUs have since exited their RCU 2305 * read-side critical section. Some other CPU is reporting this fact with 2306 * the specified rcu_node structure's ->lock held and interrupts disabled. 2307 * This function therefore goes up the tree of rcu_node structures, 2308 * clearing the corresponding bits in the ->qsmaskinit fields. Note that 2309 * the leaf rcu_node structure's ->qsmaskinit field has already been 2310 * updated. 2311 * 2312 * This function does check that the specified rcu_node structure has 2313 * all CPUs offline and no blocked tasks, so it is OK to invoke it 2314 * prematurely. That said, invoking it after the fact will cost you 2315 * a needless lock acquisition. So once it has done its work, don't 2316 * invoke it again. 2317 */ 2318 static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) 2319 { 2320 long mask; 2321 struct rcu_node *rnp = rnp_leaf; 2322 2323 raw_lockdep_assert_held_rcu_node(rnp_leaf); 2324 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || 2325 WARN_ON_ONCE(rnp_leaf->qsmaskinit) || 2326 WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf))) 2327 return; 2328 for (;;) { 2329 mask = rnp->grpmask; 2330 rnp = rnp->parent; 2331 if (!rnp) 2332 break; 2333 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 2334 rnp->qsmaskinit &= ~mask; 2335 /* Between grace periods, so better already be zero! */ 2336 WARN_ON_ONCE(rnp->qsmask); 2337 if (rnp->qsmaskinit) { 2338 raw_spin_unlock_rcu_node(rnp); 2339 /* irqs remain disabled. 
*/ 2340 return; 2341 } 2342 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ 2343 } 2344 } 2345 2346 /* 2347 * The CPU has been completely removed, and some other CPU is reporting 2348 * this fact from process context. Do the remainder of the cleanup. 2349 * There can only be one CPU hotplug operation at a time, so no need for 2350 * explicit locking. 2351 */ 2352 int rcutree_dead_cpu(unsigned int cpu) 2353 { 2354 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 2355 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 2356 2357 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 2358 return 0; 2359 2360 /* Adjust any no-longer-needed kthreads. */ 2361 rcu_boost_kthread_setaffinity(rnp, -1); 2362 /* Do any needed no-CB deferred wakeups from this CPU. */ 2363 do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu)); 2364 2365 // Stop-machine done, so allow nohz_full to disable tick. 2366 tick_dep_clear(TICK_DEP_BIT_RCU); 2367 return 0; 2368 } 2369 2370 /* 2371 * Invoke any RCU callbacks that have made it to the end of their grace 2372 * period. Thottle as specified by rdp->blimit. 2373 */ 2374 static void rcu_do_batch(struct rcu_data *rdp) 2375 { 2376 unsigned long flags; 2377 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && 2378 rcu_segcblist_is_offloaded(&rdp->cblist); 2379 struct rcu_head *rhp; 2380 struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); 2381 long bl, count; 2382 long pending, tlimit = 0; 2383 2384 /* If no callbacks are ready, just return. */ 2385 if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { 2386 trace_rcu_batch_start(rcu_state.name, 2387 rcu_segcblist_n_cbs(&rdp->cblist), 0); 2388 trace_rcu_batch_end(rcu_state.name, 0, 2389 !rcu_segcblist_empty(&rdp->cblist), 2390 need_resched(), is_idle_task(current), 2391 rcu_is_callbacks_kthread()); 2392 return; 2393 } 2394 2395 /* 2396 * Extract the list of ready callbacks, disabling to prevent 2397 * races with call_rcu() from interrupt handlers. Leave the 2398 * callback counts, as rcu_barrier() needs to be conservative. 2399 */ 2400 local_irq_save(flags); 2401 rcu_nocb_lock(rdp); 2402 WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); 2403 pending = rcu_segcblist_n_cbs(&rdp->cblist); 2404 bl = max(rdp->blimit, pending >> rcu_divisor); 2405 if (unlikely(bl > 100)) 2406 tlimit = local_clock() + rcu_resched_ns; 2407 trace_rcu_batch_start(rcu_state.name, 2408 rcu_segcblist_n_cbs(&rdp->cblist), bl); 2409 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); 2410 if (offloaded) 2411 rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); 2412 rcu_nocb_unlock_irqrestore(rdp, flags); 2413 2414 /* Invoke callbacks. */ 2415 tick_dep_set_task(current, TICK_DEP_BIT_RCU); 2416 rhp = rcu_cblist_dequeue(&rcl); 2417 for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { 2418 rcu_callback_t f; 2419 2420 debug_rcu_head_unqueue(rhp); 2421 2422 rcu_lock_acquire(&rcu_callback_map); 2423 trace_rcu_invoke_callback(rcu_state.name, rhp); 2424 2425 f = rhp->func; 2426 WRITE_ONCE(rhp->func, (rcu_callback_t)0L); 2427 f(rhp); 2428 2429 rcu_lock_release(&rcu_callback_map); 2430 2431 /* 2432 * Stop only if limit reached and CPU has something to do. 2433 * Note: The rcl structure counts down from zero. 2434 */ 2435 if (-rcl.len >= bl && !offloaded && 2436 (need_resched() || 2437 (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) 2438 break; 2439 if (unlikely(tlimit)) { 2440 /* only call local_clock() every 32 callbacks */ 2441 if (likely((-rcl.len & 31) || local_clock() < tlimit)) 2442 continue; 2443 /* Exceeded the time limit, so leave. 
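                         * (tlimit was set above to local_clock() + rcu_resched_ns,
                         * and applies only when the batch limit bl exceeds 100.)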
*/ 2444 break; 2445 } 2446 if (offloaded) { 2447 WARN_ON_ONCE(in_serving_softirq()); 2448 local_bh_enable(); 2449 lockdep_assert_irqs_enabled(); 2450 cond_resched_tasks_rcu_qs(); 2451 lockdep_assert_irqs_enabled(); 2452 local_bh_disable(); 2453 } 2454 } 2455 2456 local_irq_save(flags); 2457 rcu_nocb_lock(rdp); 2458 count = -rcl.len; 2459 rdp->n_cbs_invoked += count; 2460 trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), 2461 is_idle_task(current), rcu_is_callbacks_kthread()); 2462 2463 /* Update counts and requeue any remaining callbacks. */ 2464 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); 2465 smp_mb(); /* List handling before counting for rcu_barrier(). */ 2466 rcu_segcblist_insert_count(&rdp->cblist, &rcl); 2467 2468 /* Reinstate batch limit if we have worked down the excess. */ 2469 count = rcu_segcblist_n_cbs(&rdp->cblist); 2470 if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark) 2471 rdp->blimit = blimit; 2472 2473 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ 2474 if (count == 0 && rdp->qlen_last_fqs_check != 0) { 2475 rdp->qlen_last_fqs_check = 0; 2476 rdp->n_force_qs_snap = rcu_state.n_force_qs; 2477 } else if (count < rdp->qlen_last_fqs_check - qhimark) 2478 rdp->qlen_last_fqs_check = count; 2479 2480 /* 2481 * The following usually indicates a double call_rcu(). To track 2482 * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. 2483 */ 2484 WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist)); 2485 WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) && 2486 count != 0 && rcu_segcblist_empty(&rdp->cblist)); 2487 2488 rcu_nocb_unlock_irqrestore(rdp, flags); 2489 2490 /* Re-invoke RCU core processing if there are callbacks remaining. */ 2491 if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist)) 2492 invoke_rcu_core(); 2493 tick_dep_clear_task(current, TICK_DEP_BIT_RCU); 2494 } 2495 2496 /* 2497 * This function is invoked from each scheduling-clock interrupt, 2498 * and checks to see if this CPU is in a non-context-switch quiescent 2499 * state, for example, user mode or idle loop. It also schedules RCU 2500 * core processing. If the current grace period has gone on too long, 2501 * it will ask the scheduler to manufacture a context switch for the sole 2502 * purpose of providing a providing the needed quiescent state. 2503 */ 2504 void rcu_sched_clock_irq(int user) 2505 { 2506 trace_rcu_utilization(TPS("Start scheduler-tick")); 2507 raw_cpu_inc(rcu_data.ticks_this_gp); 2508 /* The load-acquire pairs with the store-release setting to true. */ 2509 if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { 2510 /* Idle and userspace execution already are quiescent states. */ 2511 if (!rcu_is_cpu_rrupt_from_idle() && !user) { 2512 set_tsk_need_resched(current); 2513 set_preempt_need_resched(); 2514 } 2515 __this_cpu_write(rcu_data.rcu_urgent_qs, false); 2516 } 2517 rcu_flavor_sched_clock_irq(user); 2518 if (rcu_pending(user)) 2519 invoke_rcu_core(); 2520 2521 trace_rcu_utilization(TPS("End scheduler-tick")); 2522 } 2523 2524 /* 2525 * Scan the leaf rcu_node structures. For each structure on which all 2526 * CPUs have reported a quiescent state and on which there are tasks 2527 * blocking the current grace period, initiate RCU priority boosting. 2528 * Otherwise, invoke the specified function to check dyntick state for 2529 * each CPU that has not yet reported a quiescent state. 
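 *
 * The function passed in is dyntick_save_progress_counter() on the first
 * scan of a given grace period and rcu_implicit_dynticks_qs() on later
 * scans; see rcu_gp_fqs() above.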
2530 */ 2531 static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) 2532 { 2533 int cpu; 2534 unsigned long flags; 2535 unsigned long mask; 2536 struct rcu_data *rdp; 2537 struct rcu_node *rnp; 2538 2539 rcu_state.cbovld = rcu_state.cbovldnext; 2540 rcu_state.cbovldnext = false; 2541 rcu_for_each_leaf_node(rnp) { 2542 cond_resched_tasks_rcu_qs(); 2543 mask = 0; 2544 raw_spin_lock_irqsave_rcu_node(rnp, flags); 2545 rcu_state.cbovldnext |= !!rnp->cbovldmask; 2546 if (rnp->qsmask == 0) { 2547 if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || 2548 rcu_preempt_blocked_readers_cgp(rnp)) { 2549 /* 2550 * No point in scanning bits because they 2551 * are all zero. But we might need to 2552 * priority-boost blocked readers. 2553 */ 2554 rcu_initiate_boost(rnp, flags); 2555 /* rcu_initiate_boost() releases rnp->lock */ 2556 continue; 2557 } 2558 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2559 continue; 2560 } 2561 for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) { 2562 rdp = per_cpu_ptr(&rcu_data, cpu); 2563 if (f(rdp)) { 2564 mask |= rdp->grpmask; 2565 rcu_disable_urgency_upon_qs(rdp); 2566 } 2567 } 2568 if (mask != 0) { 2569 /* Idle/offline CPUs, report (releases rnp->lock). */ 2570 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); 2571 } else { 2572 /* Nothing to do here, so just drop the lock. */ 2573 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2574 } 2575 } 2576 } 2577 2578 /* 2579 * Force quiescent states on reluctant CPUs, and also detect which 2580 * CPUs are in dyntick-idle mode. 2581 */ 2582 void rcu_force_quiescent_state(void) 2583 { 2584 unsigned long flags; 2585 bool ret; 2586 struct rcu_node *rnp; 2587 struct rcu_node *rnp_old = NULL; 2588 2589 /* Funnel through hierarchy to reduce memory contention. */ 2590 rnp = __this_cpu_read(rcu_data.mynode); 2591 for (; rnp != NULL; rnp = rnp->parent) { 2592 ret = (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) || 2593 !raw_spin_trylock(&rnp->fqslock); 2594 if (rnp_old != NULL) 2595 raw_spin_unlock(&rnp_old->fqslock); 2596 if (ret) 2597 return; 2598 rnp_old = rnp; 2599 } 2600 /* rnp_old == rcu_get_root(), rnp == NULL. */ 2601 2602 /* Reached the root of the rcu_node tree, acquire lock. */ 2603 raw_spin_lock_irqsave_rcu_node(rnp_old, flags); 2604 raw_spin_unlock(&rnp_old->fqslock); 2605 if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) { 2606 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); 2607 return; /* Someone beat us to it. */ 2608 } 2609 WRITE_ONCE(rcu_state.gp_flags, 2610 READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS); 2611 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); 2612 rcu_gp_kthread_wake(); 2613 } 2614 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 2615 2616 /* Perform RCU core processing work for the current CPU. */ 2617 static __latent_entropy void rcu_core(void) 2618 { 2619 unsigned long flags; 2620 struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); 2621 struct rcu_node *rnp = rdp->mynode; 2622 const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && 2623 rcu_segcblist_is_offloaded(&rdp->cblist); 2624 2625 if (cpu_is_offline(smp_processor_id())) 2626 return; 2627 trace_rcu_utilization(TPS("Start RCU core")); 2628 WARN_ON_ONCE(!rdp->beenonline); 2629 2630 /* Report any deferred quiescent states if preemption enabled. */ 2631 if (!(preempt_count() & PREEMPT_MASK)) { 2632 rcu_preempt_deferred_qs(current); 2633 } else if (rcu_preempt_need_deferred_qs(current)) { 2634 set_tsk_need_resched(current); 2635 set_preempt_need_resched(); 2636 } 2637 2638 /* Update RCU state based on any recent quiescent states. 
*/ 2639 rcu_check_quiescent_state(rdp); 2640 2641 /* No grace period and unregistered callbacks? */ 2642 if (!rcu_gp_in_progress() && 2643 rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) { 2644 local_irq_save(flags); 2645 if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) 2646 rcu_accelerate_cbs_unlocked(rnp, rdp); 2647 local_irq_restore(flags); 2648 } 2649 2650 rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); 2651 2652 /* If there are callbacks ready, invoke them. */ 2653 if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) && 2654 likely(READ_ONCE(rcu_scheduler_fully_active))) 2655 rcu_do_batch(rdp); 2656 2657 /* Do any needed deferred wakeups of rcuo kthreads. */ 2658 do_nocb_deferred_wakeup(rdp); 2659 trace_rcu_utilization(TPS("End RCU core")); 2660 } 2661 2662 static void rcu_core_si(struct softirq_action *h) 2663 { 2664 rcu_core(); 2665 } 2666 2667 static void rcu_wake_cond(struct task_struct *t, int status) 2668 { 2669 /* 2670 * If the thread is yielding, only wake it when this 2671 * is invoked from idle 2672 */ 2673 if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current))) 2674 wake_up_process(t); 2675 } 2676 2677 static void invoke_rcu_core_kthread(void) 2678 { 2679 struct task_struct *t; 2680 unsigned long flags; 2681 2682 local_irq_save(flags); 2683 __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); 2684 t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task); 2685 if (t != NULL && t != current) 2686 rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); 2687 local_irq_restore(flags); 2688 } 2689 2690 /* 2691 * Wake up this CPU's rcuc kthread to do RCU core processing. 2692 */ 2693 static void invoke_rcu_core(void) 2694 { 2695 if (!cpu_online(smp_processor_id())) 2696 return; 2697 if (use_softirq) 2698 raise_softirq(RCU_SOFTIRQ); 2699 else 2700 invoke_rcu_core_kthread(); 2701 } 2702 2703 static void rcu_cpu_kthread_park(unsigned int cpu) 2704 { 2705 per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; 2706 } 2707 2708 static int rcu_cpu_kthread_should_run(unsigned int cpu) 2709 { 2710 return __this_cpu_read(rcu_data.rcu_cpu_has_work); 2711 } 2712 2713 /* 2714 * Per-CPU kernel thread that invokes RCU callbacks. This replaces 2715 * the RCU softirq used in configurations of RCU that do not support RCU 2716 * priority boosting. 
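 *
 * The loop below makes up to ten passes over the pending work and then
 * briefly yields (RCU_KTHREAD_YIELDING) so that callback invocation does
 * not monopolize the CPU.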
2717 */ 2718 static void rcu_cpu_kthread(unsigned int cpu) 2719 { 2720 unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); 2721 char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); 2722 int spincnt; 2723 2724 trace_rcu_utilization(TPS("Start CPU kthread@rcu_run")); 2725 for (spincnt = 0; spincnt < 10; spincnt++) { 2726 local_bh_disable(); 2727 *statusp = RCU_KTHREAD_RUNNING; 2728 local_irq_disable(); 2729 work = *workp; 2730 *workp = 0; 2731 local_irq_enable(); 2732 if (work) 2733 rcu_core(); 2734 local_bh_enable(); 2735 if (*workp == 0) { 2736 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); 2737 *statusp = RCU_KTHREAD_WAITING; 2738 return; 2739 } 2740 } 2741 *statusp = RCU_KTHREAD_YIELDING; 2742 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); 2743 schedule_timeout_idle(2); 2744 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); 2745 *statusp = RCU_KTHREAD_WAITING; 2746 } 2747 2748 static struct smp_hotplug_thread rcu_cpu_thread_spec = { 2749 .store = &rcu_data.rcu_cpu_kthread_task, 2750 .thread_should_run = rcu_cpu_kthread_should_run, 2751 .thread_fn = rcu_cpu_kthread, 2752 .thread_comm = "rcuc/%u", 2753 .setup = rcu_cpu_kthread_setup, 2754 .park = rcu_cpu_kthread_park, 2755 }; 2756 2757 /* 2758 * Spawn per-CPU RCU core processing kthreads. 2759 */ 2760 static int __init rcu_spawn_core_kthreads(void) 2761 { 2762 int cpu; 2763 2764 for_each_possible_cpu(cpu) 2765 per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; 2766 if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq) 2767 return 0; 2768 WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), 2769 "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__); 2770 return 0; 2771 } 2772 early_initcall(rcu_spawn_core_kthreads); 2773 2774 /* 2775 * Handle any core-RCU processing required by a call_rcu() invocation. 2776 */ 2777 static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, 2778 unsigned long flags) 2779 { 2780 /* 2781 * If called from an extended quiescent state, invoke the RCU 2782 * core in order to force a re-evaluation of RCU's idleness. 2783 */ 2784 if (!rcu_is_watching()) 2785 invoke_rcu_core(); 2786 2787 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 2788 if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) 2789 return; 2790 2791 /* 2792 * Force the grace period if too many callbacks or too long waiting. 2793 * Enforce hysteresis, and don't invoke rcu_force_quiescent_state() 2794 * if some other CPU has recently done so. Also, don't bother 2795 * invoking rcu_force_quiescent_state() if the newly enqueued callback 2796 * is the only one waiting for a grace period to complete. 2797 */ 2798 if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > 2799 rdp->qlen_last_fqs_check + qhimark)) { 2800 2801 /* Are we ignoring a completed grace period? */ 2802 note_gp_changes(rdp); 2803 2804 /* Start a new grace period if one not already started. */ 2805 if (!rcu_gp_in_progress()) { 2806 rcu_accelerate_cbs_unlocked(rdp->mynode, rdp); 2807 } else { 2808 /* Give the grace period a kick. */ 2809 rdp->blimit = DEFAULT_MAX_RCU_BLIMIT; 2810 if (rcu_state.n_force_qs == rdp->n_force_qs_snap && 2811 rcu_segcblist_first_pend_cb(&rdp->cblist) != head) 2812 rcu_force_quiescent_state(); 2813 rdp->n_force_qs_snap = rcu_state.n_force_qs; 2814 rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); 2815 } 2816 } 2817 } 2818 2819 /* 2820 * RCU callback function to leak a callback. 
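 * It is installed by __call_rcu() below in place of the real callback
 * when a probable double call_rcu() is detected, so that the offending
 * rcu_head is leaked rather than freed twice.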
2821 */ 2822 static void rcu_leak_callback(struct rcu_head *rhp) 2823 { 2824 } 2825 2826 /* 2827 * Check and if necessary update the leaf rcu_node structure's 2828 * ->cbovldmask bit corresponding to the current CPU based on that CPU's 2829 * number of queued RCU callbacks. The caller must hold the leaf rcu_node 2830 * structure's ->lock. 2831 */ 2832 static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp) 2833 { 2834 raw_lockdep_assert_held_rcu_node(rnp); 2835 if (qovld_calc <= 0) 2836 return; // Early boot and wildcard value set. 2837 if (rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) 2838 WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask | rdp->grpmask); 2839 else 2840 WRITE_ONCE(rnp->cbovldmask, rnp->cbovldmask & ~rdp->grpmask); 2841 } 2842 2843 /* 2844 * Check and if necessary update the leaf rcu_node structure's 2845 * ->cbovldmask bit corresponding to the current CPU based on that CPU's 2846 * number of queued RCU callbacks. No locks need be held, but the 2847 * caller must have disabled interrupts. 2848 * 2849 * Note that this function ignores the possibility that there are a lot 2850 * of callbacks all of which have already seen the end of their respective 2851 * grace periods. This omission is due to the need for no-CBs CPUs to 2852 * be holding ->nocb_lock to do this check, which is too heavy for a 2853 * common-case operation. 2854 */ 2855 static void check_cb_ovld(struct rcu_data *rdp) 2856 { 2857 struct rcu_node *const rnp = rdp->mynode; 2858 2859 if (qovld_calc <= 0 || 2860 ((rcu_segcblist_n_cbs(&rdp->cblist) >= qovld_calc) == 2861 !!(READ_ONCE(rnp->cbovldmask) & rdp->grpmask))) 2862 return; // Early boot wildcard value or already set correctly. 2863 raw_spin_lock_rcu_node(rnp); 2864 check_cb_ovld_locked(rdp, rnp); 2865 raw_spin_unlock_rcu_node(rnp); 2866 } 2867 2868 /* Helper function for call_rcu() and friends. */ 2869 static void 2870 __call_rcu(struct rcu_head *head, rcu_callback_t func) 2871 { 2872 unsigned long flags; 2873 struct rcu_data *rdp; 2874 bool was_alldone; 2875 2876 /* Misaligned rcu_head! */ 2877 WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); 2878 2879 if (debug_rcu_head_queue(head)) { 2880 /* 2881 * Probable double call_rcu(), so leak the callback. 2882 * Use rcu:rcu_callback trace event to find the previous 2883 * time callback was passed to __call_rcu(). 2884 */ 2885 WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", 2886 head, head->func); 2887 WRITE_ONCE(head->func, rcu_leak_callback); 2888 return; 2889 } 2890 head->func = func; 2891 head->next = NULL; 2892 local_irq_save(flags); 2893 rdp = this_cpu_ptr(&rcu_data); 2894 2895 /* Add the callback to our list. */ 2896 if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist))) { 2897 // This can trigger due to call_rcu() from offline CPU: 2898 WARN_ON_ONCE(rcu_scheduler_active != RCU_SCHEDULER_INACTIVE); 2899 WARN_ON_ONCE(!rcu_is_watching()); 2900 // Very early boot, before rcu_init(). Initialize if needed 2901 // and then drop through to queue the callback. 2902 if (rcu_segcblist_empty(&rdp->cblist)) 2903 rcu_segcblist_init(&rdp->cblist); 2904 } 2905 2906 check_cb_ovld(rdp); 2907 if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) 2908 return; // Enqueued onto ->nocb_bypass, so just leave. 2909 // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. 
2910 rcu_segcblist_enqueue(&rdp->cblist, head); 2911 if (__is_kvfree_rcu_offset((unsigned long)func)) 2912 trace_rcu_kvfree_callback(rcu_state.name, head, 2913 (unsigned long)func, 2914 rcu_segcblist_n_cbs(&rdp->cblist)); 2915 else 2916 trace_rcu_callback(rcu_state.name, head, 2917 rcu_segcblist_n_cbs(&rdp->cblist)); 2918 2919 /* Go handle any RCU core processing required. */ 2920 if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && 2921 unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { 2922 __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ 2923 } else { 2924 __call_rcu_core(rdp, head, flags); 2925 local_irq_restore(flags); 2926 } 2927 } 2928 2929 /** 2930 * call_rcu() - Queue an RCU callback for invocation after a grace period. 2931 * @head: structure to be used for queueing the RCU updates. 2932 * @func: actual callback function to be invoked after the grace period 2933 * 2934 * The callback function will be invoked some time after a full grace 2935 * period elapses, in other words after all pre-existing RCU read-side 2936 * critical sections have completed. However, the callback function 2937 * might well execute concurrently with RCU read-side critical sections 2938 * that started after call_rcu() was invoked. RCU read-side critical 2939 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), and 2940 * may be nested. In addition, regions of code across which interrupts, 2941 * preemption, or softirqs have been disabled also serve as RCU read-side 2942 * critical sections. This includes hardware interrupt handlers, softirq 2943 * handlers, and NMI handlers. 2944 * 2945 * Note that all CPUs must agree that the grace period extended beyond 2946 * all pre-existing RCU read-side critical section. On systems with more 2947 * than one CPU, this means that when "func()" is invoked, each CPU is 2948 * guaranteed to have executed a full memory barrier since the end of its 2949 * last RCU read-side critical section whose beginning preceded the call 2950 * to call_rcu(). It also means that each CPU executing an RCU read-side 2951 * critical section that continues beyond the start of "func()" must have 2952 * executed a memory barrier after the call_rcu() but before the beginning 2953 * of that RCU read-side critical section. Note that these guarantees 2954 * include CPUs that are offline, idle, or executing in user mode, as 2955 * well as CPUs that are executing in the kernel. 2956 * 2957 * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the 2958 * resulting RCU callback function "func()", then both CPU A and CPU B are 2959 * guaranteed to execute a full memory barrier during the time interval 2960 * between the call to call_rcu() and the invocation of "func()" -- even 2961 * if CPU A and CPU B are the same CPU (but again only if the system has 2962 * more than one CPU). 2963 */ 2964 void call_rcu(struct rcu_head *head, rcu_callback_t func) 2965 { 2966 __call_rcu(head, func); 2967 } 2968 EXPORT_SYMBOL_GPL(call_rcu); 2969 2970 2971 /* Maximum number of jiffies to wait before draining a batch. 
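 * (HZ / 50 jiffies is roughly 20 milliseconds of wall-clock time,
 * independent of the configured HZ value.)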
*/ 2972 #define KFREE_DRAIN_JIFFIES (HZ / 50) 2973 #define KFREE_N_BATCHES 2 2974 #define FREE_N_CHANNELS 2 2975 2976 /** 2977 * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers 2978 * @nr_records: Number of active pointers in the array 2979 * @next: Next bulk object in the block chain 2980 * @records: Array of the kvfree_rcu() pointers 2981 */ 2982 struct kvfree_rcu_bulk_data { 2983 unsigned long nr_records; 2984 struct kvfree_rcu_bulk_data *next; 2985 void *records[]; 2986 }; 2987 2988 /* 2989 * This macro defines how many entries the "records" array 2990 * will contain. It is based on the fact that the size of 2991 * kvfree_rcu_bulk_data structure becomes exactly one page. 2992 */ 2993 #define KVFREE_BULK_MAX_ENTR \ 2994 ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *)) 2995 2996 /** 2997 * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests 2998 * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period 2999 * @head_free: List of kfree_rcu() objects waiting for a grace period 3000 * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period 3001 * @krcp: Pointer to @kfree_rcu_cpu structure 3002 */ 3003 3004 struct kfree_rcu_cpu_work { 3005 struct rcu_work rcu_work; 3006 struct rcu_head *head_free; 3007 struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS]; 3008 struct kfree_rcu_cpu *krcp; 3009 }; 3010 3011 /** 3012 * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period 3013 * @head: List of kfree_rcu() objects not yet waiting for a grace period 3014 * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period 3015 * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period 3016 * @lock: Synchronize access to this structure 3017 * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES 3018 * @monitor_todo: Tracks whether a @monitor_work delayed work is pending 3019 * @initialized: The @rcu_work fields have been initialized 3020 * @count: Number of objects for which GP not started 3021 * 3022 * This is a per-CPU structure. The reason that it is not included in 3023 * the rcu_data structure is to permit this code to be extracted from 3024 * the RCU files. Such extraction could allow further optimization of 3025 * the interactions with the slab allocators. 3026 */ 3027 struct kfree_rcu_cpu { 3028 struct rcu_head *head; 3029 struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS]; 3030 struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; 3031 raw_spinlock_t lock; 3032 struct delayed_work monitor_work; 3033 bool monitor_todo; 3034 bool initialized; 3035 int count; 3036 3037 /* 3038 * A simple cache list that contains objects for 3039 * reuse purpose. In order to save some per-cpu 3040 * space the list is singular. Even though it is 3041 * lockless an access has to be protected by the 3042 * per-cpu lock. 
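         *
         * (Each cached object is a page-sized kvfree_rcu_bulk_data block;
         * on a typical 64-bit configuration with 4 KiB pages, the
         * KVFREE_BULK_MAX_ENTR computation above works out to
         * (4096 - 16) / 8 = 510 pointer slots per block.)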
3043 */ 3044 struct llist_head bkvcache; 3045 int nr_bkv_objs; 3046 }; 3047 3048 static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = { 3049 .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock), 3050 }; 3051 3052 static __always_inline void 3053 debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) 3054 { 3055 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 3056 int i; 3057 3058 for (i = 0; i < bhead->nr_records; i++) 3059 debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i])); 3060 #endif 3061 } 3062 3063 static inline struct kfree_rcu_cpu * 3064 krc_this_cpu_lock(unsigned long *flags) 3065 { 3066 struct kfree_rcu_cpu *krcp; 3067 3068 local_irq_save(*flags); // For safely calling this_cpu_ptr(). 3069 krcp = this_cpu_ptr(&krc); 3070 raw_spin_lock(&krcp->lock); 3071 3072 return krcp; 3073 } 3074 3075 static inline void 3076 krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) 3077 { 3078 raw_spin_unlock(&krcp->lock); 3079 local_irq_restore(flags); 3080 } 3081 3082 static inline struct kvfree_rcu_bulk_data * 3083 get_cached_bnode(struct kfree_rcu_cpu *krcp) 3084 { 3085 if (!krcp->nr_bkv_objs) 3086 return NULL; 3087 3088 krcp->nr_bkv_objs--; 3089 return (struct kvfree_rcu_bulk_data *) 3090 llist_del_first(&krcp->bkvcache); 3091 } 3092 3093 static inline bool 3094 put_cached_bnode(struct kfree_rcu_cpu *krcp, 3095 struct kvfree_rcu_bulk_data *bnode) 3096 { 3097 // Check the limit. 3098 if (krcp->nr_bkv_objs >= rcu_min_cached_objs) 3099 return false; 3100 3101 llist_add((struct llist_node *) bnode, &krcp->bkvcache); 3102 krcp->nr_bkv_objs++; 3103 return true; 3104 3105 } 3106 3107 /* 3108 * This function is invoked in workqueue context after a grace period. 3109 * It frees all the objects queued on ->bhead_free or ->head_free. 3110 */ 3111 static void kfree_rcu_work(struct work_struct *work) 3112 { 3113 unsigned long flags; 3114 struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext; 3115 struct rcu_head *head, *next; 3116 struct kfree_rcu_cpu *krcp; 3117 struct kfree_rcu_cpu_work *krwp; 3118 int i, j; 3119 3120 krwp = container_of(to_rcu_work(work), 3121 struct kfree_rcu_cpu_work, rcu_work); 3122 krcp = krwp->krcp; 3123 3124 raw_spin_lock_irqsave(&krcp->lock, flags); 3125 // Channels 1 and 2. 3126 for (i = 0; i < FREE_N_CHANNELS; i++) { 3127 bkvhead[i] = krwp->bkvhead_free[i]; 3128 krwp->bkvhead_free[i] = NULL; 3129 } 3130 3131 // Channel 3. 3132 head = krwp->head_free; 3133 krwp->head_free = NULL; 3134 raw_spin_unlock_irqrestore(&krcp->lock, flags); 3135 3136 // Handle two first channels. 3137 for (i = 0; i < FREE_N_CHANNELS; i++) { 3138 for (; bkvhead[i]; bkvhead[i] = bnext) { 3139 bnext = bkvhead[i]->next; 3140 debug_rcu_bhead_unqueue(bkvhead[i]); 3141 3142 rcu_lock_acquire(&rcu_callback_map); 3143 if (i == 0) { // kmalloc() / kfree(). 3144 trace_rcu_invoke_kfree_bulk_callback( 3145 rcu_state.name, bkvhead[i]->nr_records, 3146 bkvhead[i]->records); 3147 3148 kfree_bulk(bkvhead[i]->nr_records, 3149 bkvhead[i]->records); 3150 } else { // vmalloc() / vfree(). 
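                                // Unlike the kfree_bulk() channel above,
                                // vmalloc() pointers are freed one at a
                                // time below.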
                                for (j = 0; j < bkvhead[i]->nr_records; j++) {
                                        trace_rcu_invoke_kvfree_callback(
                                                rcu_state.name,
                                                bkvhead[i]->records[j], 0);

                                        vfree(bkvhead[i]->records[j]);
                                }
                        }
                        rcu_lock_release(&rcu_callback_map);

                        krcp = krc_this_cpu_lock(&flags);
                        if (put_cached_bnode(krcp, bkvhead[i]))
                                bkvhead[i] = NULL;
                        krc_this_cpu_unlock(krcp, flags);

                        if (bkvhead[i])
                                free_page((unsigned long) bkvhead[i]);

                        cond_resched_tasks_rcu_qs();
                }
        }

        /*
         * This is the emergency case only. It can happen under a low-memory
         * condition, when an allocation has failed, so the "bulk" path
         * cannot be maintained for the time being.
         */
        for (; head; head = next) {
                unsigned long offset = (unsigned long)head->func;
                void *ptr = (void *)head - offset;

                next = head->next;
                debug_rcu_head_unqueue((struct rcu_head *)ptr);
                rcu_lock_acquire(&rcu_callback_map);
                trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);

                if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
                        kvfree(ptr);

                rcu_lock_release(&rcu_callback_map);
                cond_resched_tasks_rcu_qs();
        }
}

/*
 * Schedule the kfree batch RCU work to run in workqueue context after a GP.
 *
 * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES
 * timeout has been reached.
 */
static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
{
        struct kfree_rcu_cpu_work *krwp;
        bool repeat = false;
        int i, j;

        lockdep_assert_held(&krcp->lock);

        for (i = 0; i < KFREE_N_BATCHES; i++) {
                krwp = &(krcp->krw_arr[i]);

                /*
                 * Try to detach bkvhead or head and attach it to the
                 * corresponding free channel if that channel is available.
                 * A previous RCU batch may still be in progress, in which
                 * case it is not possible to queue another one immediately;
                 * false is then returned to tell the caller to retry.
                 */
                if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
                    (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
                    (krcp->head && !krwp->head_free)) {
                        // Channel 1 corresponds to SLAB ptrs.
                        // Channel 2 corresponds to vmalloc ptrs.
                        for (j = 0; j < FREE_N_CHANNELS; j++) {
                                if (!krwp->bkvhead_free[j]) {
                                        krwp->bkvhead_free[j] = krcp->bkvhead[j];
                                        krcp->bkvhead[j] = NULL;
                                }
                        }

                        // Channel 3 corresponds to the emergency path.
                        if (!krwp->head_free) {
                                krwp->head_free = krcp->head;
                                krcp->head = NULL;
                        }

                        WRITE_ONCE(krcp->count, 0);

                        /*
                         * There is one work item per batch, so each batch
                         * can handle the three "free channels". The work
                         * item may already be pending when the channels
                         * have been detached one after another.
                         */
                        queue_rcu_work(system_wq, &krwp->rcu_work);
                }

                // Repeat if any corresponding "free" channel is still busy.
                if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
                        repeat = true;
        }

        return !repeat;
}

static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
                                          unsigned long flags)
{
        // Attempt to start a new batch.
        krcp->monitor_todo = false;
        if (queue_kfree_rcu_work(krcp)) {
                // Success! Our job is done here.
                raw_spin_unlock_irqrestore(&krcp->lock, flags);
                return;
        }

        // Previous RCU batch still in progress, try again later.
3269 krcp->monitor_todo = true; 3270 schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); 3271 raw_spin_unlock_irqrestore(&krcp->lock, flags); 3272 } 3273 3274 /* 3275 * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. 3276 * It invokes kfree_rcu_drain_unlock() to attempt to start another batch. 3277 */ 3278 static void kfree_rcu_monitor(struct work_struct *work) 3279 { 3280 unsigned long flags; 3281 struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, 3282 monitor_work.work); 3283 3284 raw_spin_lock_irqsave(&krcp->lock, flags); 3285 if (krcp->monitor_todo) 3286 kfree_rcu_drain_unlock(krcp, flags); 3287 else 3288 raw_spin_unlock_irqrestore(&krcp->lock, flags); 3289 } 3290 3291 static inline bool 3292 kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) 3293 { 3294 struct kvfree_rcu_bulk_data *bnode; 3295 int idx; 3296 3297 if (unlikely(!krcp->initialized)) 3298 return false; 3299 3300 lockdep_assert_held(&krcp->lock); 3301 idx = !!is_vmalloc_addr(ptr); 3302 3303 /* Check if a new block is required. */ 3304 if (!krcp->bkvhead[idx] || 3305 krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { 3306 bnode = get_cached_bnode(krcp); 3307 if (!bnode) { 3308 /* 3309 * To keep this path working on raw non-preemptible 3310 * sections, prevent the optional entry into the 3311 * allocator as it uses sleeping locks. In fact, even 3312 * if the caller of kfree_rcu() is preemptible, this 3313 * path still is not, as krcp->lock is a raw spinlock. 3314 * With additional page pre-allocation in the works, 3315 * hitting this return is going to be much less likely. 3316 */ 3317 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 3318 return false; 3319 3320 /* 3321 * NOTE: For the single-argument form of kvfree_rcu() we 3322 * could drop the lock and get the page in sleepable 3323 * context. That would allow us to maintain the array 3324 * for CONFIG_PREEMPT_RT as well when no cached 3325 * pages are available. 3326 */ 3327 bnode = (struct kvfree_rcu_bulk_data *) 3328 __get_free_page(GFP_NOWAIT | __GFP_NOWARN); 3329 } 3330 3331 /* Switch to emergency path. */ 3332 if (unlikely(!bnode)) 3333 return false; 3334 3335 /* Initialize the new block. */ 3336 bnode->nr_records = 0; 3337 bnode->next = krcp->bkvhead[idx]; 3338 3339 /* Attach it to the head. */ 3340 krcp->bkvhead[idx] = bnode; 3341 } 3342 3343 /* Finally insert. */ 3344 krcp->bkvhead[idx]->records 3345 [krcp->bkvhead[idx]->nr_records++] = ptr; 3346 3347 return true; 3348 } 3349 3350 /* 3351 * Queue a request for lazy invocation of the appropriate free routine after a 3352 * grace period. Please note that three paths are maintained: two are the 3353 * main ones, which use an array-of-pointers interface, and the third is an 3354 * emergency one, used only when the main path temporarily cannot be maintained 3355 * due to memory pressure. 3356 * 3357 * Each kvfree_call_rcu() request is added to a batch. The batch will be drained 3358 * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will 3359 * be free'd in workqueue context. This allows us to batch requests together to 3360 * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
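 *
 * For illustration only (the caller below is hypothetical, not part of
 * this file): a typical user embeds an rcu_head in its structure and
 * invokes the two-argument form:
 *
 *	struct foo {
 *		int a;
 *		struct rcu_head rcu;
 *	};
 *
 *	static void release_foo(struct foo *fp)
 *	{
 *		kvfree_rcu(fp, rcu);	// Freed by this machinery after a GP.
 *	}
 *
 * The head-less single-argument form, kvfree_rcu(fp), has no rcu_head to
 * fall back on and may need to block, so it is restricted to sleepable
 * context (see the might_sleep() rule in kvfree_call_rcu() below).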
3361 */ 3362 void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) 3363 { 3364 unsigned long flags; 3365 struct kfree_rcu_cpu *krcp; 3366 bool success; 3367 void *ptr; 3368 3369 if (head) { 3370 ptr = (void *) head - (unsigned long) func; 3371 } else { 3372 /* 3373 * Please note there is a limitation for the head-less 3374 * variant, that is why there is a clear rule for such 3375 * objects: it can be used from might_sleep() context 3376 * only. For other places please embed an rcu_head to 3377 * your data. 3378 */ 3379 might_sleep(); 3380 ptr = (unsigned long *) func; 3381 } 3382 3383 krcp = krc_this_cpu_lock(&flags); 3384 3385 // Queue the object but don't yet schedule the batch. 3386 if (debug_rcu_head_queue(ptr)) { 3387 // Probable double kfree_rcu(), just leak. 3388 WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", 3389 __func__, head); 3390 3391 // Mark as success and leave. 3392 success = true; 3393 goto unlock_return; 3394 } 3395 3396 /* 3397 * Under high memory pressure GFP_NOWAIT can fail, 3398 * in that case the emergency path is maintained. 3399 */ 3400 success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); 3401 if (!success) { 3402 if (head == NULL) 3403 // Inline if kvfree_rcu(one_arg) call. 3404 goto unlock_return; 3405 3406 head->func = func; 3407 head->next = krcp->head; 3408 krcp->head = head; 3409 success = true; 3410 } 3411 3412 WRITE_ONCE(krcp->count, krcp->count + 1); 3413 3414 // Set timer to drain after KFREE_DRAIN_JIFFIES. 3415 if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && 3416 !krcp->monitor_todo) { 3417 krcp->monitor_todo = true; 3418 schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); 3419 } 3420 3421 unlock_return: 3422 krc_this_cpu_unlock(krcp, flags); 3423 3424 /* 3425 * Inline kvfree() after synchronize_rcu(). We can do 3426 * it from might_sleep() context only, so the current 3427 * CPU can pass the QS state. 3428 */ 3429 if (!success) { 3430 debug_rcu_head_unqueue((struct rcu_head *) ptr); 3431 synchronize_rcu(); 3432 kvfree(ptr); 3433 } 3434 } 3435 EXPORT_SYMBOL_GPL(kvfree_call_rcu); 3436 3437 static unsigned long 3438 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) 3439 { 3440 int cpu; 3441 unsigned long count = 0; 3442 3443 /* Snapshot count of all CPUs */ 3444 for_each_online_cpu(cpu) { 3445 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); 3446 3447 count += READ_ONCE(krcp->count); 3448 } 3449 3450 return count; 3451 } 3452 3453 static unsigned long 3454 kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) 3455 { 3456 int cpu, freed = 0; 3457 unsigned long flags; 3458 3459 for_each_online_cpu(cpu) { 3460 int count; 3461 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); 3462 3463 count = krcp->count; 3464 raw_spin_lock_irqsave(&krcp->lock, flags); 3465 if (krcp->monitor_todo) 3466 kfree_rcu_drain_unlock(krcp, flags); 3467 else 3468 raw_spin_unlock_irqrestore(&krcp->lock, flags); 3469 3470 sc->nr_to_scan -= count; 3471 freed += count; 3472 3473 if (sc->nr_to_scan <= 0) 3474 break; 3475 } 3476 3477 return freed == 0 ? 
SHRINK_STOP : freed; 3478 } 3479 3480 static struct shrinker kfree_rcu_shrinker = { 3481 .count_objects = kfree_rcu_shrink_count, 3482 .scan_objects = kfree_rcu_shrink_scan, 3483 .batch = 0, 3484 .seeks = DEFAULT_SEEKS, 3485 }; 3486 3487 void __init kfree_rcu_scheduler_running(void) 3488 { 3489 int cpu; 3490 unsigned long flags; 3491 3492 for_each_online_cpu(cpu) { 3493 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); 3494 3495 raw_spin_lock_irqsave(&krcp->lock, flags); 3496 if (!krcp->head || krcp->monitor_todo) { 3497 raw_spin_unlock_irqrestore(&krcp->lock, flags); 3498 continue; 3499 } 3500 krcp->monitor_todo = true; 3501 schedule_delayed_work_on(cpu, &krcp->monitor_work, 3502 KFREE_DRAIN_JIFFIES); 3503 raw_spin_unlock_irqrestore(&krcp->lock, flags); 3504 } 3505 } 3506 3507 /* 3508 * During early boot, any blocking grace-period wait automatically 3509 * implies a grace period. Later on, this is never the case for PREEMPTION. 3510 * 3511 * However, because a context switch is a grace period for !PREEMPTION, any 3512 * blocking grace-period wait automatically implies a grace period if 3513 * there is only one CPU online at any point in time during execution of 3514 * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to 3515 * occasionally incorrectly indicate that there are multiple CPUs online 3516 * when there was in fact only one the whole time, as this just adds some 3517 * overhead: RCU still operates correctly. 3518 */ 3519 static int rcu_blocking_is_gp(void) 3520 { 3521 int ret; 3522 3523 if (IS_ENABLED(CONFIG_PREEMPTION)) 3524 return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; 3525 might_sleep(); /* Check for RCU read-side critical section. */ 3526 preempt_disable(); 3527 ret = num_online_cpus() <= 1; 3528 preempt_enable(); 3529 return ret; 3530 } 3531 3532 /** 3533 * synchronize_rcu - wait until a grace period has elapsed. 3534 * 3535 * Control will return to the caller some time after a full grace 3536 * period has elapsed, in other words after all currently executing RCU 3537 * read-side critical sections have completed. Note, however, that 3538 * upon return from synchronize_rcu(), the caller might well be executing 3539 * concurrently with new RCU read-side critical sections that began while 3540 * synchronize_rcu() was waiting. RCU read-side critical sections are 3541 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. 3542 * In addition, regions of code across which interrupts, preemption, or 3543 * softirqs have been disabled also serve as RCU read-side critical 3544 * sections. This includes hardware interrupt handlers, softirq handlers, 3545 * and NMI handlers. 3546 * 3547 * Note that this guarantee implies further memory-ordering guarantees. 3548 * On systems with more than one CPU, when synchronize_rcu() returns, 3549 * each CPU is guaranteed to have executed a full memory barrier since 3550 * the end of its last RCU read-side critical section whose beginning 3551 * preceded the call to synchronize_rcu(). In addition, each CPU having 3552 * an RCU read-side critical section that extends beyond the return from 3553 * synchronize_rcu() is guaranteed to have executed a full memory barrier 3554 * after the beginning of synchronize_rcu() and before the beginning of 3555 * that RCU read-side critical section. Note that these guarantees include 3556 * CPUs that are offline, idle, or executing in user mode, as well as CPUs 3557 * that are executing in the kernel.
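 *
 * As an illustrative sketch (the "foo" list and its lock are hypothetical,
 * not part of this kernel-doc), a typical updater relies on these
 * guarantees as follows:
 *
 *	spin_lock(&foo_lock);
 *	list_del_rcu(&p->list);
 *	spin_unlock(&foo_lock);
 *	synchronize_rcu();	// Wait for pre-existing readers of p.
 *	kfree(p);
 *
 * Readers traversing the list under rcu_read_lock() either see p before
 * the list_del_rcu() or not at all, and once synchronize_rcu() returns,
 * none of them can still be referencing p, so the kfree() is safe.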
3558 * 3559 * Furthermore, if CPU A invoked synchronize_rcu(), which returned 3560 * to its caller on CPU B, then both CPU A and CPU B are guaranteed 3561 * to have executed a full memory barrier during the execution of 3562 * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but 3563 * again only if the system has more than one CPU). 3564 */ 3565 void synchronize_rcu(void) 3566 { 3567 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || 3568 lock_is_held(&rcu_lock_map) || 3569 lock_is_held(&rcu_sched_lock_map), 3570 "Illegal synchronize_rcu() in RCU read-side critical section"); 3571 if (rcu_blocking_is_gp()) 3572 return; 3573 if (rcu_gp_is_expedited()) 3574 synchronize_rcu_expedited(); 3575 else 3576 wait_rcu_gp(call_rcu); 3577 } 3578 EXPORT_SYMBOL_GPL(synchronize_rcu); 3579 3580 /** 3581 * get_state_synchronize_rcu - Snapshot current RCU state 3582 * 3583 * Returns a cookie that is used by a later call to cond_synchronize_rcu() 3584 * to determine whether or not a full grace period has elapsed in the 3585 * meantime. 3586 */ 3587 unsigned long get_state_synchronize_rcu(void) 3588 { 3589 /* 3590 * Any prior manipulation of RCU-protected data must happen 3591 * before the load from ->gp_seq. 3592 */ 3593 smp_mb(); /* ^^^ */ 3594 return rcu_seq_snap(&rcu_state.gp_seq); 3595 } 3596 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); 3597 3598 /** 3599 * cond_synchronize_rcu - Conditionally wait for an RCU grace period 3600 * 3601 * @oldstate: return value from earlier call to get_state_synchronize_rcu() 3602 * 3603 * If a full RCU grace period has elapsed since the earlier call to 3604 * get_state_synchronize_rcu(), just return. Otherwise, invoke 3605 * synchronize_rcu() to wait for a full grace period. 3606 * 3607 * Yes, this function does not take counter wrap into account. But 3608 * counter wrap is harmless. If the counter wraps, we have waited for 3609 * more than 2 billion grace periods (and way more on a 64-bit system!), 3610 * so waiting for one additional grace period should be just fine. 3611 */ 3612 void cond_synchronize_rcu(unsigned long oldstate) 3613 { 3614 if (!rcu_seq_done(&rcu_state.gp_seq, oldstate)) 3615 synchronize_rcu(); 3616 else 3617 smp_mb(); /* Ensure GP ends before subsequent accesses. */ 3618 } 3619 EXPORT_SYMBOL_GPL(cond_synchronize_rcu); 3620 3621 /* 3622 * Check to see if there is any immediate RCU-related work to be done by 3623 * the current CPU, returning 1 if so and zero otherwise. The checks are 3624 * in order of increasing expense: checks that can be carried out against 3625 * CPU-local state are performed first. However, we must check for CPU 3626 * stalls first, else we might not get a chance. 3627 */ 3628 static int rcu_pending(int user) 3629 { 3630 bool gp_in_progress; 3631 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 3632 struct rcu_node *rnp = rdp->mynode; 3633 3634 /* Check for CPU stalls, if enabled. */ 3635 check_cpu_stall(rdp); 3636 3637 /* Does this CPU need a deferred NOCB wakeup? */ 3638 if (rcu_nocb_need_deferred_wakeup(rdp)) 3639 return 1; 3640 3641 /* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */ 3642 if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu()) 3643 return 0; 3644 3645 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3646 gp_in_progress = rcu_gp_in_progress(); 3647 if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress) 3648 return 1; 3649 3650 /* Does this CPU have callbacks ready to invoke? 
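   (That is, callbacks whose grace period has already ended and that rcu_segcblist_ready_cbs() will therefore report.)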
*/ 3651 if (rcu_segcblist_ready_cbs(&rdp->cblist)) 3652 return 1; 3653 3654 /* Has RCU gone idle with this CPU needing another grace period? */ 3655 if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) && 3656 (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) || 3657 !rcu_segcblist_is_offloaded(&rdp->cblist)) && 3658 !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) 3659 return 1; 3660 3661 /* Have RCU grace period completed or started? */ 3662 if (rcu_seq_current(&rnp->gp_seq) != rdp->gp_seq || 3663 unlikely(READ_ONCE(rdp->gpwrap))) /* outside lock */ 3664 return 1; 3665 3666 /* nothing to do */ 3667 return 0; 3668 } 3669 3670 /* 3671 * Helper function for rcu_barrier() tracing. If tracing is disabled, 3672 * the compiler is expected to optimize this away. 3673 */ 3674 static void rcu_barrier_trace(const char *s, int cpu, unsigned long done) 3675 { 3676 trace_rcu_barrier(rcu_state.name, s, cpu, 3677 atomic_read(&rcu_state.barrier_cpu_count), done); 3678 } 3679 3680 /* 3681 * RCU callback function for rcu_barrier(). If we are last, wake 3682 * up the task executing rcu_barrier(). 3683 * 3684 * Note that the value of rcu_state.barrier_sequence must be captured 3685 * before the atomic_dec_and_test(). Otherwise, if this CPU is not last, 3686 * other CPUs might count the value down to zero before this CPU gets 3687 * around to invoking rcu_barrier_trace(), which might result in bogus 3688 * data from the next instance of rcu_barrier(). 3689 */ 3690 static void rcu_barrier_callback(struct rcu_head *rhp) 3691 { 3692 unsigned long __maybe_unused s = rcu_state.barrier_sequence; 3693 3694 if (atomic_dec_and_test(&rcu_state.barrier_cpu_count)) { 3695 rcu_barrier_trace(TPS("LastCB"), -1, s); 3696 complete(&rcu_state.barrier_completion); 3697 } else { 3698 rcu_barrier_trace(TPS("CB"), -1, s); 3699 } 3700 } 3701 3702 /* 3703 * Called with preemption disabled, and from cross-cpu IRQ context. 3704 */ 3705 static void rcu_barrier_func(void *cpu_in) 3706 { 3707 uintptr_t cpu = (uintptr_t)cpu_in; 3708 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 3709 3710 rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence); 3711 rdp->barrier_head.func = rcu_barrier_callback; 3712 debug_rcu_head_queue(&rdp->barrier_head); 3713 rcu_nocb_lock(rdp); 3714 WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); 3715 if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) { 3716 atomic_inc(&rcu_state.barrier_cpu_count); 3717 } else { 3718 debug_rcu_head_unqueue(&rdp->barrier_head); 3719 rcu_barrier_trace(TPS("IRQNQ"), -1, 3720 rcu_state.barrier_sequence); 3721 } 3722 rcu_nocb_unlock(rdp); 3723 } 3724 3725 /** 3726 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 3727 * 3728 * Note that this primitive does not necessarily wait for an RCU grace period 3729 * to complete. For example, if there are no RCU callbacks queued anywhere 3730 * in the system, then rcu_barrier() is within its rights to return 3731 * immediately, without waiting for anything, much less an RCU grace period. 3732 */ 3733 void rcu_barrier(void) 3734 { 3735 uintptr_t cpu; 3736 struct rcu_data *rdp; 3737 unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); 3738 3739 rcu_barrier_trace(TPS("Begin"), -1, s); 3740 3741 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 3742 mutex_lock(&rcu_state.barrier_mutex); 3743 3744 /* Did someone else do our work for us? 
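   (That is, did a concurrent rcu_barrier() complete after the rcu_seq_snap() above? If so, all callbacks queued before this call have already been invoked, so there is nothing left to wait for.)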
*/ 3745 if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { 3746 rcu_barrier_trace(TPS("EarlyExit"), -1, 3747 rcu_state.barrier_sequence); 3748 smp_mb(); /* caller's subsequent code after above check. */ 3749 mutex_unlock(&rcu_state.barrier_mutex); 3750 return; 3751 } 3752 3753 /* Mark the start of the barrier operation. */ 3754 rcu_seq_start(&rcu_state.barrier_sequence); 3755 rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence); 3756 3757 /* 3758 * Initialize the count to two rather than to zero in order 3759 * to avoid a too-soon return to zero in case of an immediate 3760 * invocation of the just-enqueued callback (or preemption of 3761 * this task). Exclude CPU-hotplug operations to ensure that no 3762 * offline non-offloaded CPU has callbacks queued. 3763 */ 3764 init_completion(&rcu_state.barrier_completion); 3765 atomic_set(&rcu_state.barrier_cpu_count, 2); 3766 get_online_cpus(); 3767 3768 /* 3769 * Force each CPU with callbacks to register a new callback. 3770 * When that callback is invoked, we will know that all of the 3771 * corresponding CPU's preceding callbacks have been invoked. 3772 */ 3773 for_each_possible_cpu(cpu) { 3774 rdp = per_cpu_ptr(&rcu_data, cpu); 3775 if (cpu_is_offline(cpu) && 3776 !rcu_segcblist_is_offloaded(&rdp->cblist)) 3777 continue; 3778 if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) { 3779 rcu_barrier_trace(TPS("OnlineQ"), cpu, 3780 rcu_state.barrier_sequence); 3781 smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1); 3782 } else if (rcu_segcblist_n_cbs(&rdp->cblist) && 3783 cpu_is_offline(cpu)) { 3784 rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, 3785 rcu_state.barrier_sequence); 3786 local_irq_disable(); 3787 rcu_barrier_func((void *)cpu); 3788 local_irq_enable(); 3789 } else if (cpu_is_offline(cpu)) { 3790 rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu, 3791 rcu_state.barrier_sequence); 3792 } else { 3793 rcu_barrier_trace(TPS("OnlineNQ"), cpu, 3794 rcu_state.barrier_sequence); 3795 } 3796 } 3797 put_online_cpus(); 3798 3799 /* 3800 * Now that we have an rcu_barrier_callback() callback on each 3801 * CPU, and thus each counted, remove the initial count. 3802 */ 3803 if (atomic_sub_and_test(2, &rcu_state.barrier_cpu_count)) 3804 complete(&rcu_state.barrier_completion); 3805 3806 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ 3807 wait_for_completion(&rcu_state.barrier_completion); 3808 3809 /* Mark the end of the barrier operation. */ 3810 rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence); 3811 rcu_seq_end(&rcu_state.barrier_sequence); 3812 3813 /* Other rcu_barrier() invocations can now safely proceed. */ 3814 mutex_unlock(&rcu_state.barrier_mutex); 3815 } 3816 EXPORT_SYMBOL_GPL(rcu_barrier); 3817 3818 /* 3819 * Propagate ->qsmaskinit bits up the rcu_node tree to account for the 3820 * first CPU in a given leaf rcu_node structure coming online. The caller 3821 * must hold the corresponding leaf rcu_node ->lock with interrupts 3822 * disabled. 3823 */ 3824 static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) 3825 { 3826 long mask; 3827 long oldmask; 3828 struct rcu_node *rnp = rnp_leaf; 3829 3830 raw_lockdep_assert_held_rcu_node(rnp_leaf); 3831 WARN_ON_ONCE(rnp->wait_blkd_tasks); 3832 for (;;) { 3833 mask = rnp->grpmask; 3834 rnp = rnp->parent; 3835 if (rnp == NULL) 3836 return; 3837 raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ 3838 oldmask = rnp->qsmaskinit; 3839 rnp->qsmaskinit |= mask; 3840 raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled.
*/ 3841 if (oldmask) 3842 return; 3843 } 3844 } 3845 3846 /* 3847 * Do boot-time initialization of a CPU's per-CPU RCU data. 3848 */ 3849 static void __init 3850 rcu_boot_init_percpu_data(int cpu) 3851 { 3852 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 3853 3854 /* Set up local state, ensuring consistent view of global state. */ 3855 rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); 3856 WARN_ON_ONCE(rdp->dynticks_nesting != 1); 3857 WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); 3858 rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; 3859 rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED; 3860 rdp->rcu_onl_gp_seq = rcu_state.gp_seq; 3861 rdp->rcu_onl_gp_flags = RCU_GP_CLEANED; 3862 rdp->cpu = cpu; 3863 rcu_boot_init_nocb_percpu_data(rdp); 3864 } 3865 3866 /* 3867 * Invoked early in the CPU-online process, when pretty much all services 3868 * are available. The incoming CPU is not present. 3869 * 3870 * Initializes a CPU's per-CPU RCU data. Note that only one online or 3871 * offline event can be happening at a given time. Note also that we can 3872 * accept some slop in the rsp->gp_seq access due to the fact that this 3873 * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet. 3874 * And any offloaded callbacks are being numbered elsewhere. 3875 */ 3876 int rcutree_prepare_cpu(unsigned int cpu) 3877 { 3878 unsigned long flags; 3879 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 3880 struct rcu_node *rnp = rcu_get_root(); 3881 3882 /* Set up local state, ensuring consistent view of global state. */ 3883 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3884 rdp->qlen_last_fqs_check = 0; 3885 rdp->n_force_qs_snap = rcu_state.n_force_qs; 3886 rdp->blimit = blimit; 3887 if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ 3888 !rcu_segcblist_is_offloaded(&rdp->cblist)) 3889 rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ 3890 rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ 3891 rcu_dynticks_eqs_online(); 3892 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ 3893 3894 /* 3895 * Add CPU to leaf rcu_node pending-online bitmask. Any needed 3896 * propagation up the rcu_node tree will happen at the beginning 3897 * of the next grace period. 3898 */ 3899 rnp = rdp->mynode; 3900 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 3901 rdp->beenonline = true; /* We have now been online. */ 3902 rdp->gp_seq = READ_ONCE(rnp->gp_seq); 3903 rdp->gp_seq_needed = rdp->gp_seq; 3904 rdp->cpu_no_qs.b.norm = true; 3905 rdp->core_needs_qs = false; 3906 rdp->rcu_iw_pending = false; 3907 rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; 3908 trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); 3909 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3910 rcu_prepare_kthreads(cpu); 3911 rcu_spawn_cpu_nocb_kthread(cpu); 3912 3913 return 0; 3914 } 3915 3916 /* 3917 * Update RCU priority boot kthread affinity for CPU-hotplug changes. 3918 */ 3919 static void rcutree_affinity_setting(unsigned int cpu, int outgoing) 3920 { 3921 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 3922 3923 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); 3924 } 3925 3926 /* 3927 * Near the end of the CPU-online process. Pretty much all services 3928 * enabled, and the CPU is now very much alive. 
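 * The matching teardown step, rcutree_offline_cpu() below, undoes the ->ffmask update made here.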
3929 */ 3930 int rcutree_online_cpu(unsigned int cpu) 3931 { 3932 unsigned long flags; 3933 struct rcu_data *rdp; 3934 struct rcu_node *rnp; 3935 3936 rdp = per_cpu_ptr(&rcu_data, cpu); 3937 rnp = rdp->mynode; 3938 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3939 rnp->ffmask |= rdp->grpmask; 3940 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3941 if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) 3942 return 0; /* Too early in boot for scheduler work. */ 3943 sync_sched_exp_online_cleanup(cpu); 3944 rcutree_affinity_setting(cpu, -1); 3945 3946 // Stop-machine done, so allow nohz_full to disable tick. 3947 tick_dep_clear(TICK_DEP_BIT_RCU); 3948 return 0; 3949 } 3950 3951 /* 3952 * Near the beginning of the CPU-offline process. The CPU is still very much alive 3953 * with pretty much all services enabled. 3954 */ 3955 int rcutree_offline_cpu(unsigned int cpu) 3956 { 3957 unsigned long flags; 3958 struct rcu_data *rdp; 3959 struct rcu_node *rnp; 3960 3961 rdp = per_cpu_ptr(&rcu_data, cpu); 3962 rnp = rdp->mynode; 3963 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3964 rnp->ffmask &= ~rdp->grpmask; 3965 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3966 3967 rcutree_affinity_setting(cpu, cpu); 3968 3969 // nohz_full CPUs need the tick for stop-machine to work quickly 3970 tick_dep_set(TICK_DEP_BIT_RCU); 3971 return 0; 3972 } 3973 3974 static DEFINE_PER_CPU(int, rcu_cpu_started); 3975 3976 /* 3977 * Mark the specified CPU as being online so that subsequent grace periods 3978 * (both expedited and normal) will wait on it. Note that this means that 3979 * incoming CPUs are not allowed to use RCU read-side critical sections 3980 * until this function is called. Failing to observe this restriction 3981 * will result in lockdep splats. 3982 * 3983 * Note that this function is special in that it is invoked directly 3984 * from the incoming CPU rather than from the cpuhp_step mechanism. 3985 * This is because this function must be invoked at a precise location. 3986 */ 3987 void rcu_cpu_starting(unsigned int cpu) 3988 { 3989 unsigned long flags; 3990 unsigned long mask; 3991 struct rcu_data *rdp; 3992 struct rcu_node *rnp; 3993 bool newcpu; 3994 3995 if (per_cpu(rcu_cpu_started, cpu)) 3996 return; 3997 3998 per_cpu(rcu_cpu_started, cpu) = 1; 3999 4000 rdp = per_cpu_ptr(&rcu_data, cpu); 4001 rnp = rdp->mynode; 4002 mask = rdp->grpmask; 4003 raw_spin_lock_irqsave_rcu_node(rnp, flags); 4004 WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); 4005 newcpu = !(rnp->expmaskinitnext & mask); 4006 rnp->expmaskinitnext |= mask; 4007 /* Allow lockless access for expedited grace periods. */ 4008 smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */ 4009 ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); 4010 rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ 4011 rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); 4012 rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); 4013 if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ 4014 rcu_disable_urgency_upon_qs(rdp); 4015 /* Report QS -after- changing ->qsmaskinitnext! */ 4016 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); 4017 } else { 4018 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 4019 } 4020 smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ 4021 } 4022 4023 #ifdef CONFIG_HOTPLUG_CPU 4024 /* 4025 * The outgoing CPU has no further need of RCU, so remove it from 4026 * the rcu_node tree's ->qsmaskinitnext bit masks.
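 * The mask is cleared while holding both rcu_state.ofl_lock and the leaf rcu_node's ->lock so that concurrent grace-period initialization sees a consistent view of which CPUs must be waited on.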
4027 * 4028 * Note that this function is special in that it is invoked directly 4029 * from the outgoing CPU rather than from the cpuhp_step mechanism. 4030 * This is because this function must be invoked at a precise location. 4031 */ 4032 void rcu_report_dead(unsigned int cpu) 4033 { 4034 unsigned long flags; 4035 unsigned long mask; 4036 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 4037 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 4038 4039 /* QS for any half-done expedited grace period. */ 4040 preempt_disable(); 4041 rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); 4042 preempt_enable(); 4043 rcu_preempt_deferred_qs(current); 4044 4045 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ 4046 mask = rdp->grpmask; 4047 raw_spin_lock(&rcu_state.ofl_lock); 4048 raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ 4049 rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); 4050 rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags); 4051 if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */ 4052 /* Report quiescent state -before- changing ->qsmaskinitnext! */ 4053 rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); 4054 raw_spin_lock_irqsave_rcu_node(rnp, flags); 4055 } 4056 WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); 4057 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 4058 raw_spin_unlock(&rcu_state.ofl_lock); 4059 4060 per_cpu(rcu_cpu_started, cpu) = 0; 4061 } 4062 4063 /* 4064 * The outgoing CPU has just passed through the dying-idle state, and we 4065 * are being invoked from the CPU that was IPIed to continue the offline 4066 * operation. Migrate the outgoing CPU's callbacks to the current CPU. 4067 */ 4068 void rcutree_migrate_callbacks(int cpu) 4069 { 4070 unsigned long flags; 4071 struct rcu_data *my_rdp; 4072 struct rcu_node *my_rnp; 4073 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 4074 bool needwake; 4075 4076 if (rcu_segcblist_is_offloaded(&rdp->cblist) || 4077 rcu_segcblist_empty(&rdp->cblist)) 4078 return; /* No callbacks to migrate. */ 4079 4080 local_irq_save(flags); 4081 my_rdp = this_cpu_ptr(&rcu_data); 4082 my_rnp = my_rdp->mynode; 4083 rcu_nocb_lock(my_rdp); /* irqs already disabled. */ 4084 WARN_ON_ONCE(!rcu_nocb_flush_bypass(my_rdp, NULL, jiffies)); 4085 raw_spin_lock_rcu_node(my_rnp); /* irqs already disabled. */ 4086 /* Leverage recent GPs and set GP for new callbacks. */ 4087 needwake = rcu_advance_cbs(my_rnp, rdp) || 4088 rcu_advance_cbs(my_rnp, my_rdp); 4089 rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); 4090 needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp); 4091 rcu_segcblist_disable(&rdp->cblist); 4092 WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != 4093 !rcu_segcblist_n_cbs(&my_rdp->cblist)); 4094 if (rcu_segcblist_is_offloaded(&my_rdp->cblist)) { 4095 raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ 4096 __call_rcu_nocb_wake(my_rdp, true, flags); 4097 } else { 4098 rcu_nocb_unlock(my_rdp); /* irqs remain disabled. 
*/ 4099 raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags); 4100 } 4101 if (needwake) 4102 rcu_gp_kthread_wake(); 4103 lockdep_assert_irqs_enabled(); 4104 WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || 4105 !rcu_segcblist_empty(&rdp->cblist), 4106 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", 4107 cpu, rcu_segcblist_n_cbs(&rdp->cblist), 4108 rcu_segcblist_first_cb(&rdp->cblist)); 4109 } 4110 #endif 4111 4112 /* 4113 * On non-huge systems, use expedited RCU grace periods to make suspend 4114 * and hibernation run faster. 4115 */ 4116 static int rcu_pm_notify(struct notifier_block *self, 4117 unsigned long action, void *hcpu) 4118 { 4119 switch (action) { 4120 case PM_HIBERNATION_PREPARE: 4121 case PM_SUSPEND_PREPARE: 4122 rcu_expedite_gp(); 4123 break; 4124 case PM_POST_HIBERNATION: 4125 case PM_POST_SUSPEND: 4126 rcu_unexpedite_gp(); 4127 break; 4128 default: 4129 break; 4130 } 4131 return NOTIFY_OK; 4132 } 4133 4134 /* 4135 * Spawn the kthreads that handle RCU's grace periods. 4136 */ 4137 static int __init rcu_spawn_gp_kthread(void) 4138 { 4139 unsigned long flags; 4140 int kthread_prio_in = kthread_prio; 4141 struct rcu_node *rnp; 4142 struct sched_param sp; 4143 struct task_struct *t; 4144 4145 /* Force priority into range. */ 4146 if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2 4147 && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)) 4148 kthread_prio = 2; 4149 else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) 4150 kthread_prio = 1; 4151 else if (kthread_prio < 0) 4152 kthread_prio = 0; 4153 else if (kthread_prio > 99) 4154 kthread_prio = 99; 4155 4156 if (kthread_prio != kthread_prio_in) 4157 pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", 4158 kthread_prio, kthread_prio_in); 4159 4160 rcu_scheduler_fully_active = 1; 4161 t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); 4162 if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__)) 4163 return 0; 4164 if (kthread_prio) { 4165 sp.sched_priority = kthread_prio; 4166 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 4167 } 4168 rnp = rcu_get_root(); 4169 raw_spin_lock_irqsave_rcu_node(rnp, flags); 4170 WRITE_ONCE(rcu_state.gp_activity, jiffies); 4171 WRITE_ONCE(rcu_state.gp_req_activity, jiffies); 4172 // Reset .gp_activity and .gp_req_activity before setting .gp_kthread. 4173 smp_store_release(&rcu_state.gp_kthread, t); /* ^^^ */ 4174 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 4175 wake_up_process(t); 4176 rcu_spawn_nocb_kthreads(); 4177 rcu_spawn_boost_kthreads(); 4178 return 0; 4179 } 4180 early_initcall(rcu_spawn_gp_kthread); 4181 4182 /* 4183 * This function is invoked towards the end of the scheduler's 4184 * initialization process. Before this is called, the idle task might 4185 * contain synchronous grace-period primitives (during which time, this idle 4186 * task is booting the system, and such primitives are no-ops). After this 4187 * function is called, any synchronous grace-period primitives are run as 4188 * expedited, with the requesting task driving the grace period forward. 4189 * A later core_initcall() rcu_set_runtime_mode() will switch to full 4190 * runtime RCU functionality. 
4191 */ 4192 void rcu_scheduler_starting(void) 4193 { 4194 WARN_ON(num_online_cpus() != 1); 4195 WARN_ON(nr_context_switches() > 0); 4196 rcu_test_sync_prims(); 4197 rcu_scheduler_active = RCU_SCHEDULER_INIT; 4198 rcu_test_sync_prims(); 4199 } 4200 4201 /* 4202 * Helper function for rcu_init() that initializes the rcu_state structure. 4203 */ 4204 static void __init rcu_init_one(void) 4205 { 4206 static const char * const buf[] = RCU_NODE_NAME_INIT; 4207 static const char * const fqs[] = RCU_FQS_NAME_INIT; 4208 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 4209 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 4210 4211 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 4212 int cpustride = 1; 4213 int i; 4214 int j; 4215 struct rcu_node *rnp; 4216 4217 BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ 4218 4219 /* Silence gcc 4.8 false positive about array index out of range. */ 4220 if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) 4221 panic("rcu_init_one: rcu_num_lvls out of range"); 4222 4223 /* Initialize the level-tracking arrays. */ 4224 4225 for (i = 1; i < rcu_num_lvls; i++) 4226 rcu_state.level[i] = 4227 rcu_state.level[i - 1] + num_rcu_lvl[i - 1]; 4228 rcu_init_levelspread(levelspread, num_rcu_lvl); 4229 4230 /* Initialize the elements themselves, starting from the leaves. */ 4231 4232 for (i = rcu_num_lvls - 1; i >= 0; i--) { 4233 cpustride *= levelspread[i]; 4234 rnp = rcu_state.level[i]; 4235 for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) { 4236 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); 4237 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), 4238 &rcu_node_class[i], buf[i]); 4239 raw_spin_lock_init(&rnp->fqslock); 4240 lockdep_set_class_and_name(&rnp->fqslock, 4241 &rcu_fqs_class[i], fqs[i]); 4242 rnp->gp_seq = rcu_state.gp_seq; 4243 rnp->gp_seq_needed = rcu_state.gp_seq; 4244 rnp->completedqs = rcu_state.gp_seq; 4245 rnp->qsmask = 0; 4246 rnp->qsmaskinit = 0; 4247 rnp->grplo = j * cpustride; 4248 rnp->grphi = (j + 1) * cpustride - 1; 4249 if (rnp->grphi >= nr_cpu_ids) 4250 rnp->grphi = nr_cpu_ids - 1; 4251 if (i == 0) { 4252 rnp->grpnum = 0; 4253 rnp->grpmask = 0; 4254 rnp->parent = NULL; 4255 } else { 4256 rnp->grpnum = j % levelspread[i - 1]; 4257 rnp->grpmask = BIT(rnp->grpnum); 4258 rnp->parent = rcu_state.level[i - 1] + 4259 j / levelspread[i - 1]; 4260 } 4261 rnp->level = i; 4262 INIT_LIST_HEAD(&rnp->blkd_tasks); 4263 rcu_init_one_nocb(rnp); 4264 init_waitqueue_head(&rnp->exp_wq[0]); 4265 init_waitqueue_head(&rnp->exp_wq[1]); 4266 init_waitqueue_head(&rnp->exp_wq[2]); 4267 init_waitqueue_head(&rnp->exp_wq[3]); 4268 spin_lock_init(&rnp->exp_lock); 4269 } 4270 } 4271 4272 init_swait_queue_head(&rcu_state.gp_wq); 4273 init_swait_queue_head(&rcu_state.expedited_wq); 4274 rnp = rcu_first_leaf_node(); 4275 for_each_possible_cpu(i) { 4276 while (i > rnp->grphi) 4277 rnp++; 4278 per_cpu_ptr(&rcu_data, i)->mynode = rnp; 4279 rcu_boot_init_percpu_data(i); 4280 } 4281 } 4282 4283 /* 4284 * Compute the rcu_node tree geometry from kernel parameters. This cannot 4285 * replace the definitions in tree.h because those are needed to size 4286 * the ->node array in the rcu_state structure. 4287 */ 4288 static void __init rcu_init_geometry(void) 4289 { 4290 ulong d; 4291 int i; 4292 int rcu_capacity[RCU_NUM_LVLS]; 4293 4294 /* 4295 * Initialize any unspecified boot parameters. 
4296 * The default values of jiffies_till_first_fqs and 4297 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS 4298 * value, which is a function of HZ, plus one for each 4299 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. 4300 */ 4301 d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; 4302 if (jiffies_till_first_fqs == ULONG_MAX) 4303 jiffies_till_first_fqs = d; 4304 if (jiffies_till_next_fqs == ULONG_MAX) 4305 jiffies_till_next_fqs = d; 4306 adjust_jiffies_till_sched_qs(); 4307 4308 /* If the compile-time values are accurate, just leave. */ 4309 if (rcu_fanout_leaf == RCU_FANOUT_LEAF && 4310 nr_cpu_ids == NR_CPUS) 4311 return; 4312 pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n", 4313 rcu_fanout_leaf, nr_cpu_ids); 4314 4315 /* 4316 * The boot-time rcu_fanout_leaf parameter must be at least two 4317 * and cannot exceed the number of bits in the rcu_node masks. 4318 * Complain and fall back to the compile-time values if this 4319 * limit is exceeded. 4320 */ 4321 if (rcu_fanout_leaf < 2 || 4322 rcu_fanout_leaf > sizeof(unsigned long) * 8) { 4323 rcu_fanout_leaf = RCU_FANOUT_LEAF; 4324 WARN_ON(1); 4325 return; 4326 } 4327 4328 /* 4329 * Compute the number of nodes that can be handled by an rcu_node tree 4330 * with the given number of levels. 4331 */ 4332 rcu_capacity[0] = rcu_fanout_leaf; 4333 for (i = 1; i < RCU_NUM_LVLS; i++) 4334 rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; 4335 4336 /* 4337 * The tree must be able to accommodate the configured number of CPUs. 4338 * If this limit is exceeded, fall back to the compile-time values. 4339 */ 4340 if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) { 4341 rcu_fanout_leaf = RCU_FANOUT_LEAF; 4342 WARN_ON(1); 4343 return; 4344 } 4345 4346 /* Calculate the number of levels in the tree. */ 4347 for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { 4348 } 4349 rcu_num_lvls = i + 1; 4350 4351 /* Calculate the number of rcu_nodes at each level of the tree. */ 4352 for (i = 0; i < rcu_num_lvls; i++) { 4353 int cap = rcu_capacity[(rcu_num_lvls - 1) - i]; 4354 num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap); 4355 } 4356 4357 /* Calculate the total number of rcu_node structures. */ 4358 rcu_num_nodes = 0; 4359 for (i = 0; i < rcu_num_lvls; i++) 4360 rcu_num_nodes += num_rcu_lvl[i]; 4361 } 4362 4363 /* 4364 * Dump out the structure of the rcu_node combining tree associated 4365 * with the rcu_state structure.
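 *
 * As a worked example of the geometry computed by rcu_init_geometry()
 * above (with assumed, not mandated, values rcu_fanout_leaf = 16 and
 * RCU_FANOUT = 64): rcu_capacity[] becomes { 16, 1024 }, so a 1024-CPU
 * system needs two levels, num_rcu_lvl[] = { 1, 64 }, that is, one root
 * rcu_node over 64 leaves of 16 CPUs each, 65 rcu_node structures in
 * total.  The dump below prints one "grplo:grphi ^grpnum" triple per
 * rcu_node, with one row of output per level of the tree.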
4366 */ 4367 static void __init rcu_dump_rcu_node_tree(void) 4368 { 4369 int level = 0; 4370 struct rcu_node *rnp; 4371 4372 pr_info("rcu_node tree layout dump\n"); 4373 pr_info(" "); 4374 rcu_for_each_node_breadth_first(rnp) { 4375 if (rnp->level != level) { 4376 pr_cont("\n"); 4377 pr_info(" "); 4378 level = rnp->level; 4379 } 4380 pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum); 4381 } 4382 pr_cont("\n"); 4383 } 4384 4385 struct workqueue_struct *rcu_gp_wq; 4386 struct workqueue_struct *rcu_par_gp_wq; 4387 4388 static void __init kfree_rcu_batch_init(void) 4389 { 4390 int cpu; 4391 int i; 4392 4393 for_each_possible_cpu(cpu) { 4394 struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); 4395 struct kvfree_rcu_bulk_data *bnode; 4396 4397 for (i = 0; i < KFREE_N_BATCHES; i++) { 4398 INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); 4399 krcp->krw_arr[i].krcp = krcp; 4400 } 4401 4402 for (i = 0; i < rcu_min_cached_objs; i++) { 4403 bnode = (struct kvfree_rcu_bulk_data *) 4404 __get_free_page(GFP_NOWAIT | __GFP_NOWARN); 4405 4406 if (bnode) 4407 put_cached_bnode(krcp, bnode); 4408 else 4409 pr_err("Failed to preallocate for %d CPU!\n", cpu); 4410 } 4411 4412 INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); 4413 krcp->initialized = true; 4414 } 4415 if (register_shrinker(&kfree_rcu_shrinker)) 4416 pr_err("Failed to register kfree_rcu() shrinker!\n"); 4417 } 4418 4419 void __init rcu_init(void) 4420 { 4421 int cpu; 4422 4423 rcu_early_boot_tests(); 4424 4425 kfree_rcu_batch_init(); 4426 rcu_bootup_announce(); 4427 rcu_init_geometry(); 4428 rcu_init_one(); 4429 if (dump_tree) 4430 rcu_dump_rcu_node_tree(); 4431 if (use_softirq) 4432 open_softirq(RCU_SOFTIRQ, rcu_core_si); 4433 4434 /* 4435 * We don't need protection against CPU-hotplug here because 4436 * this is called early in boot, before either interrupts 4437 * or the scheduler are operational. 4438 */ 4439 pm_notifier(rcu_pm_notify, 0); 4440 for_each_online_cpu(cpu) { 4441 rcutree_prepare_cpu(cpu); 4442 rcu_cpu_starting(cpu); 4443 rcutree_online_cpu(cpu); 4444 } 4445 4446 /* Create workqueue for expedited GPs and for Tree SRCU. */ 4447 rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); 4448 WARN_ON(!rcu_gp_wq); 4449 rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); 4450 WARN_ON(!rcu_par_gp_wq); 4451 srcu_init(); 4452 4453 /* Fill in default value for rcutree.qovld boot parameter. */ 4454 /* -After- the rcu_node ->lock fields are initialized! */ 4455 if (qovld < 0) 4456 qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark; 4457 else 4458 qovld_calc = qovld; 4459 } 4460 4461 #include "tree_stall.h" 4462 #include "tree_exp.h" 4463 #include "tree_plugin.h" 4464