1 /* 2 * Read-Copy Update mechanism for mutual exclusion 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, you can access it online at 16 * http://www.gnu.org/licenses/gpl-2.0.html. 17 * 18 * Copyright IBM Corporation, 2008 19 * 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 21 * Manfred Spraul <manfred@colorfullife.com> 22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version 23 * 24 * Based on the original work by Paul McKenney <paulmck@us.ibm.com> 25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 
26 * 27 * For detailed explanation of Read-Copy Update mechanism see - 28 * Documentation/RCU 29 */ 30 #include <linux/types.h> 31 #include <linux/kernel.h> 32 #include <linux/init.h> 33 #include <linux/spinlock.h> 34 #include <linux/smp.h> 35 #include <linux/rcupdate.h> 36 #include <linux/interrupt.h> 37 #include <linux/sched.h> 38 #include <linux/nmi.h> 39 #include <linux/atomic.h> 40 #include <linux/bitops.h> 41 #include <linux/export.h> 42 #include <linux/completion.h> 43 #include <linux/moduleparam.h> 44 #include <linux/module.h> 45 #include <linux/percpu.h> 46 #include <linux/notifier.h> 47 #include <linux/cpu.h> 48 #include <linux/mutex.h> 49 #include <linux/time.h> 50 #include <linux/kernel_stat.h> 51 #include <linux/wait.h> 52 #include <linux/kthread.h> 53 #include <linux/prefetch.h> 54 #include <linux/delay.h> 55 #include <linux/stop_machine.h> 56 #include <linux/random.h> 57 #include <linux/ftrace_event.h> 58 #include <linux/suspend.h> 59 60 #include "tree.h" 61 #include "rcu.h" 62 63 MODULE_ALIAS("rcutree"); 64 #ifdef MODULE_PARAM_PREFIX 65 #undef MODULE_PARAM_PREFIX 66 #endif 67 #define MODULE_PARAM_PREFIX "rcutree." 68 69 /* Data structures. */ 70 71 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 72 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 73 74 /* 75 * In order to export the rcu_state name to the tracing tools, it 76 * needs to be added in the __tracepoint_string section. 77 * This requires defining a separate variable tp_<sname>_varname 78 * that points to the string being used, and this will allow 79 * the tracing userspace tools to be able to decipher the string 80 * address to the matching string. 
81 */ 82 #ifdef CONFIG_TRACING 83 # define DEFINE_RCU_TPS(sname) \ 84 static char sname##_varname[] = #sname; \ 85 static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; 86 # define RCU_STATE_NAME(sname) sname##_varname 87 #else 88 # define DEFINE_RCU_TPS(sname) 89 # define RCU_STATE_NAME(sname) __stringify(sname) 90 #endif 91 92 #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ 93 DEFINE_RCU_TPS(sname) \ 94 struct rcu_state sname##_state = { \ 95 .level = { &sname##_state.node[0] }, \ 96 .call = cr, \ 97 .fqs_state = RCU_GP_IDLE, \ 98 .gpnum = 0UL - 300UL, \ 99 .completed = 0UL - 300UL, \ 100 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ 101 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 102 .orphan_donetail = &sname##_state.orphan_donelist, \ 103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 104 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 105 .name = RCU_STATE_NAME(sname), \ 106 .abbr = sabbr, \ 107 }; \ 108 DEFINE_PER_CPU(struct rcu_data, sname##_data) 109 110 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); 111 RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 112 113 static struct rcu_state *rcu_state_p; 114 LIST_HEAD(rcu_struct_flavors); 115 116 /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ 117 static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; 118 module_param(rcu_fanout_leaf, int, 0444); 119 int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; 120 static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ 121 NUM_RCU_LVL_0, 122 NUM_RCU_LVL_1, 123 NUM_RCU_LVL_2, 124 NUM_RCU_LVL_3, 125 NUM_RCU_LVL_4, 126 }; 127 int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 128 129 /* 130 * The rcu_scheduler_active variable transitions from zero to one just 131 * before the first task is spawned. 
So when this variable is zero, RCU 132 * can assume that there is but one task, allowing RCU to (for example) 133 * optimize synchronize_sched() to a simple barrier(). When this variable 134 * is one, RCU must actually do all the hard work required to detect real 135 * grace periods. This variable is also used to suppress boot-time false 136 * positives from lockdep-RCU error checking. 137 */ 138 int rcu_scheduler_active __read_mostly; 139 EXPORT_SYMBOL_GPL(rcu_scheduler_active); 140 141 /* 142 * The rcu_scheduler_fully_active variable transitions from zero to one 143 * during the early_initcall() processing, which is after the scheduler 144 * is capable of creating new tasks. So RCU processing (for example, 145 * creating tasks for RCU priority boosting) must be delayed until after 146 * rcu_scheduler_fully_active transitions from zero to one. We also 147 * currently delay invocation of any RCU callbacks until after this point. 148 * 149 * It might later prove better for people registering RCU callbacks during 150 * early boot to take responsibility for these callbacks, but one step at 151 * a time. 152 */ 153 static int rcu_scheduler_fully_active __read_mostly; 154 155 #ifdef CONFIG_RCU_BOOST 156 157 /* 158 * Control variables for per-CPU and per-rcu_node kthreads. These 159 * handle all flavors of RCU. 160 */ 161 static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); 162 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 163 DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); 164 DEFINE_PER_CPU(char, rcu_cpu_has_work); 165 166 #endif /* #ifdef CONFIG_RCU_BOOST */ 167 168 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 169 static void invoke_rcu_core(void); 170 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 171 172 /* 173 * Track the rcutorture test sequence number and the update version 174 * number within a given test. 
The rcutorture_testseq is incremented 175 * on every rcutorture module load and unload, so has an odd value 176 * when a test is running. The rcutorture_vernum is set to zero 177 * when rcutorture starts and is incremented on each rcutorture update. 178 * These variables enable correlating rcutorture output with the 179 * RCU tracing information. 180 */ 181 unsigned long rcutorture_testseq; 182 unsigned long rcutorture_vernum; 183 184 /* 185 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 186 * permit this function to be invoked without holding the root rcu_node 187 * structure's ->lock, but of course results can be subject to change. 188 */ 189 static int rcu_gp_in_progress(struct rcu_state *rsp) 190 { 191 return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); 192 } 193 194 /* 195 * Note a quiescent state. Because we do not need to know 196 * how many quiescent states passed, just if there was at least 197 * one since the start of the grace period, this just sets a flag. 198 * The caller must have disabled preemption. 
199 */ 200 void rcu_sched_qs(void) 201 { 202 if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { 203 trace_rcu_grace_period(TPS("rcu_sched"), 204 __this_cpu_read(rcu_sched_data.gpnum), 205 TPS("cpuqs")); 206 __this_cpu_write(rcu_sched_data.passed_quiesce, 1); 207 } 208 } 209 210 void rcu_bh_qs(void) 211 { 212 if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { 213 trace_rcu_grace_period(TPS("rcu_bh"), 214 __this_cpu_read(rcu_bh_data.gpnum), 215 TPS("cpuqs")); 216 __this_cpu_write(rcu_bh_data.passed_quiesce, 1); 217 } 218 } 219 220 static DEFINE_PER_CPU(int, rcu_sched_qs_mask); 221 222 static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 223 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 224 .dynticks = ATOMIC_INIT(1), 225 #ifdef CONFIG_NO_HZ_FULL_SYSIDLE 226 .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, 227 .dynticks_idle = ATOMIC_INIT(1), 228 #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 229 }; 230 231 /* 232 * Let the RCU core know that this CPU has gone through the scheduler, 233 * which is a quiescent state. This is called when the need for a 234 * quiescent state is urgent, so we burn an atomic operation and full 235 * memory barriers to let the RCU core know about it, regardless of what 236 * this CPU might (or might not) do in the near future. 237 * 238 * We inform the RCU core by emulating a zero-duration dyntick-idle 239 * period, which we in turn do by incrementing the ->dynticks counter 240 * by two. 241 */ 242 static void rcu_momentary_dyntick_idle(void) 243 { 244 unsigned long flags; 245 struct rcu_data *rdp; 246 struct rcu_dynticks *rdtp; 247 int resched_mask; 248 struct rcu_state *rsp; 249 250 local_irq_save(flags); 251 252 /* 253 * Yes, we can lose flag-setting operations. This is OK, because 254 * the flag will be set again after some delay. 255 */ 256 resched_mask = raw_cpu_read(rcu_sched_qs_mask); 257 raw_cpu_write(rcu_sched_qs_mask, 0); 258 259 /* Find the flavor that needs a quiescent state. 
*/ 260 for_each_rcu_flavor(rsp) { 261 rdp = raw_cpu_ptr(rsp->rda); 262 if (!(resched_mask & rsp->flavor_mask)) 263 continue; 264 smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ 265 if (ACCESS_ONCE(rdp->mynode->completed) != 266 ACCESS_ONCE(rdp->cond_resched_completed)) 267 continue; 268 269 /* 270 * Pretend to be momentarily idle for the quiescent state. 271 * This allows the grace-period kthread to record the 272 * quiescent state, with no need for this CPU to do anything 273 * further. 274 */ 275 rdtp = this_cpu_ptr(&rcu_dynticks); 276 smp_mb__before_atomic(); /* Earlier stuff before QS. */ 277 atomic_add(2, &rdtp->dynticks); /* QS. */ 278 smp_mb__after_atomic(); /* Later stuff after QS. */ 279 break; 280 } 281 local_irq_restore(flags); 282 } 283 284 /* 285 * Note a context switch. This is a quiescent state for RCU-sched, 286 * and requires special handling for preemptible RCU. 287 * The caller must have disabled preemption. 288 */ 289 void rcu_note_context_switch(int cpu) 290 { 291 trace_rcu_utilization(TPS("Start context switch")); 292 rcu_sched_qs(); 293 rcu_preempt_note_context_switch(cpu); 294 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 295 rcu_momentary_dyntick_idle(); 296 trace_rcu_utilization(TPS("End context switch")); 297 } 298 EXPORT_SYMBOL_GPL(rcu_note_context_switch); 299 300 static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 301 static long qhimark = 10000; /* If this many pending, ignore blimit. */ 302 static long qlowmark = 100; /* Once only this many pending, use blimit. 
*/ 303 304 module_param(blimit, long, 0444); 305 module_param(qhimark, long, 0444); 306 module_param(qlowmark, long, 0444); 307 308 static ulong jiffies_till_first_fqs = ULONG_MAX; 309 static ulong jiffies_till_next_fqs = ULONG_MAX; 310 311 module_param(jiffies_till_first_fqs, ulong, 0644); 312 module_param(jiffies_till_next_fqs, ulong, 0644); 313 314 /* 315 * How long the grace period must be before we start recruiting 316 * quiescent-state help from rcu_note_context_switch(). 317 */ 318 static ulong jiffies_till_sched_qs = HZ / 20; 319 module_param(jiffies_till_sched_qs, ulong, 0644); 320 321 static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 322 struct rcu_data *rdp); 323 static void force_qs_rnp(struct rcu_state *rsp, 324 int (*f)(struct rcu_data *rsp, bool *isidle, 325 unsigned long *maxj), 326 bool *isidle, unsigned long *maxj); 327 static void force_quiescent_state(struct rcu_state *rsp); 328 static int rcu_pending(int cpu); 329 330 /* 331 * Return the number of RCU-sched batches processed thus far for debug & stats. 332 */ 333 long rcu_batches_completed_sched(void) 334 { 335 return rcu_sched_state.completed; 336 } 337 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); 338 339 /* 340 * Return the number of RCU BH batches processed thus far for debug & stats. 341 */ 342 long rcu_batches_completed_bh(void) 343 { 344 return rcu_bh_state.completed; 345 } 346 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 347 348 /* 349 * Force a quiescent state. 350 */ 351 void rcu_force_quiescent_state(void) 352 { 353 force_quiescent_state(rcu_state_p); 354 } 355 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 356 357 /* 358 * Force a quiescent state for RCU BH. 359 */ 360 void rcu_bh_force_quiescent_state(void) 361 { 362 force_quiescent_state(&rcu_bh_state); 363 } 364 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 365 366 /* 367 * Show the state of the grace-period kthreads. 
368 */ 369 void show_rcu_gp_kthreads(void) 370 { 371 struct rcu_state *rsp; 372 373 for_each_rcu_flavor(rsp) { 374 pr_info("%s: wait state: %d ->state: %#lx\n", 375 rsp->name, rsp->gp_state, rsp->gp_kthread->state); 376 /* sched_show_task(rsp->gp_kthread); */ 377 } 378 } 379 EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); 380 381 /* 382 * Record the number of times rcutorture tests have been initiated and 383 * terminated. This information allows the debugfs tracing stats to be 384 * correlated to the rcutorture messages, even when the rcutorture module 385 * is being repeatedly loaded and unloaded. In other words, we cannot 386 * store this state in rcutorture itself. 387 */ 388 void rcutorture_record_test_transition(void) 389 { 390 rcutorture_testseq++; 391 rcutorture_vernum = 0; 392 } 393 EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); 394 395 /* 396 * Send along grace-period-related data for rcutorture diagnostics. 397 */ 398 void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, 399 unsigned long *gpnum, unsigned long *completed) 400 { 401 struct rcu_state *rsp = NULL; 402 403 switch (test_type) { 404 case RCU_FLAVOR: 405 rsp = rcu_state_p; 406 break; 407 case RCU_BH_FLAVOR: 408 rsp = &rcu_bh_state; 409 break; 410 case RCU_SCHED_FLAVOR: 411 rsp = &rcu_sched_state; 412 break; 413 default: 414 break; 415 } 416 if (rsp != NULL) { 417 *flags = ACCESS_ONCE(rsp->gp_flags); 418 *gpnum = ACCESS_ONCE(rsp->gpnum); 419 *completed = ACCESS_ONCE(rsp->completed); 420 return; 421 } 422 *flags = 0; 423 *gpnum = 0; 424 *completed = 0; 425 } 426 EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); 427 428 /* 429 * Record the number of writer passes through the current rcutorture test. 430 * This is also used to correlate debugfs tracing stats with the rcutorture 431 * messages. 
432 */ 433 void rcutorture_record_progress(unsigned long vernum) 434 { 435 rcutorture_vernum++; 436 } 437 EXPORT_SYMBOL_GPL(rcutorture_record_progress); 438 439 /* 440 * Force a quiescent state for RCU-sched. 441 */ 442 void rcu_sched_force_quiescent_state(void) 443 { 444 force_quiescent_state(&rcu_sched_state); 445 } 446 EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); 447 448 /* 449 * Does the CPU have callbacks ready to be invoked? 450 */ 451 static int 452 cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) 453 { 454 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && 455 rdp->nxttail[RCU_DONE_TAIL] != NULL; 456 } 457 458 /* 459 * Return the root node of the specified rcu_state structure. 460 */ 461 static struct rcu_node *rcu_get_root(struct rcu_state *rsp) 462 { 463 return &rsp->node[0]; 464 } 465 466 /* 467 * Is there any need for future grace periods? 468 * Interrupts must be disabled. If the caller does not hold the root 469 * rnp_node structure's ->lock, the results are advisory only. 470 */ 471 static int rcu_future_needs_gp(struct rcu_state *rsp) 472 { 473 struct rcu_node *rnp = rcu_get_root(rsp); 474 int idx = (ACCESS_ONCE(rnp->completed) + 1) & 0x1; 475 int *fp = &rnp->need_future_gp[idx]; 476 477 return ACCESS_ONCE(*fp); 478 } 479 480 /* 481 * Does the current CPU require a not-yet-started grace period? 482 * The caller must have disabled interrupts to prevent races with 483 * normal callback registry. 484 */ 485 static int 486 cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 487 { 488 int i; 489 490 if (rcu_gp_in_progress(rsp)) 491 return 0; /* No, a grace period is already in progress. */ 492 if (rcu_future_needs_gp(rsp)) 493 return 1; /* Yes, a no-CBs CPU needs one. */ 494 if (!rdp->nxttail[RCU_NEXT_TAIL]) 495 return 0; /* No, this is a no-CBs (or offline) CPU. */ 496 if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) 497 return 1; /* Yes, this CPU has newly registered callbacks. 
*/ 498 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) 499 if (rdp->nxttail[i - 1] != rdp->nxttail[i] && 500 ULONG_CMP_LT(ACCESS_ONCE(rsp->completed), 501 rdp->nxtcompleted[i])) 502 return 1; /* Yes, CBs for future grace period. */ 503 return 0; /* No grace period needed. */ 504 } 505 506 /* 507 * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state 508 * 509 * If the new value of the ->dynticks_nesting counter now is zero, 510 * we really have entered idle, and must do the appropriate accounting. 511 * The caller must have disabled interrupts. 512 */ 513 static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 514 bool user) 515 { 516 struct rcu_state *rsp; 517 struct rcu_data *rdp; 518 519 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 520 if (!user && !is_idle_task(current)) { 521 struct task_struct *idle __maybe_unused = 522 idle_task(smp_processor_id()); 523 524 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); 525 ftrace_dump(DUMP_ORIG); 526 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 527 current->pid, current->comm, 528 idle->pid, idle->comm); /* must be idle task! */ 529 } 530 for_each_rcu_flavor(rsp) { 531 rdp = this_cpu_ptr(rsp->rda); 532 do_nocb_deferred_wakeup(rdp); 533 } 534 rcu_prepare_for_idle(smp_processor_id()); 535 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 536 smp_mb__before_atomic(); /* See above. */ 537 atomic_inc(&rdtp->dynticks); 538 smp_mb__after_atomic(); /* Force ordering with next sojourn. */ 539 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 540 rcu_dynticks_task_enter(); 541 542 /* 543 * It is illegal to enter an extended quiescent state while 544 * in an RCU read-side critical section. 
545 */ 546 rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), 547 "Illegal idle entry in RCU read-side critical section."); 548 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), 549 "Illegal idle entry in RCU-bh read-side critical section."); 550 rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), 551 "Illegal idle entry in RCU-sched read-side critical section."); 552 } 553 554 /* 555 * Enter an RCU extended quiescent state, which can be either the 556 * idle loop or adaptive-tickless usermode execution. 557 */ 558 static void rcu_eqs_enter(bool user) 559 { 560 long long oldval; 561 struct rcu_dynticks *rdtp; 562 563 rdtp = this_cpu_ptr(&rcu_dynticks); 564 oldval = rdtp->dynticks_nesting; 565 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 566 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { 567 rdtp->dynticks_nesting = 0; 568 rcu_eqs_enter_common(rdtp, oldval, user); 569 } else { 570 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 571 } 572 } 573 574 /** 575 * rcu_idle_enter - inform RCU that current CPU is entering idle 576 * 577 * Enter idle mode, in other words, -leave- the mode in which RCU 578 * read-side critical sections can occur. (Though RCU read-side 579 * critical sections can occur in irq handlers in idle, a possibility 580 * handled by irq_enter() and irq_exit().) 581 * 582 * We crowbar the ->dynticks_nesting field to zero to allow for 583 * the possibility of usermode upcalls having messed up our count 584 * of interrupt nesting level during the prior busy period. 585 */ 586 void rcu_idle_enter(void) 587 { 588 unsigned long flags; 589 590 local_irq_save(flags); 591 rcu_eqs_enter(false); 592 rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); 593 local_irq_restore(flags); 594 } 595 EXPORT_SYMBOL_GPL(rcu_idle_enter); 596 597 #ifdef CONFIG_RCU_USER_QS 598 /** 599 * rcu_user_enter - inform RCU that we are resuming userspace. 600 * 601 * Enter RCU idle mode right before resuming userspace. 
No use of RCU 602 * is permitted between this call and rcu_user_exit(). This way the 603 * CPU doesn't need to maintain the tick for RCU maintenance purposes 604 * when the CPU runs in userspace. 605 */ 606 void rcu_user_enter(void) 607 { 608 rcu_eqs_enter(1); 609 } 610 #endif /* CONFIG_RCU_USER_QS */ 611 612 /** 613 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 614 * 615 * Exit from an interrupt handler, which might possibly result in entering 616 * idle mode, in other words, leaving the mode in which read-side critical 617 * sections can occur. 618 * 619 * This code assumes that the idle loop never does anything that might 620 * result in unbalanced calls to irq_enter() and irq_exit(). If your 621 * architecture violates this assumption, RCU will give you what you 622 * deserve, good and hard. But very infrequently and irreproducibly. 623 * 624 * Use things like work queues to work around this limitation. 625 * 626 * You have been warned. 627 */ 628 void rcu_irq_exit(void) 629 { 630 unsigned long flags; 631 long long oldval; 632 struct rcu_dynticks *rdtp; 633 634 local_irq_save(flags); 635 rdtp = this_cpu_ptr(&rcu_dynticks); 636 oldval = rdtp->dynticks_nesting; 637 rdtp->dynticks_nesting--; 638 WARN_ON_ONCE(rdtp->dynticks_nesting < 0); 639 if (rdtp->dynticks_nesting) 640 trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); 641 else 642 rcu_eqs_enter_common(rdtp, oldval, true); 643 rcu_sysidle_enter(rdtp, 1); 644 local_irq_restore(flags); 645 } 646 647 /* 648 * rcu_eqs_exit_common - current CPU moving away from extended quiescent state 649 * 650 * If the new value of the ->dynticks_nesting counter was previously zero, 651 * we really have exited idle, and must do the appropriate accounting. 652 * The caller must have disabled interrupts. 
653 */ 654 static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, 655 int user) 656 { 657 rcu_dynticks_task_exit(); 658 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ 659 atomic_inc(&rdtp->dynticks); 660 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 661 smp_mb__after_atomic(); /* See above. */ 662 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 663 rcu_cleanup_after_idle(smp_processor_id()); 664 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); 665 if (!user && !is_idle_task(current)) { 666 struct task_struct *idle __maybe_unused = 667 idle_task(smp_processor_id()); 668 669 trace_rcu_dyntick(TPS("Error on exit: not idle task"), 670 oldval, rdtp->dynticks_nesting); 671 ftrace_dump(DUMP_ORIG); 672 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 673 current->pid, current->comm, 674 idle->pid, idle->comm); /* must be idle task! */ 675 } 676 } 677 678 /* 679 * Exit an RCU extended quiescent state, which can be either the 680 * idle loop or adaptive-tickless usermode execution. 681 */ 682 static void rcu_eqs_exit(bool user) 683 { 684 struct rcu_dynticks *rdtp; 685 long long oldval; 686 687 rdtp = this_cpu_ptr(&rcu_dynticks); 688 oldval = rdtp->dynticks_nesting; 689 WARN_ON_ONCE(oldval < 0); 690 if (oldval & DYNTICK_TASK_NEST_MASK) { 691 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; 692 } else { 693 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 694 rcu_eqs_exit_common(rdtp, oldval, user); 695 } 696 } 697 698 /** 699 * rcu_idle_exit - inform RCU that current CPU is leaving idle 700 * 701 * Exit idle mode, in other words, -enter- the mode in which RCU 702 * read-side critical sections can occur. 703 * 704 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to 705 * allow for the possibility of usermode upcalls messing up our count 706 * of interrupt nesting level during the busy period that is just 707 * now starting. 
708 */ 709 void rcu_idle_exit(void) 710 { 711 unsigned long flags; 712 713 local_irq_save(flags); 714 rcu_eqs_exit(false); 715 rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); 716 local_irq_restore(flags); 717 } 718 EXPORT_SYMBOL_GPL(rcu_idle_exit); 719 720 #ifdef CONFIG_RCU_USER_QS 721 /** 722 * rcu_user_exit - inform RCU that we are exiting userspace. 723 * 724 * Exit RCU idle mode while entering the kernel because it can 725 * run a RCU read side critical section anytime. 726 */ 727 void rcu_user_exit(void) 728 { 729 rcu_eqs_exit(1); 730 } 731 #endif /* CONFIG_RCU_USER_QS */ 732 733 /** 734 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle 735 * 736 * Enter an interrupt handler, which might possibly result in exiting 737 * idle mode, in other words, entering the mode in which read-side critical 738 * sections can occur. 739 * 740 * Note that the Linux kernel is fully capable of entering an interrupt 741 * handler that it never exits, for example when doing upcalls to 742 * user mode! This code assumes that the idle loop never does upcalls to 743 * user mode. If your architecture does do upcalls from the idle loop (or 744 * does anything else that results in unbalanced calls to the irq_enter() 745 * and irq_exit() functions), RCU will give you what you deserve, good 746 * and hard. But very infrequently and irreproducibly. 747 * 748 * Use things like work queues to work around this limitation. 749 * 750 * You have been warned. 
751 */ 752 void rcu_irq_enter(void) 753 { 754 unsigned long flags; 755 struct rcu_dynticks *rdtp; 756 long long oldval; 757 758 local_irq_save(flags); 759 rdtp = this_cpu_ptr(&rcu_dynticks); 760 oldval = rdtp->dynticks_nesting; 761 rdtp->dynticks_nesting++; 762 WARN_ON_ONCE(rdtp->dynticks_nesting == 0); 763 if (oldval) 764 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); 765 else 766 rcu_eqs_exit_common(rdtp, oldval, true); 767 rcu_sysidle_exit(rdtp, 1); 768 local_irq_restore(flags); 769 } 770 771 /** 772 * rcu_nmi_enter - inform RCU of entry to NMI context 773 * 774 * If the CPU was idle with dynamic ticks active, and there is no 775 * irq handler running, this updates rdtp->dynticks_nmi to let the 776 * RCU grace-period handling know that the CPU is active. 777 */ 778 void rcu_nmi_enter(void) 779 { 780 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 781 782 if (rdtp->dynticks_nmi_nesting == 0 && 783 (atomic_read(&rdtp->dynticks) & 0x1)) 784 return; 785 rdtp->dynticks_nmi_nesting++; 786 smp_mb__before_atomic(); /* Force delay from prior write. */ 787 atomic_inc(&rdtp->dynticks); 788 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 789 smp_mb__after_atomic(); /* See above. */ 790 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 791 } 792 793 /** 794 * rcu_nmi_exit - inform RCU of exit from NMI context 795 * 796 * If the CPU was idle with dynamic ticks active, and there is no 797 * irq handler running, this updates rdtp->dynticks_nmi to let the 798 * RCU grace-period handling know that the CPU is no longer active. 799 */ 800 void rcu_nmi_exit(void) 801 { 802 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 803 804 if (rdtp->dynticks_nmi_nesting == 0 || 805 --rdtp->dynticks_nmi_nesting != 0) 806 return; 807 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 808 smp_mb__before_atomic(); /* See above. */ 809 atomic_inc(&rdtp->dynticks); 810 smp_mb__after_atomic(); /* Force delay to next write. 
*/ 811 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 812 } 813 814 /** 815 * __rcu_is_watching - are RCU read-side critical sections safe? 816 * 817 * Return true if RCU is watching the running CPU, which means that 818 * this CPU can safely enter RCU read-side critical sections. Unlike 819 * rcu_is_watching(), the caller of __rcu_is_watching() must have at 820 * least disabled preemption. 821 */ 822 bool notrace __rcu_is_watching(void) 823 { 824 return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; 825 } 826 827 /** 828 * rcu_is_watching - see if RCU thinks that the current CPU is idle 829 * 830 * If the current CPU is in its idle loop and is neither in an interrupt 831 * or NMI handler, return true. 832 */ 833 bool notrace rcu_is_watching(void) 834 { 835 bool ret; 836 837 preempt_disable(); 838 ret = __rcu_is_watching(); 839 preempt_enable(); 840 return ret; 841 } 842 EXPORT_SYMBOL_GPL(rcu_is_watching); 843 844 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 845 846 /* 847 * Is the current CPU online? Disable preemption to avoid false positives 848 * that could otherwise happen due to the current CPU number being sampled, 849 * this task being preempted, its old CPU being taken offline, resuming 850 * on some other CPU, then determining that its old CPU is now offline. 851 * It is OK to use RCU on an offline processor during initial boot, hence 852 * the check for rcu_scheduler_fully_active. Note also that it is OK 853 * for a CPU coming online to use RCU for one jiffy prior to marking itself 854 * online in the cpu_online_mask. Similarly, it is OK for a CPU going 855 * offline to continue to use RCU for one jiffy after marking itself 856 * offline in the cpu_online_mask. This leniency is necessary given the 857 * non-atomic nature of the online and offline processing, for example, 858 * the fact that a CPU enters the scheduler after completing the CPU_DYING 859 * notifiers. 
860 * 861 * This is also why RCU internally marks CPUs online during the 862 * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. 863 * 864 * Disable checking if in an NMI handler because we cannot safely report 865 * errors from NMI handlers anyway. 866 */ 867 bool rcu_lockdep_current_cpu_online(void) 868 { 869 struct rcu_data *rdp; 870 struct rcu_node *rnp; 871 bool ret; 872 873 if (in_nmi()) 874 return true; 875 preempt_disable(); 876 rdp = this_cpu_ptr(&rcu_sched_data); 877 rnp = rdp->mynode; 878 ret = (rdp->grpmask & rnp->qsmaskinit) || 879 !rcu_scheduler_fully_active; 880 preempt_enable(); 881 return ret; 882 } 883 EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); 884 885 #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ 886 887 /** 888 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle 889 * 890 * If the current CPU is idle or running at a first-level (not nested) 891 * interrupt from idle, return true. The caller must have at least 892 * disabled preemption. 893 */ 894 static int rcu_is_cpu_rrupt_from_idle(void) 895 { 896 return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; 897 } 898 899 /* 900 * Snapshot the specified CPU's dynticks counter so that we can later 901 * credit them with an implicit quiescent state. Return 1 if this CPU 902 * is in dynticks idle mode, which is an extended quiescent state. 903 */ 904 static int dyntick_save_progress_counter(struct rcu_data *rdp, 905 bool *isidle, unsigned long *maxj) 906 { 907 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 908 rcu_sysidle_check_cpu(rdp, isidle, maxj); 909 if ((rdp->dynticks_snap & 0x1) == 0) { 910 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); 911 return 1; 912 } else { 913 return 0; 914 } 915 } 916 917 /* 918 * This function really isn't for public consumption, but RCU is special in 919 * that context switches can allow the state machine to make progress. 
920 */ 921 extern void resched_cpu(int cpu); 922 923 /* 924 * Return true if the specified CPU has passed through a quiescent 925 * state by virtue of being in or having passed through an dynticks 926 * idle state since the last call to dyntick_save_progress_counter() 927 * for this same CPU, or by virtue of having been offline. 928 */ 929 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, 930 bool *isidle, unsigned long *maxj) 931 { 932 unsigned int curr; 933 int *rcrmp; 934 unsigned int snap; 935 936 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); 937 snap = (unsigned int)rdp->dynticks_snap; 938 939 /* 940 * If the CPU passed through or entered a dynticks idle phase with 941 * no active irq/NMI handlers, then we can safely pretend that the CPU 942 * already acknowledged the request to pass through a quiescent 943 * state. Either way, that CPU cannot possibly be in an RCU 944 * read-side critical section that started before the beginning 945 * of the current RCU grace period. 946 */ 947 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { 948 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); 949 rdp->dynticks_fqs++; 950 return 1; 951 } 952 953 /* 954 * Check for the CPU being offline, but only if the grace period 955 * is old enough. We don't need to worry about the CPU changing 956 * state: If we see it offline even once, it has been through a 957 * quiescent state. 958 * 959 * The reason for insisting that the grace period be at least 960 * one jiffy old is that CPUs that are not quite online and that 961 * have just gone offline can still execute RCU read-side critical 962 * sections. 963 */ 964 if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) 965 return 0; /* Grace period is not old enough. 
*/ 966 barrier(); 967 if (cpu_is_offline(rdp->cpu)) { 968 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); 969 rdp->offline_fqs++; 970 return 1; 971 } 972 973 /* 974 * A CPU running for an extended time within the kernel can 975 * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, 976 * even context-switching back and forth between a pair of 977 * in-kernel CPU-bound tasks cannot advance grace periods. 978 * So if the grace period is old enough, make the CPU pay attention. 979 * Note that the unsynchronized assignments to the per-CPU 980 * rcu_sched_qs_mask variable are safe. Yes, setting of 981 * bits can be lost, but they will be set again on the next 982 * force-quiescent-state pass. So lost bit sets do not result 983 * in incorrect behavior, merely in a grace period lasting 984 * a few jiffies longer than it might otherwise. Because 985 * there are at most four threads involved, and because the 986 * updates are only once every few jiffies, the probability of 987 * lossage (and thus of slight grace-period extension) is 988 * quite low. 989 * 990 * Note that if the jiffies_till_sched_qs boot/sysfs parameter 991 * is set too high, we override with half of the RCU CPU stall 992 * warning delay. 993 */ 994 rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); 995 if (ULONG_CMP_GE(jiffies, 996 rdp->rsp->gp_start + jiffies_till_sched_qs) || 997 ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { 998 if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { 999 ACCESS_ONCE(rdp->cond_resched_completed) = 1000 ACCESS_ONCE(rdp->mynode->completed); 1001 smp_mb(); /* ->cond_resched_completed before *rcrmp. */ 1002 ACCESS_ONCE(*rcrmp) = 1003 ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask; 1004 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ 1005 rdp->rsp->jiffies_resched += 5; /* Enable beating. */ 1006 } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { 1007 /* Time to beat on that CPU again! 
*/ 1008 resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ 1009 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ 1010 } 1011 } 1012 1013 return 0; 1014 } 1015 1016 static void record_gp_stall_check_time(struct rcu_state *rsp) 1017 { 1018 unsigned long j = jiffies; 1019 unsigned long j1; 1020 1021 rsp->gp_start = j; 1022 smp_wmb(); /* Record start time before stall time. */ 1023 j1 = rcu_jiffies_till_stall_check(); 1024 ACCESS_ONCE(rsp->jiffies_stall) = j + j1; 1025 rsp->jiffies_resched = j + j1 / 2; 1026 } 1027 1028 /* 1029 * Dump stacks of all tasks running on stalled CPUs. 1030 */ 1031 static void rcu_dump_cpu_stacks(struct rcu_state *rsp) 1032 { 1033 int cpu; 1034 unsigned long flags; 1035 struct rcu_node *rnp; 1036 1037 rcu_for_each_leaf_node(rsp, rnp) { 1038 raw_spin_lock_irqsave(&rnp->lock, flags); 1039 if (rnp->qsmask != 0) { 1040 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 1041 if (rnp->qsmask & (1UL << cpu)) 1042 dump_cpu_task(rnp->grplo + cpu); 1043 } 1044 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1045 } 1046 } 1047 1048 static void print_other_cpu_stall(struct rcu_state *rsp) 1049 { 1050 int cpu; 1051 long delta; 1052 unsigned long flags; 1053 int ndetected = 0; 1054 struct rcu_node *rnp = rcu_get_root(rsp); 1055 long totqlen = 0; 1056 1057 /* Only let one CPU complain about others per time interval. */ 1058 1059 raw_spin_lock_irqsave(&rnp->lock, flags); 1060 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); 1061 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 1062 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1063 return; 1064 } 1065 ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; 1066 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1067 1068 /* 1069 * OK, time to rat on our buddy... 1070 * See Documentation/RCU/stallwarn.txt for info on how to debug 1071 * RCU CPU stall warnings. 
1072 */ 1073 pr_err("INFO: %s detected stalls on CPUs/tasks:", 1074 rsp->name); 1075 print_cpu_stall_info_begin(); 1076 rcu_for_each_leaf_node(rsp, rnp) { 1077 raw_spin_lock_irqsave(&rnp->lock, flags); 1078 ndetected += rcu_print_task_stall(rnp); 1079 if (rnp->qsmask != 0) { 1080 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 1081 if (rnp->qsmask & (1UL << cpu)) { 1082 print_cpu_stall_info(rsp, 1083 rnp->grplo + cpu); 1084 ndetected++; 1085 } 1086 } 1087 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1088 } 1089 1090 /* 1091 * Now rat on any tasks that got kicked up to the root rcu_node 1092 * due to CPU offlining. 1093 */ 1094 rnp = rcu_get_root(rsp); 1095 raw_spin_lock_irqsave(&rnp->lock, flags); 1096 ndetected += rcu_print_task_stall(rnp); 1097 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1098 1099 print_cpu_stall_info_end(); 1100 for_each_possible_cpu(cpu) 1101 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1102 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", 1103 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1104 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1105 if (ndetected == 0) 1106 pr_err("INFO: Stall ended before state dump start\n"); 1107 else 1108 rcu_dump_cpu_stacks(rsp); 1109 1110 /* Complain about tasks blocking the grace period. */ 1111 1112 rcu_print_detail_task_stall(rsp); 1113 1114 force_quiescent_state(rsp); /* Kick them all. */ 1115 } 1116 1117 static void print_cpu_stall(struct rcu_state *rsp) 1118 { 1119 int cpu; 1120 unsigned long flags; 1121 struct rcu_node *rnp = rcu_get_root(rsp); 1122 long totqlen = 0; 1123 1124 /* 1125 * OK, time to rat on ourselves... 1126 * See Documentation/RCU/stallwarn.txt for info on how to debug 1127 * RCU CPU stall warnings. 
1128 */ 1129 pr_err("INFO: %s self-detected stall on CPU", rsp->name); 1130 print_cpu_stall_info_begin(); 1131 print_cpu_stall_info(rsp, smp_processor_id()); 1132 print_cpu_stall_info_end(); 1133 for_each_possible_cpu(cpu) 1134 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1135 pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", 1136 jiffies - rsp->gp_start, 1137 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1138 rcu_dump_cpu_stacks(rsp); 1139 1140 raw_spin_lock_irqsave(&rnp->lock, flags); 1141 if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) 1142 ACCESS_ONCE(rsp->jiffies_stall) = jiffies + 1143 3 * rcu_jiffies_till_stall_check() + 3; 1144 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1145 1146 /* 1147 * Attempt to revive the RCU machinery by forcing a context switch. 1148 * 1149 * A context switch would normally allow the RCU state machine to make 1150 * progress and it could be we're stuck in kernel space without context 1151 * switches for an entirely unreasonable amount of time. 1152 */ 1153 resched_cpu(smp_processor_id()); 1154 } 1155 1156 static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 1157 { 1158 unsigned long completed; 1159 unsigned long gpnum; 1160 unsigned long gps; 1161 unsigned long j; 1162 unsigned long js; 1163 struct rcu_node *rnp; 1164 1165 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) 1166 return; 1167 j = jiffies; 1168 1169 /* 1170 * Lots of memory barriers to reject false positives. 1171 * 1172 * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, 1173 * then rsp->gp_start, and finally rsp->completed. These values 1174 * are updated in the opposite order with memory barriers (or 1175 * equivalent) during grace-period initialization and cleanup. 1176 * Now, a false positive can occur if we get an new value of 1177 * rsp->gp_start and a old value of rsp->jiffies_stall. 
But given 1178 * the memory barriers, the only way that this can happen is if one 1179 * grace period ends and another starts between these two fetches. 1180 * Detect this by comparing rsp->completed with the previous fetch 1181 * from rsp->gpnum. 1182 * 1183 * Given this check, comparisons of jiffies, rsp->jiffies_stall, 1184 * and rsp->gp_start suffice to forestall false positives. 1185 */ 1186 gpnum = ACCESS_ONCE(rsp->gpnum); 1187 smp_rmb(); /* Pick up ->gpnum first... */ 1188 js = ACCESS_ONCE(rsp->jiffies_stall); 1189 smp_rmb(); /* ...then ->jiffies_stall before the rest... */ 1190 gps = ACCESS_ONCE(rsp->gp_start); 1191 smp_rmb(); /* ...and finally ->gp_start before ->completed. */ 1192 completed = ACCESS_ONCE(rsp->completed); 1193 if (ULONG_CMP_GE(completed, gpnum) || 1194 ULONG_CMP_LT(j, js) || 1195 ULONG_CMP_GE(gps, js)) 1196 return; /* No stall or GP completed since entering function. */ 1197 rnp = rdp->mynode; 1198 if (rcu_gp_in_progress(rsp) && 1199 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) { 1200 1201 /* We haven't checked in, so go dump stack. */ 1202 print_cpu_stall(rsp); 1203 1204 } else if (rcu_gp_in_progress(rsp) && 1205 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { 1206 1207 /* They had a few time units to dump stack, so complain. */ 1208 print_other_cpu_stall(rsp); 1209 } 1210 } 1211 1212 /** 1213 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period 1214 * 1215 * Set the stall-warning timeout way off into the future, thus preventing 1216 * any RCU CPU stall-warning messages from appearing in the current set of 1217 * RCU grace periods. 1218 * 1219 * The caller must disable hard irqs. 1220 */ 1221 void rcu_cpu_stall_reset(void) 1222 { 1223 struct rcu_state *rsp; 1224 1225 for_each_rcu_flavor(rsp) 1226 ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2; 1227 } 1228 1229 /* 1230 * Initialize the specified rcu_data structure's callback list to empty. 
1231 */ 1232 static void init_callback_list(struct rcu_data *rdp) 1233 { 1234 int i; 1235 1236 if (init_nocb_callback_list(rdp)) 1237 return; 1238 rdp->nxtlist = NULL; 1239 for (i = 0; i < RCU_NEXT_SIZE; i++) 1240 rdp->nxttail[i] = &rdp->nxtlist; 1241 } 1242 1243 /* 1244 * Determine the value that ->completed will have at the end of the 1245 * next subsequent grace period. This is used to tag callbacks so that 1246 * a CPU can invoke callbacks in a timely fashion even if that CPU has 1247 * been dyntick-idle for an extended period with callbacks under the 1248 * influence of RCU_FAST_NO_HZ. 1249 * 1250 * The caller must hold rnp->lock with interrupts disabled. 1251 */ 1252 static unsigned long rcu_cbs_completed(struct rcu_state *rsp, 1253 struct rcu_node *rnp) 1254 { 1255 /* 1256 * If RCU is idle, we just wait for the next grace period. 1257 * But we can only be sure that RCU is idle if we are looking 1258 * at the root rcu_node structure -- otherwise, a new grace 1259 * period might have started, but just not yet gotten around 1260 * to initializing the current non-root rcu_node structure. 1261 */ 1262 if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) 1263 return rnp->completed + 1; 1264 1265 /* 1266 * Otherwise, wait for a possible partial grace period and 1267 * then the subsequent full grace period. 1268 */ 1269 return rnp->completed + 2; 1270 } 1271 1272 /* 1273 * Trace-event helper function for rcu_start_future_gp() and 1274 * rcu_nocb_wait_gp(). 1275 */ 1276 static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, 1277 unsigned long c, const char *s) 1278 { 1279 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, 1280 rnp->completed, c, rnp->level, 1281 rnp->grplo, rnp->grphi, s); 1282 } 1283 1284 /* 1285 * Start some future grace period, as needed to handle newly arrived 1286 * callbacks. The required future grace periods are recorded in each 1287 * rcu_node structure's ->need_future_gp field. 
Returns true if there 1288 * is reason to awaken the grace-period kthread. 1289 * 1290 * The caller must hold the specified rcu_node structure's ->lock. 1291 */ 1292 static bool __maybe_unused 1293 rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, 1294 unsigned long *c_out) 1295 { 1296 unsigned long c; 1297 int i; 1298 bool ret = false; 1299 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); 1300 1301 /* 1302 * Pick up grace-period number for new callbacks. If this 1303 * grace period is already marked as needed, return to the caller. 1304 */ 1305 c = rcu_cbs_completed(rdp->rsp, rnp); 1306 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); 1307 if (rnp->need_future_gp[c & 0x1]) { 1308 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); 1309 goto out; 1310 } 1311 1312 /* 1313 * If either this rcu_node structure or the root rcu_node structure 1314 * believe that a grace period is in progress, then we must wait 1315 * for the one following, which is in "c". Because our request 1316 * will be noticed at the end of the current grace period, we don't 1317 * need to explicitly start one. We only do the lockless check 1318 * of rnp_root's fields if the current rcu_node structure thinks 1319 * there is no grace period in flight, and because we hold rnp->lock, 1320 * the only possible change is when rnp_root's two fields are 1321 * equal, in which case rnp_root->gpnum might be concurrently 1322 * incremented. But that is OK, as it will just result in our 1323 * doing some extra useless work. 1324 */ 1325 if (rnp->gpnum != rnp->completed || 1326 ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) { 1327 rnp->need_future_gp[c & 0x1]++; 1328 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); 1329 goto out; 1330 } 1331 1332 /* 1333 * There might be no grace period in progress. If we don't already 1334 * hold it, acquire the root rcu_node structure's lock in order to 1335 * start one (if needed). 
1336 */ 1337 if (rnp != rnp_root) { 1338 raw_spin_lock(&rnp_root->lock); 1339 smp_mb__after_unlock_lock(); 1340 } 1341 1342 /* 1343 * Get a new grace-period number. If there really is no grace 1344 * period in progress, it will be smaller than the one we obtained 1345 * earlier. Adjust callbacks as needed. Note that even no-CBs 1346 * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. 1347 */ 1348 c = rcu_cbs_completed(rdp->rsp, rnp_root); 1349 for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) 1350 if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) 1351 rdp->nxtcompleted[i] = c; 1352 1353 /* 1354 * If the needed for the required grace period is already 1355 * recorded, trace and leave. 1356 */ 1357 if (rnp_root->need_future_gp[c & 0x1]) { 1358 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); 1359 goto unlock_out; 1360 } 1361 1362 /* Record the need for the future grace period. */ 1363 rnp_root->need_future_gp[c & 0x1]++; 1364 1365 /* If a grace period is not already in progress, start one. */ 1366 if (rnp_root->gpnum != rnp_root->completed) { 1367 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); 1368 } else { 1369 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); 1370 ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); 1371 } 1372 unlock_out: 1373 if (rnp != rnp_root) 1374 raw_spin_unlock(&rnp_root->lock); 1375 out: 1376 if (c_out != NULL) 1377 *c_out = c; 1378 return ret; 1379 } 1380 1381 /* 1382 * Clean up any old requests for the just-ended grace period. Also return 1383 * whether any additional grace periods have been requested. Also invoke 1384 * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads 1385 * waiting for this grace period to complete. 
1386 */ 1387 static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 1388 { 1389 int c = rnp->completed; 1390 int needmore; 1391 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1392 1393 rcu_nocb_gp_cleanup(rsp, rnp); 1394 rnp->need_future_gp[c & 0x1] = 0; 1395 needmore = rnp->need_future_gp[(c + 1) & 0x1]; 1396 trace_rcu_future_gp(rnp, rdp, c, 1397 needmore ? TPS("CleanupMore") : TPS("Cleanup")); 1398 return needmore; 1399 } 1400 1401 /* 1402 * Awaken the grace-period kthread for the specified flavor of RCU. 1403 * Don't do a self-awaken, and don't bother awakening when there is 1404 * nothing for the grace-period kthread to do (as in several CPUs 1405 * raced to awaken, and we lost), and finally don't try to awaken 1406 * a kthread that has not yet been created. 1407 */ 1408 static void rcu_gp_kthread_wake(struct rcu_state *rsp) 1409 { 1410 if (current == rsp->gp_kthread || 1411 !ACCESS_ONCE(rsp->gp_flags) || 1412 !rsp->gp_kthread) 1413 return; 1414 wake_up(&rsp->gp_wq); 1415 } 1416 1417 /* 1418 * If there is room, assign a ->completed number to any callbacks on 1419 * this CPU that have not already been assigned. Also accelerate any 1420 * callbacks that were previously assigned a ->completed number that has 1421 * since proven to be too conservative, which can happen if callbacks get 1422 * assigned a ->completed number while RCU is idle, but with reference to 1423 * a non-root rcu_node structure. This function is idempotent, so it does 1424 * not hurt to call it repeatedly. Returns an flag saying that we should 1425 * awaken the RCU grace-period kthread. 1426 * 1427 * The caller must hold rnp->lock with interrupts disabled. 1428 */ 1429 static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1430 struct rcu_data *rdp) 1431 { 1432 unsigned long c; 1433 int i; 1434 bool ret; 1435 1436 /* If the CPU has no callbacks, nothing to do. 
*/ 1437 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1438 return false; 1439 1440 /* 1441 * Starting from the sublist containing the callbacks most 1442 * recently assigned a ->completed number and working down, find the 1443 * first sublist that is not assignable to an upcoming grace period. 1444 * Such a sublist has something in it (first two tests) and has 1445 * a ->completed number assigned that will complete sooner than 1446 * the ->completed number for newly arrived callbacks (last test). 1447 * 1448 * The key point is that any later sublist can be assigned the 1449 * same ->completed number as the newly arrived callbacks, which 1450 * means that the callbacks in any of these later sublist can be 1451 * grouped into a single sublist, whether or not they have already 1452 * been assigned a ->completed number. 1453 */ 1454 c = rcu_cbs_completed(rsp, rnp); 1455 for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) 1456 if (rdp->nxttail[i] != rdp->nxttail[i - 1] && 1457 !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) 1458 break; 1459 1460 /* 1461 * If there are no sublist for unassigned callbacks, leave. 1462 * At the same time, advance "i" one sublist, so that "i" will 1463 * index into the sublist where all the remaining callbacks should 1464 * be grouped into. 1465 */ 1466 if (++i >= RCU_NEXT_TAIL) 1467 return false; 1468 1469 /* 1470 * Assign all subsequent callbacks' ->completed number to the next 1471 * full grace period and group them all in the sublist initially 1472 * indexed by "i". 1473 */ 1474 for (; i <= RCU_NEXT_TAIL; i++) { 1475 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; 1476 rdp->nxtcompleted[i] = c; 1477 } 1478 /* Record any needed additional grace periods. */ 1479 ret = rcu_start_future_gp(rnp, rdp, NULL); 1480 1481 /* Trace depending on how much we were able to accelerate. 
*/ 1482 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1483 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); 1484 else 1485 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); 1486 return ret; 1487 } 1488 1489 /* 1490 * Move any callbacks whose grace period has completed to the 1491 * RCU_DONE_TAIL sublist, then compact the remaining sublists and 1492 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL 1493 * sublist. This function is idempotent, so it does not hurt to 1494 * invoke it repeatedly. As long as it is not invoked -too- often... 1495 * Returns true if the RCU grace-period kthread needs to be awakened. 1496 * 1497 * The caller must hold rnp->lock with interrupts disabled. 1498 */ 1499 static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1500 struct rcu_data *rdp) 1501 { 1502 int i, j; 1503 1504 /* If the CPU has no callbacks, nothing to do. */ 1505 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1506 return false; 1507 1508 /* 1509 * Find all callbacks whose ->completed numbers indicate that they 1510 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. 1511 */ 1512 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { 1513 if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) 1514 break; 1515 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; 1516 } 1517 /* Clean up any sublist tail pointers that were misordered above. */ 1518 for (j = RCU_WAIT_TAIL; j < i; j++) 1519 rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; 1520 1521 /* Copy down callbacks to fill in empty sublists. */ 1522 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { 1523 if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) 1524 break; 1525 rdp->nxttail[j] = rdp->nxttail[i]; 1526 rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; 1527 } 1528 1529 /* Classify any remaining callbacks. 
*/ 1530 return rcu_accelerate_cbs(rsp, rnp, rdp); 1531 } 1532 1533 /* 1534 * Update CPU-local rcu_data state to record the beginnings and ends of 1535 * grace periods. The caller must hold the ->lock of the leaf rcu_node 1536 * structure corresponding to the current CPU, and must have irqs disabled. 1537 * Returns true if the grace-period kthread needs to be awakened. 1538 */ 1539 static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, 1540 struct rcu_data *rdp) 1541 { 1542 bool ret; 1543 1544 /* Handle the ends of any preceding grace periods first. */ 1545 if (rdp->completed == rnp->completed) { 1546 1547 /* No grace period end, so just accelerate recent callbacks. */ 1548 ret = rcu_accelerate_cbs(rsp, rnp, rdp); 1549 1550 } else { 1551 1552 /* Advance callbacks. */ 1553 ret = rcu_advance_cbs(rsp, rnp, rdp); 1554 1555 /* Remember that we saw this grace-period completion. */ 1556 rdp->completed = rnp->completed; 1557 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); 1558 } 1559 1560 if (rdp->gpnum != rnp->gpnum) { 1561 /* 1562 * If the current grace period is waiting for this CPU, 1563 * set up to detect a quiescent state, otherwise don't 1564 * go looking for one. 1565 */ 1566 rdp->gpnum = rnp->gpnum; 1567 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1568 rdp->passed_quiesce = 0; 1569 rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); 1570 zero_cpu_stall_ticks(rdp); 1571 } 1572 return ret; 1573 } 1574 1575 static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) 1576 { 1577 unsigned long flags; 1578 bool needwake; 1579 struct rcu_node *rnp; 1580 1581 local_irq_save(flags); 1582 rnp = rdp->mynode; 1583 if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && 1584 rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ 1585 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. 
*/ 1586 local_irq_restore(flags); 1587 return; 1588 } 1589 smp_mb__after_unlock_lock(); 1590 needwake = __note_gp_changes(rsp, rnp, rdp); 1591 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1592 if (needwake) 1593 rcu_gp_kthread_wake(rsp); 1594 } 1595 1596 /* 1597 * Initialize a new grace period. Return 0 if no grace period required. 1598 */ 1599 static int rcu_gp_init(struct rcu_state *rsp) 1600 { 1601 struct rcu_data *rdp; 1602 struct rcu_node *rnp = rcu_get_root(rsp); 1603 1604 rcu_bind_gp_kthread(); 1605 raw_spin_lock_irq(&rnp->lock); 1606 smp_mb__after_unlock_lock(); 1607 if (!ACCESS_ONCE(rsp->gp_flags)) { 1608 /* Spurious wakeup, tell caller to go back to sleep. */ 1609 raw_spin_unlock_irq(&rnp->lock); 1610 return 0; 1611 } 1612 ACCESS_ONCE(rsp->gp_flags) = 0; /* Clear all flags: New grace period. */ 1613 1614 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { 1615 /* 1616 * Grace period already in progress, don't start another. 1617 * Not supposed to be able to happen. 1618 */ 1619 raw_spin_unlock_irq(&rnp->lock); 1620 return 0; 1621 } 1622 1623 /* Advance to a new grace period and initialize state. */ 1624 record_gp_stall_check_time(rsp); 1625 /* Record GP times before starting GP, hence smp_store_release(). */ 1626 smp_store_release(&rsp->gpnum, rsp->gpnum + 1); 1627 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1628 raw_spin_unlock_irq(&rnp->lock); 1629 1630 /* Exclude any concurrent CPU-hotplug operations. */ 1631 mutex_lock(&rsp->onoff_mutex); 1632 smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ 1633 1634 /* 1635 * Set the quiescent-state-needed bits in all the rcu_node 1636 * structures for all currently online CPUs in breadth-first order, 1637 * starting from the root rcu_node structure, relying on the layout 1638 * of the tree within the rsp->node[] array. 
Note that other CPUs 1639 * will access only the leaves of the hierarchy, thus seeing that no 1640 * grace period is in progress, at least until the corresponding 1641 * leaf node has been initialized. In addition, we have excluded 1642 * CPU-hotplug operations. 1643 * 1644 * The grace period cannot complete until the initialization 1645 * process finishes, because this kthread handles both. 1646 */ 1647 rcu_for_each_node_breadth_first(rsp, rnp) { 1648 raw_spin_lock_irq(&rnp->lock); 1649 smp_mb__after_unlock_lock(); 1650 rdp = this_cpu_ptr(rsp->rda); 1651 rcu_preempt_check_blocked_tasks(rnp); 1652 rnp->qsmask = rnp->qsmaskinit; 1653 ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; 1654 WARN_ON_ONCE(rnp->completed != rsp->completed); 1655 ACCESS_ONCE(rnp->completed) = rsp->completed; 1656 if (rnp == rdp->mynode) 1657 (void)__note_gp_changes(rsp, rnp, rdp); 1658 rcu_preempt_boost_start_gp(rnp); 1659 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 1660 rnp->level, rnp->grplo, 1661 rnp->grphi, rnp->qsmask); 1662 raw_spin_unlock_irq(&rnp->lock); 1663 cond_resched_rcu_qs(); 1664 } 1665 1666 mutex_unlock(&rsp->onoff_mutex); 1667 return 1; 1668 } 1669 1670 /* 1671 * Do one round of quiescent-state forcing. 1672 */ 1673 static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1674 { 1675 int fqs_state = fqs_state_in; 1676 bool isidle = false; 1677 unsigned long maxj; 1678 struct rcu_node *rnp = rcu_get_root(rsp); 1679 1680 rsp->n_force_qs++; 1681 if (fqs_state == RCU_SAVE_DYNTICK) { 1682 /* Collect dyntick-idle snapshots. */ 1683 if (is_sysidle_rcu_state(rsp)) { 1684 isidle = true; 1685 maxj = jiffies - ULONG_MAX / 4; 1686 } 1687 force_qs_rnp(rsp, dyntick_save_progress_counter, 1688 &isidle, &maxj); 1689 rcu_sysidle_report_gp(rsp, isidle, maxj); 1690 fqs_state = RCU_FORCE_QS; 1691 } else { 1692 /* Handle dyntick-idle and offline CPUs. 
*/ 1693 isidle = false; 1694 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); 1695 } 1696 /* Clear flag to prevent immediate re-entry. */ 1697 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1698 raw_spin_lock_irq(&rnp->lock); 1699 smp_mb__after_unlock_lock(); 1700 ACCESS_ONCE(rsp->gp_flags) = 1701 ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS; 1702 raw_spin_unlock_irq(&rnp->lock); 1703 } 1704 return fqs_state; 1705 } 1706 1707 /* 1708 * Clean up after the old grace period. 1709 */ 1710 static void rcu_gp_cleanup(struct rcu_state *rsp) 1711 { 1712 unsigned long gp_duration; 1713 bool needgp = false; 1714 int nocb = 0; 1715 struct rcu_data *rdp; 1716 struct rcu_node *rnp = rcu_get_root(rsp); 1717 1718 raw_spin_lock_irq(&rnp->lock); 1719 smp_mb__after_unlock_lock(); 1720 gp_duration = jiffies - rsp->gp_start; 1721 if (gp_duration > rsp->gp_max) 1722 rsp->gp_max = gp_duration; 1723 1724 /* 1725 * We know the grace period is complete, but to everyone else 1726 * it appears to still be ongoing. But it is also the case 1727 * that to everyone else it looks like there is nothing that 1728 * they can do to advance the grace period. It is therefore 1729 * safe for us to drop the lock in order to mark the grace 1730 * period as completed in all of the rcu_node structures. 1731 */ 1732 raw_spin_unlock_irq(&rnp->lock); 1733 1734 /* 1735 * Propagate new ->completed value to rcu_node structures so 1736 * that other CPUs don't have to wait until the start of the next 1737 * grace period to process their callbacks. This also avoids 1738 * some nasty RCU grace-period initialization races by forcing 1739 * the end of the current grace period to be completely recorded in 1740 * all of the rcu_node structures before the beginning of the next 1741 * grace period is recorded in any of the rcu_node structures. 
1742 */ 1743 rcu_for_each_node_breadth_first(rsp, rnp) { 1744 raw_spin_lock_irq(&rnp->lock); 1745 smp_mb__after_unlock_lock(); 1746 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1747 rdp = this_cpu_ptr(rsp->rda); 1748 if (rnp == rdp->mynode) 1749 needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; 1750 /* smp_mb() provided by prior unlock-lock pair. */ 1751 nocb += rcu_future_gp_cleanup(rsp, rnp); 1752 raw_spin_unlock_irq(&rnp->lock); 1753 cond_resched_rcu_qs(); 1754 } 1755 rnp = rcu_get_root(rsp); 1756 raw_spin_lock_irq(&rnp->lock); 1757 smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ 1758 rcu_nocb_gp_set(rnp, nocb); 1759 1760 /* Declare grace period done. */ 1761 ACCESS_ONCE(rsp->completed) = rsp->gpnum; 1762 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); 1763 rsp->fqs_state = RCU_GP_IDLE; 1764 rdp = this_cpu_ptr(rsp->rda); 1765 /* Advance CBs to reduce false positives below. */ 1766 needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; 1767 if (needgp || cpu_needs_another_gp(rsp, rdp)) { 1768 ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; 1769 trace_rcu_grace_period(rsp->name, 1770 ACCESS_ONCE(rsp->gpnum), 1771 TPS("newreq")); 1772 } 1773 raw_spin_unlock_irq(&rnp->lock); 1774 } 1775 1776 /* 1777 * Body of kthread that handles grace periods. 1778 */ 1779 static int __noreturn rcu_gp_kthread(void *arg) 1780 { 1781 int fqs_state; 1782 int gf; 1783 unsigned long j; 1784 int ret; 1785 struct rcu_state *rsp = arg; 1786 struct rcu_node *rnp = rcu_get_root(rsp); 1787 1788 for (;;) { 1789 1790 /* Handle grace-period start. */ 1791 for (;;) { 1792 trace_rcu_grace_period(rsp->name, 1793 ACCESS_ONCE(rsp->gpnum), 1794 TPS("reqwait")); 1795 rsp->gp_state = RCU_GP_WAIT_GPS; 1796 wait_event_interruptible(rsp->gp_wq, 1797 ACCESS_ONCE(rsp->gp_flags) & 1798 RCU_GP_FLAG_INIT); 1799 /* Locking provides needed memory barrier. 
*/ 1800 if (rcu_gp_init(rsp)) 1801 break; 1802 cond_resched_rcu_qs(); 1803 WARN_ON(signal_pending(current)); 1804 trace_rcu_grace_period(rsp->name, 1805 ACCESS_ONCE(rsp->gpnum), 1806 TPS("reqwaitsig")); 1807 } 1808 1809 /* Handle quiescent-state forcing. */ 1810 fqs_state = RCU_SAVE_DYNTICK; 1811 j = jiffies_till_first_fqs; 1812 if (j > HZ) { 1813 j = HZ; 1814 jiffies_till_first_fqs = HZ; 1815 } 1816 ret = 0; 1817 for (;;) { 1818 if (!ret) 1819 rsp->jiffies_force_qs = jiffies + j; 1820 trace_rcu_grace_period(rsp->name, 1821 ACCESS_ONCE(rsp->gpnum), 1822 TPS("fqswait")); 1823 rsp->gp_state = RCU_GP_WAIT_FQS; 1824 ret = wait_event_interruptible_timeout(rsp->gp_wq, 1825 ((gf = ACCESS_ONCE(rsp->gp_flags)) & 1826 RCU_GP_FLAG_FQS) || 1827 (!ACCESS_ONCE(rnp->qsmask) && 1828 !rcu_preempt_blocked_readers_cgp(rnp)), 1829 j); 1830 /* Locking provides needed memory barriers. */ 1831 /* If grace period done, leave loop. */ 1832 if (!ACCESS_ONCE(rnp->qsmask) && 1833 !rcu_preempt_blocked_readers_cgp(rnp)) 1834 break; 1835 /* If time for quiescent-state forcing, do it. */ 1836 if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || 1837 (gf & RCU_GP_FLAG_FQS)) { 1838 trace_rcu_grace_period(rsp->name, 1839 ACCESS_ONCE(rsp->gpnum), 1840 TPS("fqsstart")); 1841 fqs_state = rcu_gp_fqs(rsp, fqs_state); 1842 trace_rcu_grace_period(rsp->name, 1843 ACCESS_ONCE(rsp->gpnum), 1844 TPS("fqsend")); 1845 cond_resched_rcu_qs(); 1846 } else { 1847 /* Deal with stray signal. */ 1848 cond_resched_rcu_qs(); 1849 WARN_ON(signal_pending(current)); 1850 trace_rcu_grace_period(rsp->name, 1851 ACCESS_ONCE(rsp->gpnum), 1852 TPS("fqswaitsig")); 1853 } 1854 j = jiffies_till_next_fqs; 1855 if (j > HZ) { 1856 j = HZ; 1857 jiffies_till_next_fqs = HZ; 1858 } else if (j < 1) { 1859 j = 1; 1860 jiffies_till_next_fqs = 1; 1861 } 1862 } 1863 1864 /* Handle grace-period end. 
*/ 1865 rcu_gp_cleanup(rsp); 1866 } 1867 } 1868 1869 /* 1870 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1871 * in preparation for detecting the next grace period. The caller must hold 1872 * the root node's ->lock and hard irqs must be disabled. 1873 * 1874 * Note that it is legal for a dying CPU (which is marked as offline) to 1875 * invoke this function. This can happen when the dying CPU reports its 1876 * quiescent state. 1877 * 1878 * Returns true if the grace-period kthread must be awakened. 1879 */ 1880 static bool 1881 rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 1882 struct rcu_data *rdp) 1883 { 1884 if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { 1885 /* 1886 * Either we have not yet spawned the grace-period 1887 * task, this CPU does not need another grace period, 1888 * or a grace period is already in progress. 1889 * Either way, don't start a new grace period. 1890 */ 1891 return false; 1892 } 1893 ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT; 1894 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), 1895 TPS("newreq")); 1896 1897 /* 1898 * We can't do wakeups while holding the rnp->lock, as that 1899 * could cause possible deadlocks with the rq->lock. Defer 1900 * the wakeup to our caller. 1901 */ 1902 return true; 1903 } 1904 1905 /* 1906 * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's 1907 * callbacks. Note that rcu_start_gp_advanced() cannot do this because it 1908 * is invoked indirectly from rcu_advance_cbs(), which would result in 1909 * endless recursion -- or would do so if it wasn't for the self-deadlock 1910 * that is encountered beforehand. 1911 * 1912 * Returns true if the grace-period kthread needs to be awakened. 
1913 */ 1914 static bool rcu_start_gp(struct rcu_state *rsp) 1915 { 1916 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1917 struct rcu_node *rnp = rcu_get_root(rsp); 1918 bool ret = false; 1919 1920 /* 1921 * If there is no grace period in progress right now, any 1922 * callbacks we have up to this point will be satisfied by the 1923 * next grace period. Also, advancing the callbacks reduces the 1924 * probability of false positives from cpu_needs_another_gp() 1925 * resulting in pointless grace periods. So, advance callbacks 1926 * then start the grace period! 1927 */ 1928 ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; 1929 ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; 1930 return ret; 1931 } 1932 1933 /* 1934 * Report a full set of quiescent states to the specified rcu_state 1935 * data structure. This involves cleaning up after the prior grace 1936 * period and letting rcu_start_gp() start up the next grace period 1937 * if one is needed. Note that the caller must hold rnp->lock, which 1938 * is released before return. 1939 */ 1940 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 1941 __releases(rcu_get_root(rsp)->lock) 1942 { 1943 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 1944 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 1945 rcu_gp_kthread_wake(rsp); 1946 } 1947 1948 /* 1949 * Similar to rcu_report_qs_rdp(), for which it is a helper function. 1950 * Allows quiescent states for a group of CPUs to be reported at one go 1951 * to the specified rcu_node structure, though all the CPUs in the group 1952 * must be represented by the same rcu_node structure (which need not be 1953 * a leaf rcu_node structure, though it often will be). That structure's 1954 * lock must be held upon entry, and it is released before return. 
1955 */ 1956 static void 1957 rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, 1958 struct rcu_node *rnp, unsigned long flags) 1959 __releases(rnp->lock) 1960 { 1961 struct rcu_node *rnp_c; 1962 1963 /* Walk up the rcu_node hierarchy. */ 1964 for (;;) { 1965 if (!(rnp->qsmask & mask)) { 1966 1967 /* Our bit has already been cleared, so done. */ 1968 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1969 return; 1970 } 1971 rnp->qsmask &= ~mask; 1972 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, 1973 mask, rnp->qsmask, rnp->level, 1974 rnp->grplo, rnp->grphi, 1975 !!rnp->gp_tasks); 1976 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 1977 1978 /* Other bits still set at this level, so done. */ 1979 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1980 return; 1981 } 1982 mask = rnp->grpmask; 1983 if (rnp->parent == NULL) { 1984 1985 /* No more levels. Exit loop holding root lock. */ 1986 1987 break; 1988 } 1989 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1990 rnp_c = rnp; 1991 rnp = rnp->parent; 1992 raw_spin_lock_irqsave(&rnp->lock, flags); 1993 smp_mb__after_unlock_lock(); 1994 WARN_ON_ONCE(rnp_c->qsmask); 1995 } 1996 1997 /* 1998 * Get here if we are the last CPU to pass through a quiescent 1999 * state for this grace period. Invoke rcu_report_qs_rsp() 2000 * to clean up and start the next grace period if one is needed. 2001 */ 2002 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ 2003 } 2004 2005 /* 2006 * Record a quiescent state for the specified CPU to that CPU's rcu_data 2007 * structure. This must be either called from the specified CPU, or 2008 * called when the specified CPU is known to be offline (and when it is 2009 * also known that no other CPU is concurrently trying to help the offline 2010 * CPU). The lastcomp argument is used to make sure we are still in the 2011 * grace period of interest. We don't want to end the current grace period 2012 * based on quiescent states detected in an earlier grace period! 
2013 */ 2014 static void 2015 rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) 2016 { 2017 unsigned long flags; 2018 unsigned long mask; 2019 bool needwake; 2020 struct rcu_node *rnp; 2021 2022 rnp = rdp->mynode; 2023 raw_spin_lock_irqsave(&rnp->lock, flags); 2024 smp_mb__after_unlock_lock(); 2025 if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || 2026 rnp->completed == rnp->gpnum) { 2027 2028 /* 2029 * The grace period in which this quiescent state was 2030 * recorded has ended, so don't report it upwards. 2031 * We will instead need a new quiescent state that lies 2032 * within the current grace period. 2033 */ 2034 rdp->passed_quiesce = 0; /* need qs for new gp. */ 2035 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2036 return; 2037 } 2038 mask = rdp->grpmask; 2039 if ((rnp->qsmask & mask) == 0) { 2040 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2041 } else { 2042 rdp->qs_pending = 0; 2043 2044 /* 2045 * This GP can't end until cpu checks in, so all of our 2046 * callbacks can be processed during the next GP. 2047 */ 2048 needwake = rcu_accelerate_cbs(rsp, rnp, rdp); 2049 2050 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 2051 if (needwake) 2052 rcu_gp_kthread_wake(rsp); 2053 } 2054 } 2055 2056 /* 2057 * Check to see if there is a new grace period of which this CPU 2058 * is not yet aware, and if so, set up local rcu_data state for it. 2059 * Otherwise, see if this CPU has just passed through its first 2060 * quiescent state for this grace period, and record that fact if so. 2061 */ 2062 static void 2063 rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) 2064 { 2065 /* Check for grace-period ends and beginnings. */ 2066 note_gp_changes(rsp, rdp); 2067 2068 /* 2069 * Does this CPU still need to do its part for current grace period? 2070 * If no, return and let the other CPUs do their part as well. 
2071 */ 2072 if (!rdp->qs_pending) 2073 return; 2074 2075 /* 2076 * Was there a quiescent state since the beginning of the grace 2077 * period? If no, then exit and wait for the next call. 2078 */ 2079 if (!rdp->passed_quiesce) 2080 return; 2081 2082 /* 2083 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 2084 * judge of that). 2085 */ 2086 rcu_report_qs_rdp(rdp->cpu, rsp, rdp); 2087 } 2088 2089 #ifdef CONFIG_HOTPLUG_CPU 2090 2091 /* 2092 * Send the specified CPU's RCU callbacks to the orphanage. The 2093 * specified CPU must be offline, and the caller must hold the 2094 * ->orphan_lock. 2095 */ 2096 static void 2097 rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 2098 struct rcu_node *rnp, struct rcu_data *rdp) 2099 { 2100 /* No-CBs CPUs do not have orphanable callbacks. */ 2101 if (rcu_is_nocb_cpu(rdp->cpu)) 2102 return; 2103 2104 /* 2105 * Orphan the callbacks. First adjust the counts. This is safe 2106 * because _rcu_barrier() excludes CPU-hotplug operations, so it 2107 * cannot be running now. Thus no memory barrier is required. 2108 */ 2109 if (rdp->nxtlist != NULL) { 2110 rsp->qlen_lazy += rdp->qlen_lazy; 2111 rsp->qlen += rdp->qlen; 2112 rdp->n_cbs_orphaned += rdp->qlen; 2113 rdp->qlen_lazy = 0; 2114 ACCESS_ONCE(rdp->qlen) = 0; 2115 } 2116 2117 /* 2118 * Next, move those callbacks still needing a grace period to 2119 * the orphanage, where some other CPU will pick them up. 2120 * Some of the callbacks might have gone partway through a grace 2121 * period, but that is too bad. They get to start over because we 2122 * cannot assume that grace periods are synchronized across CPUs. 2123 * We don't bother updating the ->nxttail[] array yet, instead 2124 * we just reset the whole thing later on. 
2125 */ 2126 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { 2127 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; 2128 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; 2129 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 2130 } 2131 2132 /* 2133 * Then move the ready-to-invoke callbacks to the orphanage, 2134 * where some other CPU will pick them up. These will not be 2135 * required to pass though another grace period: They are done. 2136 */ 2137 if (rdp->nxtlist != NULL) { 2138 *rsp->orphan_donetail = rdp->nxtlist; 2139 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; 2140 } 2141 2142 /* Finally, initialize the rcu_data structure's list to empty. */ 2143 init_callback_list(rdp); 2144 } 2145 2146 /* 2147 * Adopt the RCU callbacks from the specified rcu_state structure's 2148 * orphanage. The caller must hold the ->orphan_lock. 2149 */ 2150 static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) 2151 { 2152 int i; 2153 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 2154 2155 /* No-CBs CPUs are handled specially. */ 2156 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) 2157 return; 2158 2159 /* Do the accounting first. */ 2160 rdp->qlen_lazy += rsp->qlen_lazy; 2161 rdp->qlen += rsp->qlen; 2162 rdp->n_cbs_adopted += rsp->qlen; 2163 if (rsp->qlen_lazy != rsp->qlen) 2164 rcu_idle_count_callbacks_posted(); 2165 rsp->qlen_lazy = 0; 2166 rsp->qlen = 0; 2167 2168 /* 2169 * We do not need a memory barrier here because the only way we 2170 * can get here if there is an rcu_barrier() in flight is if 2171 * we are the task doing the rcu_barrier(). 2172 */ 2173 2174 /* First adopt the ready-to-invoke callbacks. 
*/ 2175 if (rsp->orphan_donelist != NULL) { 2176 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; 2177 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; 2178 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) 2179 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) 2180 rdp->nxttail[i] = rsp->orphan_donetail; 2181 rsp->orphan_donelist = NULL; 2182 rsp->orphan_donetail = &rsp->orphan_donelist; 2183 } 2184 2185 /* And then adopt the callbacks that still need a grace period. */ 2186 if (rsp->orphan_nxtlist != NULL) { 2187 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; 2188 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; 2189 rsp->orphan_nxtlist = NULL; 2190 rsp->orphan_nxttail = &rsp->orphan_nxtlist; 2191 } 2192 } 2193 2194 /* 2195 * Trace the fact that this CPU is going offline. 2196 */ 2197 static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 2198 { 2199 RCU_TRACE(unsigned long mask); 2200 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); 2201 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); 2202 2203 RCU_TRACE(mask = rdp->grpmask); 2204 trace_rcu_grace_period(rsp->name, 2205 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 2206 TPS("cpuofl")); 2207 } 2208 2209 /* 2210 * The CPU has been completely removed, and some other CPU is reporting 2211 * this fact from process context. Do the remainder of the cleanup, 2212 * including orphaning the outgoing CPU's RCU callbacks, and also 2213 * adopting them. There can only be one CPU hotplug operation at a time, 2214 * so no other CPU can be attempting to update rcu_cpu_kthread_task. 2215 */ 2216 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2217 { 2218 unsigned long flags; 2219 unsigned long mask; 2220 int need_report = 0; 2221 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2222 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 2223 2224 /* Adjust any no-longer-needed kthreads. 
*/ 2225 rcu_boost_kthread_setaffinity(rnp, -1); 2226 2227 /* Exclude any attempts to start a new grace period. */ 2228 mutex_lock(&rsp->onoff_mutex); 2229 raw_spin_lock_irqsave(&rsp->orphan_lock, flags); 2230 2231 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2232 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2233 rcu_adopt_orphan_cbs(rsp, flags); 2234 2235 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 2236 mask = rdp->grpmask; /* rnp->grplo is constant. */ 2237 do { 2238 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2239 smp_mb__after_unlock_lock(); 2240 rnp->qsmaskinit &= ~mask; 2241 if (rnp->qsmaskinit != 0) { 2242 if (rnp != rdp->mynode) 2243 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2244 break; 2245 } 2246 if (rnp == rdp->mynode) 2247 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 2248 else 2249 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2250 mask = rnp->grpmask; 2251 rnp = rnp->parent; 2252 } while (rnp != NULL); 2253 2254 /* 2255 * We still hold the leaf rcu_node structure lock here, and 2256 * irqs are still disabled. The reason for this subterfuge is 2257 * because invoking rcu_report_unblock_qs_rnp() with ->orphan_lock 2258 * held leads to deadlock. 2259 */ 2260 raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ 2261 rnp = rdp->mynode; 2262 if (need_report & RCU_OFL_TASKS_NORM_GP) 2263 rcu_report_unblock_qs_rnp(rnp, flags); 2264 else 2265 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2266 if (need_report & RCU_OFL_TASKS_EXP_GP) 2267 rcu_report_exp_rnp(rsp, rnp, true); 2268 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2269 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2270 cpu, rdp->qlen, rdp->nxtlist); 2271 init_callback_list(rdp); 2272 /* Disallow further callbacks on this CPU. 
*/ 2273 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2274 mutex_unlock(&rsp->onoff_mutex); 2275 } 2276 2277 #else /* #ifdef CONFIG_HOTPLUG_CPU */ 2278 2279 static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 2280 { 2281 } 2282 2283 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2284 { 2285 } 2286 2287 #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ 2288 2289 /* 2290 * Invoke any RCU callbacks that have made it to the end of their grace 2291 * period. Thottle as specified by rdp->blimit. 2292 */ 2293 static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) 2294 { 2295 unsigned long flags; 2296 struct rcu_head *next, *list, **tail; 2297 long bl, count, count_lazy; 2298 int i; 2299 2300 /* If no callbacks are ready, just return. */ 2301 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 2302 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); 2303 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), 2304 need_resched(), is_idle_task(current), 2305 rcu_is_callbacks_kthread()); 2306 return; 2307 } 2308 2309 /* 2310 * Extract the list of ready callbacks, disabling to prevent 2311 * races with call_rcu() from interrupt handlers. 2312 */ 2313 local_irq_save(flags); 2314 WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); 2315 bl = rdp->blimit; 2316 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); 2317 list = rdp->nxtlist; 2318 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 2319 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 2320 tail = rdp->nxttail[RCU_DONE_TAIL]; 2321 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) 2322 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) 2323 rdp->nxttail[i] = &rdp->nxtlist; 2324 local_irq_restore(flags); 2325 2326 /* Invoke callbacks. 
*/ 2327 count = count_lazy = 0; 2328 while (list) { 2329 next = list->next; 2330 prefetch(next); 2331 debug_rcu_head_unqueue(list); 2332 if (__rcu_reclaim(rsp->name, list)) 2333 count_lazy++; 2334 list = next; 2335 /* Stop only if limit reached and CPU has something to do. */ 2336 if (++count >= bl && 2337 (need_resched() || 2338 (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) 2339 break; 2340 } 2341 2342 local_irq_save(flags); 2343 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), 2344 is_idle_task(current), 2345 rcu_is_callbacks_kthread()); 2346 2347 /* Update count, and requeue any remaining callbacks. */ 2348 if (list != NULL) { 2349 *tail = rdp->nxtlist; 2350 rdp->nxtlist = list; 2351 for (i = 0; i < RCU_NEXT_SIZE; i++) 2352 if (&rdp->nxtlist == rdp->nxttail[i]) 2353 rdp->nxttail[i] = tail; 2354 else 2355 break; 2356 } 2357 smp_mb(); /* List handling before counting for rcu_barrier(). */ 2358 rdp->qlen_lazy -= count_lazy; 2359 ACCESS_ONCE(rdp->qlen) = rdp->qlen - count; 2360 rdp->n_cbs_invoked += count; 2361 2362 /* Reinstate batch limit if we have worked down the excess. */ 2363 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 2364 rdp->blimit = blimit; 2365 2366 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ 2367 if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { 2368 rdp->qlen_last_fqs_check = 0; 2369 rdp->n_force_qs_snap = rsp->n_force_qs; 2370 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 2371 rdp->qlen_last_fqs_check = rdp->qlen; 2372 WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); 2373 2374 local_irq_restore(flags); 2375 2376 /* Re-invoke RCU core processing if there are callbacks remaining. */ 2377 if (cpu_has_callbacks_ready_to_invoke(rdp)) 2378 invoke_rcu_core(); 2379 } 2380 2381 /* 2382 * Check to see if this CPU is in a non-context-switch quiescent state 2383 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 2384 * Also schedule RCU core processing. 
2385 * 2386 * This function must be called from hardirq context. It is normally 2387 * invoked from the scheduling-clock interrupt. If rcu_pending returns 2388 * false, there is no point in invoking rcu_check_callbacks(). 2389 */ 2390 void rcu_check_callbacks(int cpu, int user) 2391 { 2392 trace_rcu_utilization(TPS("Start scheduler-tick")); 2393 increment_cpu_stall_ticks(); 2394 if (user || rcu_is_cpu_rrupt_from_idle()) { 2395 2396 /* 2397 * Get here if this CPU took its interrupt from user 2398 * mode or from the idle loop, and if this is not a 2399 * nested interrupt. In this case, the CPU is in 2400 * a quiescent state, so note it. 2401 * 2402 * No memory barrier is required here because both 2403 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local 2404 * variables that other CPUs neither access nor modify, 2405 * at least not while the corresponding CPU is online. 2406 */ 2407 2408 rcu_sched_qs(); 2409 rcu_bh_qs(); 2410 2411 } else if (!in_softirq()) { 2412 2413 /* 2414 * Get here if this CPU did not take its interrupt from 2415 * softirq, in other words, if it is not interrupting 2416 * a rcu_bh read-side critical section. This is an _bh 2417 * critical section, so note it. 2418 */ 2419 2420 rcu_bh_qs(); 2421 } 2422 rcu_preempt_check_callbacks(cpu); 2423 if (rcu_pending(cpu)) 2424 invoke_rcu_core(); 2425 if (user) 2426 rcu_note_voluntary_context_switch(current); 2427 trace_rcu_utilization(TPS("End scheduler-tick")); 2428 } 2429 2430 /* 2431 * Scan the leaf rcu_node structures, processing dyntick state for any that 2432 * have not yet encountered a quiescent state, using the function specified. 2433 * Also initiate boosting for any threads blocked on the root rcu_node. 2434 * 2435 * The caller must have suppressed start of new grace periods. 
2436 */ 2437 static void force_qs_rnp(struct rcu_state *rsp, 2438 int (*f)(struct rcu_data *rsp, bool *isidle, 2439 unsigned long *maxj), 2440 bool *isidle, unsigned long *maxj) 2441 { 2442 unsigned long bit; 2443 int cpu; 2444 unsigned long flags; 2445 unsigned long mask; 2446 struct rcu_node *rnp; 2447 2448 rcu_for_each_leaf_node(rsp, rnp) { 2449 cond_resched_rcu_qs(); 2450 mask = 0; 2451 raw_spin_lock_irqsave(&rnp->lock, flags); 2452 smp_mb__after_unlock_lock(); 2453 if (!rcu_gp_in_progress(rsp)) { 2454 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2455 return; 2456 } 2457 if (rnp->qsmask == 0) { 2458 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 2459 continue; 2460 } 2461 cpu = rnp->grplo; 2462 bit = 1; 2463 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2464 if ((rnp->qsmask & bit) != 0) { 2465 if ((rnp->qsmaskinit & bit) != 0) 2466 *isidle = false; 2467 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) 2468 mask |= bit; 2469 } 2470 } 2471 if (mask != 0) { 2472 2473 /* rcu_report_qs_rnp() releases rnp->lock. */ 2474 rcu_report_qs_rnp(mask, rsp, rnp, flags); 2475 continue; 2476 } 2477 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2478 } 2479 rnp = rcu_get_root(rsp); 2480 if (rnp->qsmask == 0) { 2481 raw_spin_lock_irqsave(&rnp->lock, flags); 2482 smp_mb__after_unlock_lock(); 2483 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ 2484 } 2485 } 2486 2487 /* 2488 * Force quiescent states on reluctant CPUs, and also detect which 2489 * CPUs are in dyntick-idle mode. 2490 */ 2491 static void force_quiescent_state(struct rcu_state *rsp) 2492 { 2493 unsigned long flags; 2494 bool ret; 2495 struct rcu_node *rnp; 2496 struct rcu_node *rnp_old = NULL; 2497 2498 /* Funnel through hierarchy to reduce memory contention. 
*/ 2499 rnp = __this_cpu_read(rsp->rda->mynode); 2500 for (; rnp != NULL; rnp = rnp->parent) { 2501 ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || 2502 !raw_spin_trylock(&rnp->fqslock); 2503 if (rnp_old != NULL) 2504 raw_spin_unlock(&rnp_old->fqslock); 2505 if (ret) { 2506 rsp->n_force_qs_lh++; 2507 return; 2508 } 2509 rnp_old = rnp; 2510 } 2511 /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ 2512 2513 /* Reached the root of the rcu_node tree, acquire lock. */ 2514 raw_spin_lock_irqsave(&rnp_old->lock, flags); 2515 smp_mb__after_unlock_lock(); 2516 raw_spin_unlock(&rnp_old->fqslock); 2517 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2518 rsp->n_force_qs_lh++; 2519 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2520 return; /* Someone beat us to it. */ 2521 } 2522 ACCESS_ONCE(rsp->gp_flags) = 2523 ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS; 2524 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2525 rcu_gp_kthread_wake(rsp); 2526 } 2527 2528 /* 2529 * This does the RCU core processing work for the specified rcu_state 2530 * and rcu_data structures. This may be called only from the CPU to 2531 * whom the rdp belongs. 2532 */ 2533 static void 2534 __rcu_process_callbacks(struct rcu_state *rsp) 2535 { 2536 unsigned long flags; 2537 bool needwake; 2538 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 2539 2540 WARN_ON_ONCE(rdp->beenonline == 0); 2541 2542 /* Update RCU state based on any recent quiescent states. */ 2543 rcu_check_quiescent_state(rsp, rdp); 2544 2545 /* Does this CPU require a not-yet-started grace period? */ 2546 local_irq_save(flags); 2547 if (cpu_needs_another_gp(rsp, rdp)) { 2548 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ 2549 needwake = rcu_start_gp(rsp); 2550 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 2551 if (needwake) 2552 rcu_gp_kthread_wake(rsp); 2553 } else { 2554 local_irq_restore(flags); 2555 } 2556 2557 /* If there are callbacks ready, invoke them. 
*/ 2558 if (cpu_has_callbacks_ready_to_invoke(rdp)) 2559 invoke_rcu_callbacks(rsp, rdp); 2560 2561 /* Do any needed deferred wakeups of rcuo kthreads. */ 2562 do_nocb_deferred_wakeup(rdp); 2563 } 2564 2565 /* 2566 * Do RCU core processing for the current CPU. 2567 */ 2568 static void rcu_process_callbacks(struct softirq_action *unused) 2569 { 2570 struct rcu_state *rsp; 2571 2572 if (cpu_is_offline(smp_processor_id())) 2573 return; 2574 trace_rcu_utilization(TPS("Start RCU core")); 2575 for_each_rcu_flavor(rsp) 2576 __rcu_process_callbacks(rsp); 2577 trace_rcu_utilization(TPS("End RCU core")); 2578 } 2579 2580 /* 2581 * Schedule RCU callback invocation. If the specified type of RCU 2582 * does not support RCU priority boosting, just do a direct call, 2583 * otherwise wake up the per-CPU kernel kthread. Note that because we 2584 * are running on the current CPU with interrupts disabled, the 2585 * rcu_cpu_kthread_task cannot disappear out from under us. 2586 */ 2587 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 2588 { 2589 if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) 2590 return; 2591 if (likely(!rsp->boost)) { 2592 rcu_do_batch(rsp, rdp); 2593 return; 2594 } 2595 invoke_rcu_callbacks_kthread(); 2596 } 2597 2598 static void invoke_rcu_core(void) 2599 { 2600 if (cpu_online(smp_processor_id())) 2601 raise_softirq(RCU_SOFTIRQ); 2602 } 2603 2604 /* 2605 * Handle any core-RCU processing required by a call_rcu() invocation. 2606 */ 2607 static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, 2608 struct rcu_head *head, unsigned long flags) 2609 { 2610 bool needwake; 2611 2612 /* 2613 * If called from an extended quiescent state, invoke the RCU 2614 * core in order to force a re-evaluation of RCU's idleness. 2615 */ 2616 if (!rcu_is_watching() && cpu_online(smp_processor_id())) 2617 invoke_rcu_core(); 2618 2619 /* If interrupts were disabled or CPU offline, don't invoke RCU core. 
*/ 2620 if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) 2621 return; 2622 2623 /* 2624 * Force the grace period if too many callbacks or too long waiting. 2625 * Enforce hysteresis, and don't invoke force_quiescent_state() 2626 * if some other CPU has recently done so. Also, don't bother 2627 * invoking force_quiescent_state() if the newly enqueued callback 2628 * is the only one waiting for a grace period to complete. 2629 */ 2630 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 2631 2632 /* Are we ignoring a completed grace period? */ 2633 note_gp_changes(rsp, rdp); 2634 2635 /* Start a new grace period if one not already started. */ 2636 if (!rcu_gp_in_progress(rsp)) { 2637 struct rcu_node *rnp_root = rcu_get_root(rsp); 2638 2639 raw_spin_lock(&rnp_root->lock); 2640 smp_mb__after_unlock_lock(); 2641 needwake = rcu_start_gp(rsp); 2642 raw_spin_unlock(&rnp_root->lock); 2643 if (needwake) 2644 rcu_gp_kthread_wake(rsp); 2645 } else { 2646 /* Give the grace period a kick. */ 2647 rdp->blimit = LONG_MAX; 2648 if (rsp->n_force_qs == rdp->n_force_qs_snap && 2649 *rdp->nxttail[RCU_DONE_TAIL] != head) 2650 force_quiescent_state(rsp); 2651 rdp->n_force_qs_snap = rsp->n_force_qs; 2652 rdp->qlen_last_fqs_check = rdp->qlen; 2653 } 2654 } 2655 } 2656 2657 /* 2658 * RCU callback function to leak a callback. 2659 */ 2660 static void rcu_leak_callback(struct rcu_head *rhp) 2661 { 2662 } 2663 2664 /* 2665 * Helper function for call_rcu() and friends. The cpu argument will 2666 * normally be -1, indicating "currently running CPU". It may specify 2667 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() 2668 * is expected to specify a CPU. 2669 */ 2670 static void 2671 __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 2672 struct rcu_state *rsp, int cpu, bool lazy) 2673 { 2674 unsigned long flags; 2675 struct rcu_data *rdp; 2676 2677 WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! 
*/ 2678 if (debug_rcu_head_queue(head)) { 2679 /* Probable double call_rcu(), so leak the callback. */ 2680 ACCESS_ONCE(head->func) = rcu_leak_callback; 2681 WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); 2682 return; 2683 } 2684 head->func = func; 2685 head->next = NULL; 2686 2687 /* 2688 * Opportunistically note grace-period endings and beginnings. 2689 * Note that we might see a beginning right after we see an 2690 * end, but never vice versa, since this CPU has to pass through 2691 * a quiescent state betweentimes. 2692 */ 2693 local_irq_save(flags); 2694 rdp = this_cpu_ptr(rsp->rda); 2695 2696 /* Add the callback to our list. */ 2697 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { 2698 int offline; 2699 2700 if (cpu != -1) 2701 rdp = per_cpu_ptr(rsp->rda, cpu); 2702 offline = !__call_rcu_nocb(rdp, head, lazy, flags); 2703 WARN_ON_ONCE(offline); 2704 /* _call_rcu() is illegal on offline CPU; leak the callback. */ 2705 local_irq_restore(flags); 2706 return; 2707 } 2708 ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; 2709 if (lazy) 2710 rdp->qlen_lazy++; 2711 else 2712 rcu_idle_count_callbacks_posted(); 2713 smp_mb(); /* Count before adding callback for rcu_barrier(). */ 2714 *rdp->nxttail[RCU_NEXT_TAIL] = head; 2715 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 2716 2717 if (__is_kfree_rcu_offset((unsigned long)func)) 2718 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 2719 rdp->qlen_lazy, rdp->qlen); 2720 else 2721 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); 2722 2723 /* Go handle any RCU core processing required. */ 2724 __call_rcu_core(rsp, rdp, head, flags); 2725 local_irq_restore(flags); 2726 } 2727 2728 /* 2729 * Queue an RCU-sched callback for invocation after a grace period. 
2730 */ 2731 void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2732 { 2733 __call_rcu(head, func, &rcu_sched_state, -1, 0); 2734 } 2735 EXPORT_SYMBOL_GPL(call_rcu_sched); 2736 2737 /* 2738 * Queue an RCU callback for invocation after a quicker grace period. 2739 */ 2740 void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 2741 { 2742 __call_rcu(head, func, &rcu_bh_state, -1, 0); 2743 } 2744 EXPORT_SYMBOL_GPL(call_rcu_bh); 2745 2746 /* 2747 * Queue an RCU callback for lazy invocation after a grace period. 2748 * This will likely be later named something like "call_rcu_lazy()", 2749 * but this change will require some way of tagging the lazy RCU 2750 * callbacks in the list of pending callbacks. Until then, this 2751 * function may only be called from __kfree_rcu(). 2752 */ 2753 void kfree_call_rcu(struct rcu_head *head, 2754 void (*func)(struct rcu_head *rcu)) 2755 { 2756 __call_rcu(head, func, rcu_state_p, -1, 1); 2757 } 2758 EXPORT_SYMBOL_GPL(kfree_call_rcu); 2759 2760 /* 2761 * Because a context switch is a grace period for RCU-sched and RCU-bh, 2762 * any blocking grace-period wait automatically implies a grace period 2763 * if there is only one CPU online at any point time during execution 2764 * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to 2765 * occasionally incorrectly indicate that there are multiple CPUs online 2766 * when there was in fact only one the whole time, as this just adds 2767 * some overhead: RCU still operates correctly. 2768 */ 2769 static inline int rcu_blocking_is_gp(void) 2770 { 2771 int ret; 2772 2773 might_sleep(); /* Check for RCU read-side critical section. */ 2774 preempt_disable(); 2775 ret = num_online_cpus() <= 1; 2776 preempt_enable(); 2777 return ret; 2778 } 2779 2780 /** 2781 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 
2782 * 2783 * Control will return to the caller some time after a full rcu-sched 2784 * grace period has elapsed, in other words after all currently executing 2785 * rcu-sched read-side critical sections have completed. These read-side 2786 * critical sections are delimited by rcu_read_lock_sched() and 2787 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), 2788 * local_irq_disable(), and so on may be used in place of 2789 * rcu_read_lock_sched(). 2790 * 2791 * This means that all preempt_disable code sequences, including NMI and 2792 * non-threaded hardware-interrupt handlers, in progress on entry will 2793 * have completed before this primitive returns. However, this does not 2794 * guarantee that softirq handlers will have completed, since in some 2795 * kernels, these handlers can run in process context, and can block. 2796 * 2797 * Note that this guarantee implies further memory-ordering guarantees. 2798 * On systems with more than one CPU, when synchronize_sched() returns, 2799 * each CPU is guaranteed to have executed a full memory barrier since the 2800 * end of its last RCU-sched read-side critical section whose beginning 2801 * preceded the call to synchronize_sched(). In addition, each CPU having 2802 * an RCU read-side critical section that extends beyond the return from 2803 * synchronize_sched() is guaranteed to have executed a full memory barrier 2804 * after the beginning of synchronize_sched() and before the beginning of 2805 * that RCU read-side critical section. Note that these guarantees include 2806 * CPUs that are offline, idle, or executing in user mode, as well as CPUs 2807 * that are executing in the kernel. 
2808 * 2809 * Furthermore, if CPU A invoked synchronize_sched(), which returned 2810 * to its caller on CPU B, then both CPU A and CPU B are guaranteed 2811 * to have executed a full memory barrier during the execution of 2812 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but 2813 * again only if the system has more than one CPU). 2814 * 2815 * This primitive provides the guarantees made by the (now removed) 2816 * synchronize_kernel() API. In contrast, synchronize_rcu() only 2817 * guarantees that rcu_read_lock() sections will have completed. 2818 * In "classic RCU", these two guarantees happen to be one and 2819 * the same, but can differ in realtime RCU implementations. 2820 */ 2821 void synchronize_sched(void) 2822 { 2823 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 2824 !lock_is_held(&rcu_lock_map) && 2825 !lock_is_held(&rcu_sched_lock_map), 2826 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 2827 if (rcu_blocking_is_gp()) 2828 return; 2829 if (rcu_expedited) 2830 synchronize_sched_expedited(); 2831 else 2832 wait_rcu_gp(call_rcu_sched); 2833 } 2834 EXPORT_SYMBOL_GPL(synchronize_sched); 2835 2836 /** 2837 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. 2838 * 2839 * Control will return to the caller some time after a full rcu_bh grace 2840 * period has elapsed, in other words after all currently executing rcu_bh 2841 * read-side critical sections have completed. RCU read-side critical 2842 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 2843 * and may be nested. 2844 * 2845 * See the description of synchronize_sched() for more detailed information 2846 * on memory ordering guarantees. 
2847 */ 2848 void synchronize_rcu_bh(void) 2849 { 2850 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 2851 !lock_is_held(&rcu_lock_map) && 2852 !lock_is_held(&rcu_sched_lock_map), 2853 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 2854 if (rcu_blocking_is_gp()) 2855 return; 2856 if (rcu_expedited) 2857 synchronize_rcu_bh_expedited(); 2858 else 2859 wait_rcu_gp(call_rcu_bh); 2860 } 2861 EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 2862 2863 /** 2864 * get_state_synchronize_rcu - Snapshot current RCU state 2865 * 2866 * Returns a cookie that is used by a later call to cond_synchronize_rcu() 2867 * to determine whether or not a full grace period has elapsed in the 2868 * meantime. 2869 */ 2870 unsigned long get_state_synchronize_rcu(void) 2871 { 2872 /* 2873 * Any prior manipulation of RCU-protected data must happen 2874 * before the load from ->gpnum. 2875 */ 2876 smp_mb(); /* ^^^ */ 2877 2878 /* 2879 * Make sure this load happens before the purportedly 2880 * time-consuming work between get_state_synchronize_rcu() 2881 * and cond_synchronize_rcu(). 2882 */ 2883 return smp_load_acquire(&rcu_state_p->gpnum); 2884 } 2885 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); 2886 2887 /** 2888 * cond_synchronize_rcu - Conditionally wait for an RCU grace period 2889 * 2890 * @oldstate: return value from earlier call to get_state_synchronize_rcu() 2891 * 2892 * If a full RCU grace period has elapsed since the earlier call to 2893 * get_state_synchronize_rcu(), just return. Otherwise, invoke 2894 * synchronize_rcu() to wait for a full grace period. 2895 * 2896 * Yes, this function does not take counter wrap into account. But 2897 * counter wrap is harmless. If the counter wraps, we have waited for 2898 * more than 2 billion grace periods (and way more on a 64-bit system!), 2899 * so waiting for one additional grace period should be just fine. 
2900 */ 2901 void cond_synchronize_rcu(unsigned long oldstate) 2902 { 2903 unsigned long newstate; 2904 2905 /* 2906 * Ensure that this load happens before any RCU-destructive 2907 * actions the caller might carry out after we return. 2908 */ 2909 newstate = smp_load_acquire(&rcu_state_p->completed); 2910 if (ULONG_CMP_GE(oldstate, newstate)) 2911 synchronize_rcu(); 2912 } 2913 EXPORT_SYMBOL_GPL(cond_synchronize_rcu); 2914 2915 static int synchronize_sched_expedited_cpu_stop(void *data) 2916 { 2917 /* 2918 * There must be a full memory barrier on each affected CPU 2919 * between the time that try_stop_cpus() is called and the 2920 * time that it returns. 2921 * 2922 * In the current initial implementation of cpu_stop, the 2923 * above condition is already met when the control reaches 2924 * this point and the following smp_mb() is not strictly 2925 * necessary. Do smp_mb() anyway for documentation and 2926 * robustness against future implementation changes. 2927 */ 2928 smp_mb(); /* See above comment block. */ 2929 return 0; 2930 } 2931 2932 /** 2933 * synchronize_sched_expedited - Brute-force RCU-sched grace period 2934 * 2935 * Wait for an RCU-sched grace period to elapse, but use a "big hammer" 2936 * approach to force the grace period to end quickly. This consumes 2937 * significant time on all CPUs and is unfriendly to real-time workloads, 2938 * so is thus not recommended for any sort of common-case code. In fact, 2939 * if you are using synchronize_sched_expedited() in a loop, please 2940 * restructure your code to batch your updates, and then use a single 2941 * synchronize_sched() instead. 2942 * 2943 * This implementation can be thought of as an application of ticket 2944 * locking to RCU, with sync_sched_expedited_started and 2945 * sync_sched_expedited_done taking on the roles of the halves 2946 * of the ticket-lock word. 
* Each task atomically increments
 * sync_sched_expedited_started upon entry, snapshotting the old value,
 * then attempts to stop all the CPUs.  If this succeeds, then each
 * CPU will have executed a context switch, resulting in an RCU-sched
 * grace period.  We are then done, so we use atomic_cmpxchg() to
 * update sync_sched_expedited_done to match our snapshot -- but
 * only if someone else has not already advanced past our snapshot.
 *
 * On the other hand, if try_stop_cpus() fails, we check the value
 * of sync_sched_expedited_done.  If it has advanced past our
 * initial snapshot, then someone else must have forced a grace period
 * some time after we took our snapshot.  In this case, our work is
 * done for us, and we can simply return.  Otherwise, we try again,
 * but keep our initial snapshot for purposes of checking for someone
 * doing our work for us.
 *
 * If we fail too many times in a row, we fall back to synchronize_sched().
 *
 * Note: in the code below, the roles of sync_sched_expedited_started and
 * sync_sched_expedited_done are played by the ->expedited_start and
 * ->expedited_done fields of rcu_sched_state, which are accessed with
 * the atomic_long_*() primitives.  Every exit path also increments one
 * of the ->expedited_* counters of that structure.
 */
void synchronize_sched_expedited(void)
{
	long firstsnap, s, snap;
	int trycount = 0;
	struct rcu_state *rsp = &rcu_sched_state;

	/*
	 * If we are in danger of counter wrap, just do synchronize_sched().
	 * By allowing ->expedited_start to advance no more than
	 * ULONG_MAX/8 ahead of ->expedited_done, we are ensuring
	 * that more than 3.5 billion CPUs would be required to force a
	 * counter wrap on a 32-bit system.  Quite a few more CPUs would of
	 * course be required on a 64-bit system.
	 */
	if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
			 (ulong)atomic_long_read(&rsp->expedited_done) +
			 ULONG_MAX / 8)) {
		synchronize_sched();
		atomic_long_inc(&rsp->expedited_wrap);
		return;
	}

	/*
	 * Take a ticket.  Note that atomic_long_inc_return() implies a
	 * full memory barrier.  firstsnap keeps the ticket we were issued;
	 * snap may be refreshed on each retry further down.
	 */
	snap = atomic_long_inc_return(&rsp->expedited_start);
	firstsnap = snap;
	if (!try_get_online_cpus()) {
		/* CPU hotplug operation in flight, fall back to normal GP. */
		wait_rcu_gp(call_rcu_sched);
		atomic_long_inc(&rsp->expedited_normal);
		return;
	}
	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));

	/*
	 * Each pass through the following loop attempts to force a
	 * context switch on each CPU.  The loop body runs only when
	 * try_stop_cpus() returned -EAGAIN; it drops the online-CPUs
	 * reference first and re-takes it before the next attempt.
	 */
	while (try_stop_cpus(cpu_online_mask,
			     synchronize_sched_expedited_cpu_stop,
			     NULL) == -EAGAIN) {
		put_online_cpus();
		atomic_long_inc(&rsp->expedited_tryfail);

		/* Check to see if someone else did our work for us. */
		s = atomic_long_read(&rsp->expedited_done);
		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
			/* ensure test happens before caller kfree */
			smp_mb__before_atomic(); /* ^^^ */
			atomic_long_inc(&rsp->expedited_workdone1);
			return;
		}

		/*
		 * No joy, try again later.  Or just synchronize_sched().
		 * The delay grows with both the retry count and the
		 * number of online CPUs; after ten retries we give up
		 * and wait for a normal grace period instead.
		 */
		if (trycount++ < 10) {
			udelay(trycount * num_online_cpus());
		} else {
			wait_rcu_gp(call_rcu_sched);
			atomic_long_inc(&rsp->expedited_normal);
			return;
		}

		/* Recheck to see if someone else did our work for us. */
		s = atomic_long_read(&rsp->expedited_done);
		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
			/* ensure test happens before caller kfree */
			smp_mb__before_atomic(); /* ^^^ */
			atomic_long_inc(&rsp->expedited_workdone2);
			return;
		}

		/*
		 * Refetching ->expedited_start allows later
		 * callers to piggyback on our grace period.  We retry
		 * after they started, so our grace period works for them,
		 * and they started after our first try, so their grace
		 * period works for us.
		 */
		if (!try_get_online_cpus()) {
			/* CPU hotplug operation in flight, use normal GP. */
			wait_rcu_gp(call_rcu_sched);
			atomic_long_inc(&rsp->expedited_normal);
			return;
		}
		snap = atomic_long_read(&rsp->expedited_start);
		smp_mb(); /* ensure read is before try_stop_cpus(). */
	}
	atomic_long_inc(&rsp->expedited_stoppedcpus);

	/*
	 * Everyone up to our most recent fetch is covered by our grace
	 * period.  Update the counter, but only if our work is still
	 * relevant -- which it won't be if someone who started later
	 * than we did already did their update.  The cmpxchg is retried
	 * until it either succeeds or ->expedited_done is seen to have
	 * reached snap.
	 */
	do {
		atomic_long_inc(&rsp->expedited_done_tries);
		s = atomic_long_read(&rsp->expedited_done);
		if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
			/* ensure test happens before caller kfree */
			smp_mb__before_atomic(); /* ^^^ */
			atomic_long_inc(&rsp->expedited_done_lost);
			break;
		}
	} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
	atomic_long_inc(&rsp->expedited_done_exit);

	/* Pairs with the successful try_get_online_cpus() above. */
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);

/*
 * Check to see if there is any immediate RCU-related work to be done
 * by the current CPU, for the specified type of RCU, returning 1 if so.
 * The checks are in order of increasing expense: checks that can be
 * carried out against CPU-local state are performed first.  However,
 * we must check for CPU stalls first, else we might not get a chance.
 *
 * Each outcome also bumps the matching rdp->n_rp_* counter, and
 * rdp->n_rcu_pending counts every invocation.
 */
static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
{
	struct rcu_node *rnp = rdp->mynode;

	rdp->n_rcu_pending++;

	/* Check for CPU stalls, if enabled. */
	check_cpu_stall(rsp, rdp);

	/* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */
	if (rcu_nohz_full_cpu(rsp))
		return 0;

	/*
	 * Is the RCU core waiting for a quiescent state from this CPU?
	 * The first branch only counts the event and then falls through
	 * to the remaining checks; the second branch reports work to do.
	 */
	if (rcu_scheduler_fully_active &&
	    rdp->qs_pending && !rdp->passed_quiesce) {
		rdp->n_rp_qs_pending++;
	} else if (rdp->qs_pending && rdp->passed_quiesce) {
		rdp->n_rp_report_qs++;
		return 1;
	}

	/* Does this CPU have callbacks ready to invoke? */
	if (cpu_has_callbacks_ready_to_invoke(rdp)) {
		rdp->n_rp_cb_ready++;
		return 1;
	}

	/* Has RCU gone idle with this CPU needing another grace period? */
	if (cpu_needs_another_gp(rsp, rdp)) {
		rdp->n_rp_cpu_needs_gp++;
		return 1;
	}

	/* Has another RCU grace period completed? */
	if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */
		rdp->n_rp_gp_completed++;
		return 1;
	}

	/* Has a new RCU grace period started? */
	if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
		rdp->n_rp_gp_started++;
		return 1;
	}

	/* Does this CPU need a deferred NOCB wakeup? */
	if (rcu_nocb_need_deferred_wakeup(rdp)) {
		rdp->n_rp_nocb_defer_wakeup++;
		return 1;
	}

	/* nothing to do */
	rdp->n_rp_need_nothing++;
	return 0;
}

/*
 * Check to see if there is any immediate RCU-related work to be done
 * by the current CPU, returning 1 if so.  This function is part of the
 * RCU implementation; it is -not- an exported member of the RCU API.
 *
 * The flavors are checked in for_each_rcu_flavor() order, stopping at
 * the first one for which __rcu_pending() reports work.
 */
static int rcu_pending(int cpu)
{
	struct rcu_state *rsp;

	for_each_rcu_flavor(rsp)
		if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
			return 1;
	return 0;
}

/*
 * Return true if the specified CPU has any callback.  If all_lazy is
 * non-NULL, store an indication of whether all callbacks are lazy.
 * (If there are no callbacks, all of them are deemed to be lazy.)
3160 */ 3161 static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) 3162 { 3163 bool al = true; 3164 bool hc = false; 3165 struct rcu_data *rdp; 3166 struct rcu_state *rsp; 3167 3168 for_each_rcu_flavor(rsp) { 3169 rdp = per_cpu_ptr(rsp->rda, cpu); 3170 if (!rdp->nxtlist) 3171 continue; 3172 hc = true; 3173 if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { 3174 al = false; 3175 break; 3176 } 3177 } 3178 if (all_lazy) 3179 *all_lazy = al; 3180 return hc; 3181 } 3182 3183 /* 3184 * Helper function for _rcu_barrier() tracing. If tracing is disabled, 3185 * the compiler is expected to optimize this away. 3186 */ 3187 static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, 3188 int cpu, unsigned long done) 3189 { 3190 trace_rcu_barrier(rsp->name, s, cpu, 3191 atomic_read(&rsp->barrier_cpu_count), done); 3192 } 3193 3194 /* 3195 * RCU callback function for _rcu_barrier(). If we are last, wake 3196 * up the task executing _rcu_barrier(). 3197 */ 3198 static void rcu_barrier_callback(struct rcu_head *rhp) 3199 { 3200 struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); 3201 struct rcu_state *rsp = rdp->rsp; 3202 3203 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { 3204 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); 3205 complete(&rsp->barrier_completion); 3206 } else { 3207 _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); 3208 } 3209 } 3210 3211 /* 3212 * Called with preemption disabled, and from cross-cpu IRQ context. 3213 */ 3214 static void rcu_barrier_func(void *type) 3215 { 3216 struct rcu_state *rsp = type; 3217 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 3218 3219 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); 3220 atomic_inc(&rsp->barrier_cpu_count); 3221 rsp->call(&rdp->barrier_head, rcu_barrier_callback); 3222 } 3223 3224 /* 3225 * Orchestrate the specified type of RCU barrier, waiting for all 3226 * RCU callbacks of the specified type to complete. 
*/
static void _rcu_barrier(struct rcu_state *rsp)
{
	int cpu;
	struct rcu_data *rdp;
	/* Sampled before taking the mutex, so it may be stale by the recheck. */
	unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
	unsigned long snap_done;

	_rcu_barrier_trace(rsp, "Begin", -1, snap);

	/* Take mutex to serialize concurrent rcu_barrier() requests. */
	mutex_lock(&rsp->barrier_mutex);

	/*
	 * Ensure that all prior references, including to ->n_barrier_done,
	 * are ordered before the _rcu_barrier() machinery.
	 */
	smp_mb();  /* See above block comment. */

	/*
	 * Recheck ->n_barrier_done to see if others did our work for us.
	 * This means checking ->n_barrier_done for an even-to-odd-to-even
	 * transition.  The "if" expression below therefore rounds the old
	 * value up to the next even number and adds two before comparing.
	 */
	snap_done = rsp->n_barrier_done;
	_rcu_barrier_trace(rsp, "Check", -1, snap_done);

	/*
	 * If the value in snap is odd, we needed to wait for the current
	 * rcu_barrier() to complete, then wait for the next one, in other
	 * words, we need the value of snap_done to be three larger than
	 * the value of snap.  On the other hand, if the value in snap is
	 * even, we only had to wait for the next rcu_barrier() to complete,
	 * in other words, we need the value of snap_done to be only two
	 * greater than the value of snap.  The "(snap + 3) & ~0x1" computes
	 * this for us (thank you, Linus!).
	 */
	if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
		_rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
		smp_mb(); /* caller's subsequent code after above check. */
		mutex_unlock(&rsp->barrier_mutex);
		return;
	}

	/*
	 * Increment ->n_barrier_done to avoid duplicate work.  Use
	 * ACCESS_ONCE() to prevent the compiler from speculating
	 * the increment to precede the early-exit check.  The counter
	 * must be odd after this first increment.
	 */
	ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
	_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
	smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */

	/*
	 * Initialize the count to one rather than to zero in order to
	 * avoid a too-soon return to zero in case of a short grace period
	 * (or preemption of this task).  Exclude CPU-hotplug operations
	 * to ensure that no offline CPU has callbacks queued.
	 */
	init_completion(&rsp->barrier_completion);
	atomic_set(&rsp->barrier_cpu_count, 1);
	get_online_cpus();

	/*
	 * Force each CPU with callbacks to register a new callback.
	 * When that callback is invoked, we will know that all of the
	 * corresponding CPU's preceding callbacks have been invoked.
	 *
	 * CPUs that are neither online nor no-CBs CPUs are skipped.
	 * No-CBs CPUs get the barrier callback queued directly from here
	 * via __call_rcu(); other CPUs with a non-zero ->qlen are asked
	 * to queue it themselves via rcu_barrier_func(); CPUs with an
	 * empty queue are only traced.
	 */
	for_each_possible_cpu(cpu) {
		if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
			continue;
		rdp = per_cpu_ptr(rsp->rda, cpu);
		if (rcu_is_nocb_cpu(cpu)) {
			_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
					   rsp->n_barrier_done);
			atomic_inc(&rsp->barrier_cpu_count);
			__call_rcu(&rdp->barrier_head, rcu_barrier_callback,
				   rsp, cpu, 0);
		} else if (ACCESS_ONCE(rdp->qlen)) {
			_rcu_barrier_trace(rsp, "OnlineQ", cpu,
					   rsp->n_barrier_done);
			smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
		} else {
			_rcu_barrier_trace(rsp, "OnlineNQ", cpu,
					   rsp->n_barrier_done);
		}
	}
	put_online_cpus();

	/*
	 * Now that we have an rcu_barrier_callback() callback on each
	 * CPU, and thus each counted, remove the initial count.
	 */
	if (atomic_dec_and_test(&rsp->barrier_cpu_count))
		complete(&rsp->barrier_completion);

	/*
	 * Increment ->n_barrier_done to prevent duplicate work.  The
	 * counter must be even again after this second increment.
	 */
	smp_mb(); /* Keep increment after above mechanism. */
	ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
	_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
	smp_mb(); /* Keep increment before caller's subsequent code. */

	/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
	wait_for_completion(&rsp->barrier_completion);

	/* Other rcu_barrier() invocations can now safely proceed. */
	mutex_unlock(&rsp->barrier_mutex);
}

/**
 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
 *
 * Thin wrapper that runs _rcu_barrier() on rcu_bh_state.
 */
void rcu_barrier_bh(void)
{
	_rcu_barrier(&rcu_bh_state);
}
EXPORT_SYMBOL_GPL(rcu_barrier_bh);

/**
 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
 *
 * Thin wrapper that runs _rcu_barrier() on rcu_sched_state.
 */
void rcu_barrier_sched(void)
{
	_rcu_barrier(&rcu_sched_state);
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);

/*
 * Do boot-time initialization of a CPU's per-CPU RCU data.
 *
 * Runs with the root rcu_node's ->lock held and interrupts disabled.
 * rdp->mynode must already have been set by the caller, because
 * ->grpmask is computed from the leaf node's ->grplo.
 */
static void __init
rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
{
	unsigned long flags;
	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
	struct rcu_node *rnp = rcu_get_root(rsp);

	/* Set up local state, ensuring consistent view of global state. */
	raw_spin_lock_irqsave(&rnp->lock, flags);
	/* This CPU's bit within its leaf rcu_node's masks. */
	rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
	init_callback_list(rdp);
	rdp->qlen_lazy = 0;
	ACCESS_ONCE(rdp->qlen) = 0;
	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
	/* Sanity-check the initial dynticks state of this CPU. */
	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
	WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
	rdp->cpu = cpu;
	rdp->rsp = rsp;
	rcu_boot_init_nocb_percpu_data(rdp);
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
}

/*
 * Initialize a CPU's per-CPU RCU data.  Note that only one online or
 * offline event can be happening at a given time.
* Note also that we
 * can accept some slop in the rsp->completed access due to the fact
 * that this CPU cannot possibly have any RCU callbacks in flight yet.
 *
 * Locking: ->onoff_mutex is held throughout.  Interrupts are disabled
 * when the root rcu_node's ->lock is taken and stay disabled while the
 * leaf-to-root walk takes and releases each node's ->lock in turn; they
 * are restored only after the walk.
 */
static void
rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
{
	unsigned long flags;
	unsigned long mask;
	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
	struct rcu_node *rnp = rcu_get_root(rsp);

	/* Exclude new grace periods. */
	mutex_lock(&rsp->onoff_mutex);

	/* Set up local state, ensuring consistent view of global state. */
	raw_spin_lock_irqsave(&rnp->lock, flags);
	rdp->beenonline = 1;	 /* We have now been online. */
	rdp->qlen_last_fqs_check = 0;
	rdp->n_force_qs_snap = rsp->n_force_qs;
	rdp->blimit = blimit;
	init_callback_list(rdp);  /* Re-enable callbacks on this CPU. */
	rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
	rcu_sysidle_init_percpu_data(rdp->dynticks);
	/* Force the low-order bit of the ->dynticks counter to one. */
	atomic_set(&rdp->dynticks->dynticks,
		   (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */

	/*
	 * Add CPU to rcu_node bitmasks.  Walk from this CPU's leaf node
	 * towards the root, stopping once a parent already has the bit
	 * for the subtree below it set in ->qsmaskinit.
	 */
	rnp = rdp->mynode;
	mask = rdp->grpmask;
	do {
		/* Exclude any attempts to start a new GP on small systems. */
		raw_spin_lock(&rnp->lock);	/* irqs already disabled. */
		rnp->qsmaskinit |= mask;
		mask = rnp->grpmask;
		if (rnp == rdp->mynode) {
			/*
			 * If there is a grace period in progress, we will
			 * set up to wait for it next time we run the
			 * RCU core code.
			 */
			rdp->gpnum = rnp->completed;
			rdp->completed = rnp->completed;
			rdp->passed_quiesce = 0;
			rdp->qs_pending = 0;
			trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
		}
		raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
		rnp = rnp->parent;
	} while (rnp != NULL && !(rnp->qsmaskinit & mask));
	local_irq_restore(flags);

	mutex_unlock(&rsp->onoff_mutex);
}

/*
 * Run rcu_init_percpu_data() for the given CPU on every RCU flavor.
 */
static void rcu_prepare_cpu(int cpu)
{
	struct rcu_state *rsp;

	for_each_rcu_flavor(rsp)
		rcu_init_percpu_data(cpu, rsp);
}

/*
 * Handle CPU online/offline notification events.
 *
 * @hcpu carries the CPU number cast to a pointer.  Always returns
 * NOTIFY_OK; actions not listed in the switch are ignored.
 */
static int rcu_cpu_notify(struct notifier_block *self,
				    unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;
	struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
	struct rcu_node *rnp = rdp->mynode;
	struct rcu_state *rsp;

	trace_rcu_utilization(TPS("Start CPU hotplug"));
	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		/* Initialize per-CPU data and kthreads before the CPU runs. */
		rcu_prepare_cpu(cpu);
		rcu_prepare_kthreads(cpu);
		rcu_spawn_all_nocb_kthreads(cpu);
		break;
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		/* -1: no outgoing CPU to take into account. */
		rcu_boost_kthread_setaffinity(rnp, -1);
		break;
	case CPU_DOWN_PREPARE:
		/* Pass the CPU that is about to go offline. */
		rcu_boost_kthread_setaffinity(rnp, cpu);
		break;
	case CPU_DYING:
	case CPU_DYING_FROZEN:
		for_each_rcu_flavor(rsp)
			rcu_cleanup_dying_cpu(rsp);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		for_each_rcu_flavor(rsp)
			rcu_cleanup_dead_cpu(cpu, rsp);
		break;
	default:
		break;
	}
	trace_rcu_utilization(TPS("End CPU hotplug"));
	return NOTIFY_OK;
}

/*
 * Power-management notifier: switch rcu_expedited on before
 * hibernation/suspend (only when nr_cpu_ids <= 256) and off again
 * afterwards.  Always returns NOTIFY_OK.
 *
 * NOTE(review): the POST_* cases clear rcu_expedited unconditionally,
 * even when the PREPARE case did not set it (nr_cpu_ids > 256) or when
 * it was already 1 before suspend -- confirm that discarding a
 * previously set value is intended.
 */
static int rcu_pm_notify(struct notifier_block *self,
			 unsigned long action, void *hcpu)
{
	switch (action) {
	case PM_HIBERNATION_PREPARE:
	case PM_SUSPEND_PREPARE:
		if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
			rcu_expedited = 1;
		break;
	case PM_POST_HIBERNATION:
	case PM_POST_SUSPEND:
		rcu_expedited = 0;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

/*
 * Spawn the kthreads that handle each RCU flavor's grace periods.
 *
 * Runs as an early_initcall.  Failure to create a grace-period kthread
 * is fatal (BUG_ON).  Each flavor's ->gp_kthread pointer is published
 * under the root rcu_node's ->lock.
 */
static int __init rcu_spawn_gp_kthread(void)
{
	unsigned long flags;
	struct rcu_node *rnp;
	struct rcu_state *rsp;
	struct task_struct *t;

	rcu_scheduler_fully_active = 1;
	for_each_rcu_flavor(rsp) {
		t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
		BUG_ON(IS_ERR(t));
		rnp = rcu_get_root(rsp);
		raw_spin_lock_irqsave(&rnp->lock, flags);
		rsp->gp_kthread = t;
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
	}
	rcu_spawn_nocb_kthreads();
	rcu_spawn_boost_kthreads();
	return 0;
}
early_initcall(rcu_spawn_gp_kthread);

/*
 * This function is invoked towards the end of the scheduler's initialization
 * process.  Before this is called, the idle task might contain
 * RCU read-side critical sections (during which time, this idle
 * task is booting the system).  After this function is called, the
 * idle tasks are prohibited from containing RCU read-side critical
 * sections.  This function also enables RCU lockdep checking.
 *
 * Warns if more than one CPU is online or if any context switch has
 * already happened, then sets rcu_scheduler_active.
 */
void rcu_scheduler_starting(void)
{
	WARN_ON(num_online_cpus() != 1);
	WARN_ON(nr_context_switches() > 0);
	rcu_scheduler_active = 1;
}

/*
 * Compute the per-level fanout, either using the exact fanout specified
 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
3555 */ 3556 #ifdef CONFIG_RCU_FANOUT_EXACT 3557 static void __init rcu_init_levelspread(struct rcu_state *rsp) 3558 { 3559 int i; 3560 3561 rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; 3562 for (i = rcu_num_lvls - 2; i >= 0; i--) 3563 rsp->levelspread[i] = CONFIG_RCU_FANOUT; 3564 } 3565 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ 3566 static void __init rcu_init_levelspread(struct rcu_state *rsp) 3567 { 3568 int ccur; 3569 int cprv; 3570 int i; 3571 3572 cprv = nr_cpu_ids; 3573 for (i = rcu_num_lvls - 1; i >= 0; i--) { 3574 ccur = rsp->levelcnt[i]; 3575 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 3576 cprv = ccur; 3577 } 3578 } 3579 #endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ 3580 3581 /* 3582 * Helper function for rcu_init() that initializes one rcu_state structure. 3583 */ 3584 static void __init rcu_init_one(struct rcu_state *rsp, 3585 struct rcu_data __percpu *rda) 3586 { 3587 static const char * const buf[] = { 3588 "rcu_node_0", 3589 "rcu_node_1", 3590 "rcu_node_2", 3591 "rcu_node_3" }; /* Match MAX_RCU_LVLS */ 3592 static const char * const fqs[] = { 3593 "rcu_node_fqs_0", 3594 "rcu_node_fqs_1", 3595 "rcu_node_fqs_2", 3596 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ 3597 static u8 fl_mask = 0x1; 3598 int cpustride = 1; 3599 int i; 3600 int j; 3601 struct rcu_node *rnp; 3602 3603 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ 3604 3605 /* Silence gcc 4.8 warning about array index out of range. */ 3606 if (rcu_num_lvls > RCU_NUM_LVLS) 3607 panic("rcu_init_one: rcu_num_lvls overflow"); 3608 3609 /* Initialize the level-tracking arrays. */ 3610 3611 for (i = 0; i < rcu_num_lvls; i++) 3612 rsp->levelcnt[i] = num_rcu_lvl[i]; 3613 for (i = 1; i < rcu_num_lvls; i++) 3614 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 3615 rcu_init_levelspread(rsp); 3616 rsp->flavor_mask = fl_mask; 3617 fl_mask <<= 1; 3618 3619 /* Initialize the elements themselves, starting from the leaves. 
*/ 3620 3621 for (i = rcu_num_lvls - 1; i >= 0; i--) { 3622 cpustride *= rsp->levelspread[i]; 3623 rnp = rsp->level[i]; 3624 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 3625 raw_spin_lock_init(&rnp->lock); 3626 lockdep_set_class_and_name(&rnp->lock, 3627 &rcu_node_class[i], buf[i]); 3628 raw_spin_lock_init(&rnp->fqslock); 3629 lockdep_set_class_and_name(&rnp->fqslock, 3630 &rcu_fqs_class[i], fqs[i]); 3631 rnp->gpnum = rsp->gpnum; 3632 rnp->completed = rsp->completed; 3633 rnp->qsmask = 0; 3634 rnp->qsmaskinit = 0; 3635 rnp->grplo = j * cpustride; 3636 rnp->grphi = (j + 1) * cpustride - 1; 3637 if (rnp->grphi >= nr_cpu_ids) 3638 rnp->grphi = nr_cpu_ids - 1; 3639 if (i == 0) { 3640 rnp->grpnum = 0; 3641 rnp->grpmask = 0; 3642 rnp->parent = NULL; 3643 } else { 3644 rnp->grpnum = j % rsp->levelspread[i - 1]; 3645 rnp->grpmask = 1UL << rnp->grpnum; 3646 rnp->parent = rsp->level[i - 1] + 3647 j / rsp->levelspread[i - 1]; 3648 } 3649 rnp->level = i; 3650 INIT_LIST_HEAD(&rnp->blkd_tasks); 3651 rcu_init_one_nocb(rnp); 3652 } 3653 } 3654 3655 rsp->rda = rda; 3656 init_waitqueue_head(&rsp->gp_wq); 3657 rnp = rsp->level[rcu_num_lvls - 1]; 3658 for_each_possible_cpu(i) { 3659 while (i > rnp->grphi) 3660 rnp++; 3661 per_cpu_ptr(rsp->rda, i)->mynode = rnp; 3662 rcu_boot_init_percpu_data(i, rsp); 3663 } 3664 list_add(&rsp->flavors, &rcu_struct_flavors); 3665 } 3666 3667 /* 3668 * Compute the rcu_node tree geometry from kernel parameters. This cannot 3669 * replace the definitions in tree.h because those are needed to size 3670 * the ->node array in the rcu_state structure. 3671 */ 3672 static void __init rcu_init_geometry(void) 3673 { 3674 ulong d; 3675 int i; 3676 int j; 3677 int n = nr_cpu_ids; 3678 int rcu_capacity[MAX_RCU_LVLS + 1]; 3679 3680 /* 3681 * Initialize any unspecified boot parameters. 
3682 * The default values of jiffies_till_first_fqs and 3683 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS 3684 * value, which is a function of HZ, then adding one for each 3685 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. 3686 */ 3687 d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; 3688 if (jiffies_till_first_fqs == ULONG_MAX) 3689 jiffies_till_first_fqs = d; 3690 if (jiffies_till_next_fqs == ULONG_MAX) 3691 jiffies_till_next_fqs = d; 3692 3693 /* If the compile-time values are accurate, just leave. */ 3694 if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && 3695 nr_cpu_ids == NR_CPUS) 3696 return; 3697 pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", 3698 rcu_fanout_leaf, nr_cpu_ids); 3699 3700 /* 3701 * Compute number of nodes that can be handled an rcu_node tree 3702 * with the given number of levels. Setting rcu_capacity[0] makes 3703 * some of the arithmetic easier. 3704 */ 3705 rcu_capacity[0] = 1; 3706 rcu_capacity[1] = rcu_fanout_leaf; 3707 for (i = 2; i <= MAX_RCU_LVLS; i++) 3708 rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; 3709 3710 /* 3711 * The boot-time rcu_fanout_leaf parameter is only permitted 3712 * to increase the leaf-level fanout, not decrease it. Of course, 3713 * the leaf-level fanout cannot exceed the number of bits in 3714 * the rcu_node masks. Finally, the tree must be able to accommodate 3715 * the configured number of CPUs. Complain and fall back to the 3716 * compile-time values if these limits are exceeded. 3717 */ 3718 if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || 3719 rcu_fanout_leaf > sizeof(unsigned long) * 8 || 3720 n > rcu_capacity[MAX_RCU_LVLS]) { 3721 WARN_ON(1); 3722 return; 3723 } 3724 3725 /* Calculate the number of rcu_nodes at each level of the tree. 
*/ 3726 for (i = 1; i <= MAX_RCU_LVLS; i++) 3727 if (n <= rcu_capacity[i]) { 3728 for (j = 0; j <= i; j++) 3729 num_rcu_lvl[j] = 3730 DIV_ROUND_UP(n, rcu_capacity[i - j]); 3731 rcu_num_lvls = i; 3732 for (j = i + 1; j <= MAX_RCU_LVLS; j++) 3733 num_rcu_lvl[j] = 0; 3734 break; 3735 } 3736 3737 /* Calculate the total number of rcu_node structures. */ 3738 rcu_num_nodes = 0; 3739 for (i = 0; i <= MAX_RCU_LVLS; i++) 3740 rcu_num_nodes += num_rcu_lvl[i]; 3741 rcu_num_nodes -= n; 3742 } 3743 3744 void __init rcu_init(void) 3745 { 3746 int cpu; 3747 3748 rcu_bootup_announce(); 3749 rcu_init_geometry(); 3750 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3751 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 3752 __rcu_init_preempt(); 3753 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 3754 3755 /* 3756 * We don't need protection against CPU-hotplug here because 3757 * this is called early in boot, before either interrupts 3758 * or the scheduler are operational. 3759 */ 3760 cpu_notifier(rcu_cpu_notify, 0); 3761 pm_notifier(rcu_pm_notify, 0); 3762 for_each_online_cpu(cpu) 3763 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3764 } 3765 3766 #include "tree_plugin.h" 3767