/*
 * Read-Copy Update mechanism for mutual exclusion
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, you can access it online at
 * http://www.gnu.org/licenses/gpl-2.0.html.
 *
 * Copyright IBM Corporation, 2001
 *
 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
 *	    Manfred Spraul <manfred@colorfullife.com>
 *
 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 * Papers:
 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *		http://lse.sourceforge.net/locking/rcupdate.html
 *
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/export.h>
#include <linux/hardirq.h>
#include <linux/delay.h>
#include <linux/moduleparam.h>
#include <linux/kthread.h>
#include <linux/tick.h>
#include <linux/rcupdate_wait.h>

#define CREATE_TRACE_POINTS

#include "rcu.h"

#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcupdate."

#ifndef CONFIG_TINY_RCU
module_param(rcu_expedited, int, 0);
module_param(rcu_normal, int, 0);
static int rcu_normal_after_boot;
module_param(rcu_normal_after_boot, int, 0);
#endif /* #ifndef CONFIG_TINY_RCU */

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/**
 * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
 *
 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
 * RCU-sched read-side critical section.  In absence of
 * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
 * critical section unless it can prove otherwise.  Note that disabling
 * of preemption (including disabling irqs) counts as an RCU-sched
 * read-side critical section.  This is useful for debug checks in functions
 * that require that they be called within an RCU-sched read-side
 * critical section.
 *
 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
 * and while lockdep is disabled.
 *
 * Note that if the CPU is in the idle loop from an RCU point of
 * view (ie: that we are in the section between rcu_idle_enter() and
 * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
 * did an rcu_read_lock().  The reason for this is that RCU ignores CPUs
 * that are in such a section, considering them as being in an extended
 * quiescent state, so such a CPU is effectively never in an RCU read-side
 * critical section regardless of what RCU primitives it invokes.  This state
 * of affairs is required --- we need to keep an RCU-free window in idle
 * where the CPU may possibly enter into low power mode.  This way we can
 * report an extended quiescent state to other CPUs that have started a grace
 * period.  Otherwise we would delay any grace period as long as we run in
 * the idle task.
 *
 * Similarly, we avoid claiming an SRCU read lock held if the current
 * CPU is offline.
 */
int rcu_read_lock_sched_held(void)
{
	int lockdep_opinion = 0;

	if (!debug_lockdep_rcu_enabled())
		return 1;
	if (!rcu_is_watching())
		return 0;
	if (!rcu_lockdep_current_cpu_online())
		return 0;
	if (debug_locks)
		lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
	return lockdep_opinion || !preemptible();
}
EXPORT_SYMBOL(rcu_read_lock_sched_held);
#endif
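
/*
 * Illustrative sketch (not part of the original file): rcu_read_lock_sched_held()
 * is meant to back lockdep assertions in code that must run inside an
 * RCU-sched read-side critical section.  The lookup function, table, and
 * struct names below are hypothetical.
 *
 *	static struct my_entry *my_sched_protected_lookup(int idx)
 *	{
 *		RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
 *				 "my_sched_protected_lookup() needs rcu_read_lock_sched()");
 *		return rcu_dereference_sched(my_table[idx]);
 *	}
 */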

#ifndef CONFIG_TINY_RCU

/*
 * Should expedited grace-period primitives always fall back to their
 * non-expedited counterparts?  Intended for use within RCU.  Note
 * that if the user specifies both rcu_expedited and rcu_normal, then
 * rcu_normal wins.  (Except during the time period during boot from
 * when the first task is spawned until the rcu_exp_runtime_mode()
 * core_initcall() is invoked, at which point everything is expedited.)
 */
bool rcu_gp_is_normal(void)
{
	return READ_ONCE(rcu_normal) &&
	       rcu_scheduler_active != RCU_SCHEDULER_INIT;
}
EXPORT_SYMBOL_GPL(rcu_gp_is_normal);

static atomic_t rcu_expedited_nesting = ATOMIC_INIT(1);

/*
 * Should normal grace-period primitives be expedited?  Intended for
 * use within RCU.  Note that this function takes the rcu_expedited
 * sysfs/boot variable and rcu_scheduler_active into account as well
 * as the rcu_expedite_gp() nesting.  So looping on rcu_unexpedite_gp()
 * until rcu_gp_is_expedited() returns false is a -really- bad idea.
 */
bool rcu_gp_is_expedited(void)
{
	return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
	       rcu_scheduler_active == RCU_SCHEDULER_INIT;
}
EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);

/**
 * rcu_expedite_gp - Expedite future RCU grace periods
 *
 * After a call to this function, future calls to synchronize_rcu() and
 * friends act as if the corresponding synchronize_rcu_expedited() function
 * had instead been called.
 */
void rcu_expedite_gp(void)
{
	atomic_inc(&rcu_expedited_nesting);
}
EXPORT_SYMBOL_GPL(rcu_expedite_gp);

/**
 * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation
 *
 * Undo a prior call to rcu_expedite_gp().  If all prior calls to
 * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(),
 * and if the rcu_expedited sysfs/boot parameter is not set, then all
 * subsequent calls to synchronize_rcu() and friends will return to
 * their normal non-expedited behavior.
 */
void rcu_unexpedite_gp(void)
{
	atomic_dec(&rcu_expedited_nesting);
}
EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);

/*
 * Inform RCU of the end of the in-kernel boot sequence.
 */
void rcu_end_inkernel_boot(void)
{
	rcu_unexpedite_gp();
	if (rcu_normal_after_boot)
		WRITE_ONCE(rcu_normal, 1);
}

#endif /* #ifndef CONFIG_TINY_RCU */
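
/*
 * Illustrative sketch (not part of the original file): rcu_expedite_gp() and
 * rcu_unexpedite_gp() must be paired so that the nesting count returns to
 * its prior value.  A latency-sensitive caller might bracket a teardown
 * sequence as follows; my_fast_teardown() is a hypothetical name.
 *
 *	static void my_fast_teardown(void)
 *	{
 *		rcu_expedite_gp();
 *		synchronize_rcu();	// now acts like synchronize_rcu_expedited()
 *		rcu_unexpedite_gp();
 *	}
 */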

#ifdef CONFIG_PREEMPT_RCU

/*
 * Preemptible RCU implementation for rcu_read_lock().
 * Just increment ->rcu_read_lock_nesting; the shared state will be
 * updated if we block.
 */
void __rcu_read_lock(void)
{
	current->rcu_read_lock_nesting++;
	barrier();  /* critical section after entry code. */
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);

/*
 * Preemptible RCU implementation for rcu_read_unlock().
 * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
 * invoke rcu_read_unlock_special() to clean up after a context switch
 * in an RCU read-side critical section and other special cases.
 */
void __rcu_read_unlock(void)
{
	struct task_struct *t = current;

	if (t->rcu_read_lock_nesting != 1) {
		--t->rcu_read_lock_nesting;
	} else {
		barrier();  /* critical section before exit code. */
		t->rcu_read_lock_nesting = INT_MIN;
		barrier();  /* assign before ->rcu_read_unlock_special load */
		if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
			rcu_read_unlock_special(t);
		barrier();  /* ->rcu_read_unlock_special load before assign */
		t->rcu_read_lock_nesting = 0;
	}
#ifdef CONFIG_PROVE_LOCKING
	{
		int rrln = READ_ONCE(t->rcu_read_lock_nesting);

		WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
	}
#endif /* #ifdef CONFIG_PROVE_LOCKING */
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);

#endif /* #ifdef CONFIG_PREEMPT_RCU */
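
/*
 * Illustrative sketch (not part of the original file): with CONFIG_PREEMPT_RCU,
 * rcu_read_lock() and rcu_read_unlock() map onto __rcu_read_lock() and
 * __rcu_read_unlock() above.  A typical reader of an RCU-protected pointer
 * looks like the following; my_global_ptr, struct my_data, and the ->value
 * field are hypothetical.
 *
 *	struct my_data __rcu *my_global_ptr;
 *
 *	static int my_read_value(void)
 *	{
 *		struct my_data *p;
 *		int val = -1;
 *
 *		rcu_read_lock();
 *		p = rcu_dereference(my_global_ptr);
 *		if (p)
 *			val = p->value;
 *		rcu_read_unlock();
 *		return val;
 *	}
 */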

#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
EXPORT_SYMBOL_GPL(rcu_lock_map);

static struct lock_class_key rcu_bh_lock_key;
struct lockdep_map rcu_bh_lock_map =
	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
EXPORT_SYMBOL_GPL(rcu_bh_lock_map);

static struct lock_class_key rcu_sched_lock_key;
struct lockdep_map rcu_sched_lock_map =
	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
EXPORT_SYMBOL_GPL(rcu_sched_lock_map);

static struct lock_class_key rcu_callback_key;
struct lockdep_map rcu_callback_map =
	STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);
EXPORT_SYMBOL_GPL(rcu_callback_map);

int notrace debug_lockdep_rcu_enabled(void)
{
	return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
	       current->lockdep_recursion == 0;
}
EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);

/**
 * rcu_read_lock_held() - might we be in RCU read-side critical section?
 *
 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
 * read-side critical section.  In absence of CONFIG_DEBUG_LOCK_ALLOC,
 * this assumes we are in an RCU read-side critical section unless it can
 * prove otherwise.  This is useful for debug checks in functions that
 * require that they be called within an RCU read-side critical section.
 *
 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
 * and while lockdep is disabled.
 *
 * Note that rcu_read_lock() and the matching rcu_read_unlock() must
 * occur in the same context, for example, it is illegal to invoke
 * rcu_read_unlock() in process context if the matching rcu_read_lock()
 * was invoked from within an irq handler.
 *
 * Note that rcu_read_lock() is disallowed if the CPU is either idle or
 * offline from an RCU perspective, so check for those as well.
 */
int rcu_read_lock_held(void)
{
	if (!debug_lockdep_rcu_enabled())
		return 1;
	if (!rcu_is_watching())
		return 0;
	if (!rcu_lockdep_current_cpu_online())
		return 0;
	return lock_is_held(&rcu_lock_map);
}
EXPORT_SYMBOL_GPL(rcu_read_lock_held);

/**
 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
 *
 * Check for bottom half being disabled, which covers both the
 * CONFIG_PROVE_RCU and not cases.  Note that if someone uses
 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
 * will show the situation.  This is useful for debug checks in functions
 * that require that they be called within an RCU read-side critical
 * section.
 *
 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
 *
 * Note that rcu_read_lock() is disallowed if the CPU is either idle or
 * offline from an RCU perspective, so check for those as well.
 */
int rcu_read_lock_bh_held(void)
{
	if (!debug_lockdep_rcu_enabled())
		return 1;
	if (!rcu_is_watching())
		return 0;
	if (!rcu_lockdep_current_cpu_online())
		return 0;
	return in_softirq() || irqs_disabled();
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);

#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/**
 * wakeme_after_rcu() - Callback function to awaken a task after grace period
 * @head: Pointer to rcu_head member within rcu_synchronize structure
 *
 * Awaken the corresponding task now that a grace period has elapsed.
 */
void wakeme_after_rcu(struct rcu_head *head)
{
	struct rcu_synchronize *rcu;

	rcu = container_of(head, struct rcu_synchronize, head);
	complete(&rcu->completion);
}
EXPORT_SYMBOL_GPL(wakeme_after_rcu);
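
/*
 * Illustrative sketch (not part of the original file): wakeme_after_rcu() is
 * the glue that turns an asynchronous call_rcu()-style callback into a
 * synchronous wait, as __wait_rcu_gp() below does for whole arrays of
 * flavors.  An open-coded single-flavor equivalent might look like this;
 * my_synchronize() is a hypothetical name.
 *
 *	static void my_synchronize(call_rcu_func_t crf)
 *	{
 *		struct rcu_synchronize rcu;
 *
 *		init_rcu_head_on_stack(&rcu.head);
 *		init_completion(&rcu.completion);
 *		crf(&rcu.head, wakeme_after_rcu);
 *		wait_for_completion(&rcu.completion);
 *		destroy_rcu_head_on_stack(&rcu.head);
 *	}
 */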

void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
		   struct rcu_synchronize *rs_array)
{
	int i;

	/* Initialize and register callbacks for each flavor specified. */
	for (i = 0; i < n; i++) {
		if (checktiny &&
		    (crcu_array[i] == call_rcu ||
		     crcu_array[i] == call_rcu_bh)) {
			might_sleep();
			continue;
		}
		init_rcu_head_on_stack(&rs_array[i].head);
		init_completion(&rs_array[i].completion);
		(crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
	}

	/* Wait for all callbacks to be invoked. */
	for (i = 0; i < n; i++) {
		if (checktiny &&
		    (crcu_array[i] == call_rcu ||
		     crcu_array[i] == call_rcu_bh))
			continue;
		wait_for_completion(&rs_array[i].completion);
		destroy_rcu_head_on_stack(&rs_array[i].head);
	}
}
EXPORT_SYMBOL_GPL(__wait_rcu_gp);

#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head)
{
	debug_object_init(head, &rcuhead_debug_descr);
}

void destroy_rcu_head(struct rcu_head *head)
{
	debug_object_free(head, &rcuhead_debug_descr);
}

static bool rcuhead_is_static_object(void *addr)
{
	return true;
}

/**
 * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects
 * @head: pointer to rcu_head structure to be initialized
 *
 * This function informs debugobjects of a new rcu_head structure that
 * has been allocated as an auto variable on the stack.  This function
 * is not required for rcu_head structures that are statically defined or
 * that are dynamically allocated on the heap.  This function has no
 * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
 */
void init_rcu_head_on_stack(struct rcu_head *head)
{
	debug_object_init_on_stack(head, &rcuhead_debug_descr);
}
EXPORT_SYMBOL_GPL(init_rcu_head_on_stack);

/**
 * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects
 * @head: pointer to rcu_head structure that was previously initialized
 *
 * This function informs debugobjects that an on-stack rcu_head structure
 * is about to go out of scope.  As with init_rcu_head_on_stack(), this
 * function is not required for rcu_head structures that are statically
 * defined or that are dynamically allocated on the heap.  Also as with
 * init_rcu_head_on_stack(), this function has no effect for
 * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds.
 */
void destroy_rcu_head_on_stack(struct rcu_head *head)
{
	debug_object_free(head, &rcuhead_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);

struct debug_obj_descr rcuhead_debug_descr = {
	.name = "rcu_head",
	.is_static_object = rcuhead_is_static_object,
};
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */

#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
			       unsigned long secs,
			       unsigned long c_old, unsigned long c)
{
	trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c);
}
EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#else
#define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
	do { } while (0)
#endif

#ifdef CONFIG_RCU_STALL_COMMON

#ifdef CONFIG_PROVE_RCU
#define RCU_STALL_DELAY_DELTA		(5 * HZ)
#else
#define RCU_STALL_DELAY_DELTA		0
#endif

int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;

module_param(rcu_cpu_stall_suppress, int, 0644);
module_param(rcu_cpu_stall_timeout, int, 0644);
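
/*
 * Illustrative note (not part of the original file): because of the
 * MODULE_PARAM_PREFIX above, these knobs appear on the kernel command line
 * as, for example:
 *
 *	rcupdate.rcu_cpu_stall_suppress=1
 *	rcupdate.rcu_cpu_stall_timeout=60
 *
 * The timeout is expressed in seconds and, as rcu_jiffies_till_stall_check()
 * below enforces, is clamped to the same 3..300 range permitted by Kconfig
 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
 */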

int rcu_jiffies_till_stall_check(void)
{
	int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout);

	/*
	 * Limit check must be consistent with the Kconfig limits
	 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
	 */
	if (till_stall_check < 3) {
		WRITE_ONCE(rcu_cpu_stall_timeout, 3);
		till_stall_check = 3;
	} else if (till_stall_check > 300) {
		WRITE_ONCE(rcu_cpu_stall_timeout, 300);
		till_stall_check = 300;
	}
	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
}

void rcu_sysrq_start(void)
{
	if (!rcu_cpu_stall_suppress)
		rcu_cpu_stall_suppress = 2;
}

void rcu_sysrq_end(void)
{
	if (rcu_cpu_stall_suppress == 2)
		rcu_cpu_stall_suppress = 0;
}

static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
{
	rcu_cpu_stall_suppress = 1;
	return NOTIFY_DONE;
}

static struct notifier_block rcu_panic_block = {
	.notifier_call = rcu_panic,
};

static int __init check_cpu_stall_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
	return 0;
}
early_initcall(check_cpu_stall_init);

#endif /* #ifdef CONFIG_RCU_STALL_COMMON */

#ifdef CONFIG_TASKS_RCU

/*
 * Simple variant of RCU whose quiescent states are voluntary context switch,
 * user-space execution, and idle.  As such, grace periods can take one good
 * long time.  There are no read-side primitives similar to rcu_read_lock()
 * and rcu_read_unlock() because this implementation is intended to get
 * the system into a safe state for some of the manipulations involved in
 * tracing and the like.  Finally, this implementation does not support
 * high call_rcu_tasks() rates from multiple CPUs.  If this is required,
 * per-CPU callback lists will be needed.
 */

/* Global list of callbacks and associated lock. */
static struct rcu_head *rcu_tasks_cbs_head;
static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);

/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_SRCU(tasks_rcu_exit_srcu);

/* Control stall timeouts.  Disable with <= 0, otherwise jiffies till stall. */
static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
module_param(rcu_task_stall_timeout, int, 0644);

static void rcu_spawn_tasks_kthread(void);
static struct task_struct *rcu_tasks_kthread_ptr;

/*
 * Post an RCU-tasks callback.  First call must be from process context
 * after the scheduler is fully operational.
 */
void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
	unsigned long flags;
	bool needwake;
	bool havetask = READ_ONCE(rcu_tasks_kthread_ptr);

	rhp->next = NULL;
	rhp->func = func;
	raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
	needwake = !rcu_tasks_cbs_head;
	*rcu_tasks_cbs_tail = rhp;
	rcu_tasks_cbs_tail = &rhp->next;
	raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
	/* We can't create the thread unless interrupts are enabled. */
	if ((needwake && havetask) ||
	    (!havetask && !irqs_disabled_flags(flags))) {
		rcu_spawn_tasks_kthread();
		wake_up(&rcu_tasks_cbs_wq);
	}
}
EXPORT_SYMBOL_GPL(call_rcu_tasks);
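
/*
 * Illustrative sketch (not part of the original file): a typical
 * call_rcu_tasks() user unlinks an object that tasks may still be executing
 * inside (for example, a trampoline) and queues a callback to free it once
 * every task has passed through a voluntary context switch, user-space
 * execution, or the idle loop.  struct my_trampoline, my_tramp_free(), and
 * my_unlink_trampoline() are hypothetical.
 *
 *	struct my_trampoline {
 *		struct rcu_head rh;
 *		void *text;
 *	};
 *
 *	static void my_tramp_free(struct rcu_head *rhp)
 *	{
 *		kfree(container_of(rhp, struct my_trampoline, rh));
 *	}
 *
 *	static void my_remove_trampoline(struct my_trampoline *tramp)
 *	{
 *		my_unlink_trampoline(tramp);	// no new tasks can enter it
 *		call_rcu_tasks(&tramp->rh, my_tramp_free);
 *	}
 */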

/**
 * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
 *
 * Control will return to the caller some time after a full rcu-tasks
 * grace period has elapsed, in other words after all currently
 * executing rcu-tasks read-side critical sections have elapsed.  These
 * read-side critical sections are delimited by calls to schedule(),
 * cond_resched_rcu_qs(), idle execution, userspace execution, calls
 * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
 *
 * This is a very specialized primitive, intended only for a few uses in
 * tracing and other situations requiring manipulation of function
 * preambles and profiling hooks.  The synchronize_rcu_tasks() function
 * is not (yet) intended for heavy use from multiple CPUs.
 *
 * Note that this guarantee implies further memory-ordering guarantees.
 * On systems with more than one CPU, when synchronize_rcu_tasks() returns,
 * each CPU is guaranteed to have executed a full memory barrier since the
 * end of its last RCU-tasks read-side critical section whose beginning
 * preceded the call to synchronize_rcu_tasks().  In addition, each CPU
 * having an RCU-tasks read-side critical section that extends beyond
 * the return from synchronize_rcu_tasks() is guaranteed to have executed
 * a full memory barrier after the beginning of synchronize_rcu_tasks()
 * and before the beginning of that RCU-tasks read-side critical section.
 * Note that these guarantees include CPUs that are offline, idle, or
 * executing in user mode, as well as CPUs that are executing in the kernel.
 *
 * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned
 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
 * to have executed a full memory barrier during the execution of
 * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU
 * (but again only if the system has more than one CPU).
 */
void synchronize_rcu_tasks(void)
{
	/* Complain if the scheduler has not started.  */
	RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
			 "synchronize_rcu_tasks called too soon");

	/* Wait for the grace period. */
	wait_rcu_gp(call_rcu_tasks);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);

/**
 * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks.
 *
 * Although the current implementation is guaranteed to wait, it is not
 * obligated to, for example, if there are no pending callbacks.
 */
void rcu_barrier_tasks(void)
{
	/* There is only one callback queue, so this is easy.  ;-) */
	synchronize_rcu_tasks();
}
EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
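
/*
 * Illustrative sketch (not part of the original file): code that patches
 * function-entry code typically unhooks its trampoline, waits for every task
 * already executing inside it to pass through a Tasks-RCU quiescent state,
 * and only then frees it.  my_unlink_trampoline() and my_tramp are
 * hypothetical.
 *
 *	my_unlink_trampoline(my_tramp);	// no new tasks can enter it
 *	synchronize_rcu_tasks();	// wait out tasks already inside it
 *	kfree(my_tramp);		// now safe to free
 *
 * A module that queued callbacks with call_rcu_tasks() would similarly invoke
 * rcu_barrier_tasks() on its exit path so that all of those callbacks have
 * run before the callback functions themselves are unloaded.
 */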

/* See if tasks are still holding out, complain if so. */
static void check_holdout_task(struct task_struct *t,
			       bool needreport, bool *firstreport)
{
	int cpu;

	if (!READ_ONCE(t->rcu_tasks_holdout) ||
	    t->rcu_tasks_nvcsw != READ_ONCE(t->nvcsw) ||
	    !READ_ONCE(t->on_rq) ||
	    (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
	     !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
		WRITE_ONCE(t->rcu_tasks_holdout, false);
		list_del_init(&t->rcu_tasks_holdout_list);
		put_task_struct(t);
		return;
	}
	if (!needreport)
		return;
	if (*firstreport) {
		pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
		*firstreport = false;
	}
	cpu = task_cpu(t);
	pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
		 t, ".I"[is_idle_task(t)],
		 "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
		 t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
		 t->rcu_tasks_idle_cpu, cpu);
	sched_show_task(t);
}

/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
static int __noreturn rcu_tasks_kthread(void *arg)
{
	unsigned long flags;
	struct task_struct *g, *t;
	unsigned long lastreport;
	struct rcu_head *list;
	struct rcu_head *next;
	LIST_HEAD(rcu_tasks_holdouts);

	/* Run on housekeeping CPUs by default.  Sysadm can move if desired. */
	housekeeping_affine(current);

	/*
	 * Each pass through the following loop makes one check for
	 * newly arrived callbacks, and, if there are some, waits for
	 * one RCU-tasks grace period and then invokes the callbacks.
	 * This loop is terminated by the system going down.  ;-)
	 */
	for (;;) {

		/* Pick up any new callbacks. */
		raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
		list = rcu_tasks_cbs_head;
		rcu_tasks_cbs_head = NULL;
		rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
		raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);

		/* If there were none, wait a bit and start over. */
		if (!list) {
			wait_event_interruptible(rcu_tasks_cbs_wq,
						 rcu_tasks_cbs_head);
			if (!rcu_tasks_cbs_head) {
				WARN_ON(signal_pending(current));
				schedule_timeout_interruptible(HZ/10);
			}
			continue;
		}

		/*
		 * Wait for all pre-existing t->on_rq and t->nvcsw
		 * transitions to complete.  Invoking synchronize_sched()
		 * suffices because all these transitions occur with
		 * interrupts disabled.  Without this synchronize_sched(),
		 * a read-side critical section that started before the
		 * grace period might be incorrectly seen as having started
		 * after the grace period.
		 *
		 * This synchronize_sched() also dispenses with the
		 * need for a memory barrier on the first store to
		 * ->rcu_tasks_holdout, as it forces the store to happen
		 * after the beginning of the grace period.
		 */
		synchronize_sched();

		/*
		 * There were callbacks, so we need to wait for an
		 * RCU-tasks grace period.  Start off by scanning
		 * the task list for tasks that are not already
		 * voluntarily blocked.  Mark these tasks and make
		 * a list of them in rcu_tasks_holdouts.
		 */
		rcu_read_lock();
		for_each_process_thread(g, t) {
			if (t != current && READ_ONCE(t->on_rq) &&
			    !is_idle_task(t)) {
				get_task_struct(t);
				t->rcu_tasks_nvcsw = READ_ONCE(t->nvcsw);
				WRITE_ONCE(t->rcu_tasks_holdout, true);
				list_add(&t->rcu_tasks_holdout_list,
					 &rcu_tasks_holdouts);
			}
		}
		rcu_read_unlock();

		/*
		 * Wait for tasks that are in the process of exiting.
		 * This does only part of the job, ensuring that all
		 * tasks that were previously exiting reach the point
		 * where they have disabled preemption, allowing the
		 * later synchronize_sched() to finish the job.
		 */
		synchronize_srcu(&tasks_rcu_exit_srcu);

		/*
		 * Each pass through the following loop scans the list
		 * of holdout tasks, removing any that are no longer
		 * holdouts.  When the list is empty, we are done.
		 */
		lastreport = jiffies;
		while (!list_empty(&rcu_tasks_holdouts)) {
			bool firstreport;
			bool needreport;
			int rtst;
			struct task_struct *t1;

			schedule_timeout_interruptible(HZ);
			rtst = READ_ONCE(rcu_task_stall_timeout);
			needreport = rtst > 0 &&
				     time_after(jiffies, lastreport + rtst);
			if (needreport)
				lastreport = jiffies;
			firstreport = true;
			WARN_ON(signal_pending(current));
			list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts,
						 rcu_tasks_holdout_list) {
				check_holdout_task(t, needreport, &firstreport);
				cond_resched();
			}
		}

		/*
		 * Because ->on_rq and ->nvcsw are not guaranteed
		 * to have full memory barriers prior to them in the
		 * schedule() path, memory reordering on other CPUs could
		 * cause their RCU-tasks read-side critical sections to
		 * extend past the end of the grace period.  However,
		 * because these ->nvcsw updates are carried out with
		 * interrupts disabled, we can use synchronize_sched()
		 * to force the needed ordering on all such CPUs.
		 *
		 * This synchronize_sched() also confines all
		 * ->rcu_tasks_holdout accesses to be within the grace
		 * period, avoiding the need for memory barriers for
		 * ->rcu_tasks_holdout accesses.
		 *
		 * In addition, this synchronize_sched() waits for exiting
		 * tasks to complete their final preempt_disable() region
		 * of execution, cleaning up after the synchronize_srcu()
		 * above.
		 */
		synchronize_sched();

		/* Invoke the callbacks. */
		while (list) {
			next = list->next;
			local_bh_disable();
			list->func(list);
			local_bh_enable();
			list = next;
			cond_resched();
		}
		schedule_timeout_uninterruptible(HZ/10);
	}
}

/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
static void rcu_spawn_tasks_kthread(void)
{
	static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
	struct task_struct *t;

	if (READ_ONCE(rcu_tasks_kthread_ptr)) {
		smp_mb(); /* Ensure caller sees full kthread. */
		return;
	}
	mutex_lock(&rcu_tasks_kthread_mutex);
	if (rcu_tasks_kthread_ptr) {
		mutex_unlock(&rcu_tasks_kthread_mutex);
		return;
	}
	t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
	BUG_ON(IS_ERR(t));
	smp_mb(); /* Ensure others see full kthread. */
	WRITE_ONCE(rcu_tasks_kthread_ptr, t);
	mutex_unlock(&rcu_tasks_kthread_mutex);
}

#endif /* #ifdef CONFIG_TASKS_RCU */

/*
 * Test each non-SRCU synchronous grace-period wait API.  This is
 * useful just after a change in mode for these primitives, and
 * during early boot.
 */
void rcu_test_sync_prims(void)
{
	if (!IS_ENABLED(CONFIG_PROVE_RCU))
		return;
	synchronize_rcu();
	synchronize_rcu_bh();
	synchronize_sched();
	synchronize_rcu_expedited();
	synchronize_rcu_bh_expedited();
	synchronize_sched_expedited();
}

#ifdef CONFIG_PROVE_RCU

/*
 * Early boot self test parameters, one for each flavor
 */
static bool rcu_self_test;
static bool rcu_self_test_bh;
static bool rcu_self_test_sched;

module_param(rcu_self_test, bool, 0444);
module_param(rcu_self_test_bh, bool, 0444);
module_param(rcu_self_test_sched, bool, 0444);

static int rcu_self_test_counter;

static void test_callback(struct rcu_head *r)
{
	rcu_self_test_counter++;
	pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
}

static void early_boot_test_call_rcu(void)
{
	static struct rcu_head head;

	call_rcu(&head, test_callback);
}

static void early_boot_test_call_rcu_bh(void)
{
	static struct rcu_head head;

	call_rcu_bh(&head, test_callback);
}

static void early_boot_test_call_rcu_sched(void)
{
	static struct rcu_head head;

	call_rcu_sched(&head, test_callback);
}

void rcu_early_boot_tests(void)
{
	pr_info("Running RCU self tests\n");

	if (rcu_self_test)
		early_boot_test_call_rcu();
	if (rcu_self_test_bh)
		early_boot_test_call_rcu_bh();
	if (rcu_self_test_sched)
		early_boot_test_call_rcu_sched();
	rcu_test_sync_prims();
}

static int rcu_verify_early_boot_tests(void)
{
	int ret = 0;
	int early_boot_test_counter = 0;

	if (rcu_self_test) {
		early_boot_test_counter++;
		rcu_barrier();
	}
	if (rcu_self_test_bh) {
		early_boot_test_counter++;
		rcu_barrier_bh();
	}
	if (rcu_self_test_sched) {
		early_boot_test_counter++;
		rcu_barrier_sched();
	}

	if (rcu_self_test_counter != early_boot_test_counter) {
		WARN_ON(1);
		ret = -1;
	}

	return ret;
}
late_initcall(rcu_verify_early_boot_tests);
#else
void rcu_early_boot_tests(void) {}
#endif /* CONFIG_PROVE_RCU */
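
/*
 * Illustrative note (not part of the original file): because of the
 * MODULE_PARAM_PREFIX at the top of this file, the self tests above are
 * selected from the kernel command line, for example:
 *
 *	rcupdate.rcu_self_test=1 rcupdate.rcu_self_test_bh=1 rcupdate.rcu_self_test_sched=1
 *
 * Each selected flavor posts one callback from rcu_early_boot_tests(), and
 * rcu_verify_early_boot_tests() later warns if any of those callbacks
 * failed to run.
 */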