/*
 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
 * Internal non-public definitions that provide either classic
 * or preemptible semantics.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, you can access it online at
 * http://www.gnu.org/licenses/gpl-2.0.html.
 *
 * Copyright Red Hat, 2009
 * Copyright IBM Corporation, 2009
 *
 * Author: Ingo Molnar <mingo@elte.hu>
 *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com>
 */

#include <linux/delay.h>
#include <linux/gfp.h>
#include <linux/oom.h>
#include <linux/smpboot.h>
#include "../time/tick-internal.h"

#define RCU_KTHREAD_PRIO 1

#ifdef CONFIG_RCU_BOOST
#include "../locking/rtmutex_common.h"
#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
#else
#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
#endif

#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool have_rcu_nocb_mask;	    /* Was rcu_nocb_mask allocated? */
static bool __read_mostly rcu_nocb_poll;    /* Offload kthreads are to poll. */
static char __initdata nocb_buf[NR_CPUS * 5];
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */

/*
 * Check the RCU kernel configuration parameters and print informative
 * messages about anything out of the ordinary.  If you like #ifdef, you
 * will love this function.
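 *
 * For example (illustrative only), the callback-offload machinery above
 * is typically configured from the kernel command line, e.g.:
 *
 *	rcu_nocbs=1-7 rcu_nocb_poll
 *
 * which requests offloading of RCU callbacks for CPUs 1-7 and selects
 * the polling mode controlled by the rcu_nocb_poll flag above.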
53 */ 54 static void __init rcu_bootup_announce_oddness(void) 55 { 56 #ifdef CONFIG_RCU_TRACE 57 pr_info("\tRCU debugfs-based tracing is enabled.\n"); 58 #endif 59 #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) 60 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", 61 CONFIG_RCU_FANOUT); 62 #endif 63 #ifdef CONFIG_RCU_FANOUT_EXACT 64 pr_info("\tHierarchical RCU autobalancing is disabled.\n"); 65 #endif 66 #ifdef CONFIG_RCU_FAST_NO_HZ 67 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); 68 #endif 69 #ifdef CONFIG_PROVE_RCU 70 pr_info("\tRCU lockdep checking is enabled.\n"); 71 #endif 72 #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 73 pr_info("\tRCU torture testing starts during boot.\n"); 74 #endif 75 #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 76 pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); 77 #endif 78 #if defined(CONFIG_RCU_CPU_STALL_INFO) 79 pr_info("\tAdditional per-CPU info printed with stalls.\n"); 80 #endif 81 #if NUM_RCU_LVL_4 != 0 82 pr_info("\tFour-level hierarchy is enabled.\n"); 83 #endif 84 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) 85 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 86 if (nr_cpu_ids != NR_CPUS) 87 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 88 #ifdef CONFIG_RCU_NOCB_CPU 89 #ifndef CONFIG_RCU_NOCB_CPU_NONE 90 if (!have_rcu_nocb_mask) { 91 zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); 92 have_rcu_nocb_mask = true; 93 } 94 #ifdef CONFIG_RCU_NOCB_CPU_ZERO 95 pr_info("\tOffload RCU callbacks from CPU 0\n"); 96 cpumask_set_cpu(0, rcu_nocb_mask); 97 #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ 98 #ifdef CONFIG_RCU_NOCB_CPU_ALL 99 pr_info("\tOffload RCU callbacks from all CPUs\n"); 100 cpumask_copy(rcu_nocb_mask, cpu_possible_mask); 101 #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ 102 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ 103 if (have_rcu_nocb_mask) { 104 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { 105 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); 106 cpumask_and(rcu_nocb_mask, cpu_possible_mask, 107 rcu_nocb_mask); 108 } 109 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 110 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); 111 if (rcu_nocb_poll) 112 pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); 113 } 114 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 115 } 116 117 #ifdef CONFIG_TREE_PREEMPT_RCU 118 119 RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); 120 static struct rcu_state *rcu_state_p = &rcu_preempt_state; 121 122 static int rcu_preempted_readers_exp(struct rcu_node *rnp); 123 124 /* 125 * Tell them what RCU they are running. 126 */ 127 static void __init rcu_bootup_announce(void) 128 { 129 pr_info("Preemptible hierarchical RCU implementation.\n"); 130 rcu_bootup_announce_oddness(); 131 } 132 133 /* 134 * Return the number of RCU-preempt batches processed thus far 135 * for debug and statistics. 136 */ 137 long rcu_batches_completed_preempt(void) 138 { 139 return rcu_preempt_state.completed; 140 } 141 EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); 142 143 /* 144 * Return the number of RCU batches processed thus far for debug & stats. 
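 *
 * For example (illustrative sketch only), a debugging check might sample
 * this counter around a grace period to observe forward progress:
 *
 *	long snap = rcu_batches_completed();
 *
 *	synchronize_rcu();
 *	pr_debug("RCU completed %ld batch(es)\n",
 *		 rcu_batches_completed() - snap);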
145 */ 146 long rcu_batches_completed(void) 147 { 148 return rcu_batches_completed_preempt(); 149 } 150 EXPORT_SYMBOL_GPL(rcu_batches_completed); 151 152 /* 153 * Record a preemptible-RCU quiescent state for the specified CPU. Note 154 * that this just means that the task currently running on the CPU is 155 * not in a quiescent state. There might be any number of tasks blocked 156 * while in an RCU read-side critical section. 157 * 158 * Unlike the other rcu_*_qs() functions, callers to this function 159 * must disable irqs in order to protect the assignment to 160 * ->rcu_read_unlock_special. 161 */ 162 static void rcu_preempt_qs(int cpu) 163 { 164 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 165 166 if (rdp->passed_quiesce == 0) 167 trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); 168 rdp->passed_quiesce = 1; 169 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 170 } 171 172 /* 173 * We have entered the scheduler, and the current task might soon be 174 * context-switched away from. If this task is in an RCU read-side 175 * critical section, we will no longer be able to rely on the CPU to 176 * record that fact, so we enqueue the task on the blkd_tasks list. 177 * The task will dequeue itself when it exits the outermost enclosing 178 * RCU read-side critical section. Therefore, the current grace period 179 * cannot be permitted to complete until the blkd_tasks list entries 180 * predating the current grace period drain, in other words, until 181 * rnp->gp_tasks becomes NULL. 182 * 183 * Caller must disable preemption. 184 */ 185 static void rcu_preempt_note_context_switch(int cpu) 186 { 187 struct task_struct *t = current; 188 unsigned long flags; 189 struct rcu_data *rdp; 190 struct rcu_node *rnp; 191 192 if (t->rcu_read_lock_nesting > 0 && 193 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 194 195 /* Possibly blocking in an RCU read-side critical section. */ 196 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 197 rnp = rdp->mynode; 198 raw_spin_lock_irqsave(&rnp->lock, flags); 199 smp_mb__after_unlock_lock(); 200 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 201 t->rcu_blocked_node = rnp; 202 203 /* 204 * If this CPU has already checked in, then this task 205 * will hold up the next grace period rather than the 206 * current grace period. Queue the task accordingly. 207 * If the task is queued for the current grace period 208 * (i.e., this CPU has not yet passed through a quiescent 209 * state for the current grace period), then as long 210 * as that task remains queued, the current grace period 211 * cannot end. Note that there is some uncertainty as 212 * to exactly when the current grace period started. 213 * We take a conservative approach, which can result 214 * in unnecessarily waiting on tasks that started very 215 * slightly after the current grace period began. C'est 216 * la vie!!! 217 * 218 * But first, note that the current CPU must still be 219 * on line! 
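 *
 * The resulting ->blkd_tasks ordering is then (illustrative sketch):
 *
 *	head -> [tasks blocking only the next GP] -> gp_tasks -> ... -> tail
 *
 * Entries from ->gp_tasks through the tail block the current grace
 * period, while entries added at the head block only the next one.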
220 */ 221 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 222 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 223 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { 224 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); 225 rnp->gp_tasks = &t->rcu_node_entry; 226 #ifdef CONFIG_RCU_BOOST 227 if (rnp->boost_tasks != NULL) 228 rnp->boost_tasks = rnp->gp_tasks; 229 #endif /* #ifdef CONFIG_RCU_BOOST */ 230 } else { 231 list_add(&t->rcu_node_entry, &rnp->blkd_tasks); 232 if (rnp->qsmask & rdp->grpmask) 233 rnp->gp_tasks = &t->rcu_node_entry; 234 } 235 trace_rcu_preempt_task(rdp->rsp->name, 236 t->pid, 237 (rnp->qsmask & rdp->grpmask) 238 ? rnp->gpnum 239 : rnp->gpnum + 1); 240 raw_spin_unlock_irqrestore(&rnp->lock, flags); 241 } else if (t->rcu_read_lock_nesting < 0 && 242 t->rcu_read_unlock_special) { 243 244 /* 245 * Complete exit from RCU read-side critical section on 246 * behalf of preempted instance of __rcu_read_unlock(). 247 */ 248 rcu_read_unlock_special(t); 249 } 250 251 /* 252 * Either we were not in an RCU read-side critical section to 253 * begin with, or we have now recorded that critical section 254 * globally. Either way, we can now note a quiescent state 255 * for this CPU. Again, if we were in an RCU read-side critical 256 * section, and if that critical section was blocking the current 257 * grace period, then the fact that the task has been enqueued 258 * means that we continue to block the current grace period. 259 */ 260 local_irq_save(flags); 261 rcu_preempt_qs(cpu); 262 local_irq_restore(flags); 263 } 264 265 /* 266 * Check for preempted RCU readers blocking the current grace period 267 * for the specified rcu_node structure. If the caller needs a reliable 268 * answer, it must hold the rcu_node's ->lock. 269 */ 270 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) 271 { 272 return rnp->gp_tasks != NULL; 273 } 274 275 /* 276 * Record a quiescent state for all tasks that were previously queued 277 * on the specified rcu_node structure and that were blocking the current 278 * RCU grace period. The caller must hold the specified rnp->lock with 279 * irqs disabled, and this lock is released upon return, but irqs remain 280 * disabled. 281 */ 282 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 283 __releases(rnp->lock) 284 { 285 unsigned long mask; 286 struct rcu_node *rnp_p; 287 288 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 289 raw_spin_unlock_irqrestore(&rnp->lock, flags); 290 return; /* Still need more quiescent states! */ 291 } 292 293 rnp_p = rnp->parent; 294 if (rnp_p == NULL) { 295 /* 296 * Either there is only one rcu_node in the tree, 297 * or tasks were kicked up to root rcu_node due to 298 * CPUs going offline. 299 */ 300 rcu_report_qs_rsp(&rcu_preempt_state, flags); 301 return; 302 } 303 304 /* Report up the rest of the hierarchy. */ 305 mask = rnp->grpmask; 306 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 307 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ 308 smp_mb__after_unlock_lock(); 309 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 310 } 311 312 /* 313 * Advance a ->blkd_tasks-list pointer to the next entry, instead 314 * returning NULL if at the end of the list. 
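 *
 * For example (sketch), rcu_read_unlock_special() below uses this helper
 * to compute the replacement value for ->gp_tasks and friends when it
 * removes a task from the list:
 *
 *	np = rcu_next_node_entry(t, rnp);
 *	list_del_init(&t->rcu_node_entry);
 *	if (&t->rcu_node_entry == rnp->gp_tasks)
 *		rnp->gp_tasks = np;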
315 */ 316 static struct list_head *rcu_next_node_entry(struct task_struct *t, 317 struct rcu_node *rnp) 318 { 319 struct list_head *np; 320 321 np = t->rcu_node_entry.next; 322 if (np == &rnp->blkd_tasks) 323 np = NULL; 324 return np; 325 } 326 327 /* 328 * Handle special cases during rcu_read_unlock(), such as needing to 329 * notify RCU core processing or task having blocked during the RCU 330 * read-side critical section. 331 */ 332 void rcu_read_unlock_special(struct task_struct *t) 333 { 334 int empty; 335 int empty_exp; 336 int empty_exp_now; 337 unsigned long flags; 338 struct list_head *np; 339 #ifdef CONFIG_RCU_BOOST 340 bool drop_boost_mutex = false; 341 #endif /* #ifdef CONFIG_RCU_BOOST */ 342 struct rcu_node *rnp; 343 int special; 344 345 /* NMI handlers cannot block and cannot safely manipulate state. */ 346 if (in_nmi()) 347 return; 348 349 local_irq_save(flags); 350 351 /* 352 * If RCU core is waiting for this CPU to exit critical section, 353 * let it know that we have done so. 354 */ 355 special = t->rcu_read_unlock_special; 356 if (special & RCU_READ_UNLOCK_NEED_QS) { 357 rcu_preempt_qs(smp_processor_id()); 358 if (!t->rcu_read_unlock_special) { 359 local_irq_restore(flags); 360 return; 361 } 362 } 363 364 /* Hardware IRQ handlers cannot block, complain if they get here. */ 365 if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { 366 local_irq_restore(flags); 367 return; 368 } 369 370 /* Clean up if blocked during RCU read-side critical section. */ 371 if (special & RCU_READ_UNLOCK_BLOCKED) { 372 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; 373 374 /* 375 * Remove this task from the list it blocked on. The 376 * task can migrate while we acquire the lock, but at 377 * most one time. So at most two passes through loop. 378 */ 379 for (;;) { 380 rnp = t->rcu_blocked_node; 381 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 382 smp_mb__after_unlock_lock(); 383 if (rnp == t->rcu_blocked_node) 384 break; 385 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 386 } 387 empty = !rcu_preempt_blocked_readers_cgp(rnp); 388 empty_exp = !rcu_preempted_readers_exp(rnp); 389 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 390 np = rcu_next_node_entry(t, rnp); 391 list_del_init(&t->rcu_node_entry); 392 t->rcu_blocked_node = NULL; 393 trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), 394 rnp->gpnum, t->pid); 395 if (&t->rcu_node_entry == rnp->gp_tasks) 396 rnp->gp_tasks = np; 397 if (&t->rcu_node_entry == rnp->exp_tasks) 398 rnp->exp_tasks = np; 399 #ifdef CONFIG_RCU_BOOST 400 if (&t->rcu_node_entry == rnp->boost_tasks) 401 rnp->boost_tasks = np; 402 /* Snapshot ->boost_mtx ownership with rcu_node lock held. */ 403 drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; 404 #endif /* #ifdef CONFIG_RCU_BOOST */ 405 406 /* 407 * If this was the last task on the current list, and if 408 * we aren't waiting on any CPUs, report the quiescent state. 409 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 410 * so we must take a snapshot of the expedited state. 411 */ 412 empty_exp_now = !rcu_preempted_readers_exp(rnp); 413 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 414 trace_rcu_quiescent_state_report(TPS("preempt_rcu"), 415 rnp->gpnum, 416 0, rnp->qsmask, 417 rnp->level, 418 rnp->grplo, 419 rnp->grphi, 420 !!rnp->gp_tasks); 421 rcu_report_unblock_qs_rnp(rnp, flags); 422 } else { 423 raw_spin_unlock_irqrestore(&rnp->lock, flags); 424 } 425 426 #ifdef CONFIG_RCU_BOOST 427 /* Unboost if we were boosted. 
*/ 428 if (drop_boost_mutex) { 429 rt_mutex_unlock(&rnp->boost_mtx); 430 complete(&rnp->boost_completion); 431 } 432 #endif /* #ifdef CONFIG_RCU_BOOST */ 433 434 /* 435 * If this was the last task on the expedited lists, 436 * then we need to report up the rcu_node hierarchy. 437 */ 438 if (!empty_exp && empty_exp_now) 439 rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); 440 } else { 441 local_irq_restore(flags); 442 } 443 } 444 445 #ifdef CONFIG_RCU_CPU_STALL_VERBOSE 446 447 /* 448 * Dump detailed information for all tasks blocking the current RCU 449 * grace period on the specified rcu_node structure. 450 */ 451 static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) 452 { 453 unsigned long flags; 454 struct task_struct *t; 455 456 raw_spin_lock_irqsave(&rnp->lock, flags); 457 if (!rcu_preempt_blocked_readers_cgp(rnp)) { 458 raw_spin_unlock_irqrestore(&rnp->lock, flags); 459 return; 460 } 461 t = list_entry(rnp->gp_tasks, 462 struct task_struct, rcu_node_entry); 463 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 464 sched_show_task(t); 465 raw_spin_unlock_irqrestore(&rnp->lock, flags); 466 } 467 468 /* 469 * Dump detailed information for all tasks blocking the current RCU 470 * grace period. 471 */ 472 static void rcu_print_detail_task_stall(struct rcu_state *rsp) 473 { 474 struct rcu_node *rnp = rcu_get_root(rsp); 475 476 rcu_print_detail_task_stall_rnp(rnp); 477 rcu_for_each_leaf_node(rsp, rnp) 478 rcu_print_detail_task_stall_rnp(rnp); 479 } 480 481 #else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ 482 483 static void rcu_print_detail_task_stall(struct rcu_state *rsp) 484 { 485 } 486 487 #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ 488 489 #ifdef CONFIG_RCU_CPU_STALL_INFO 490 491 static void rcu_print_task_stall_begin(struct rcu_node *rnp) 492 { 493 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", 494 rnp->level, rnp->grplo, rnp->grphi); 495 } 496 497 static void rcu_print_task_stall_end(void) 498 { 499 pr_cont("\n"); 500 } 501 502 #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 503 504 static void rcu_print_task_stall_begin(struct rcu_node *rnp) 505 { 506 } 507 508 static void rcu_print_task_stall_end(void) 509 { 510 } 511 512 #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ 513 514 /* 515 * Scan the current list of tasks blocked within RCU read-side critical 516 * sections, printing out the tid of each. 517 */ 518 static int rcu_print_task_stall(struct rcu_node *rnp) 519 { 520 struct task_struct *t; 521 int ndetected = 0; 522 523 if (!rcu_preempt_blocked_readers_cgp(rnp)) 524 return 0; 525 rcu_print_task_stall_begin(rnp); 526 t = list_entry(rnp->gp_tasks, 527 struct task_struct, rcu_node_entry); 528 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 529 pr_cont(" P%d", t->pid); 530 ndetected++; 531 } 532 rcu_print_task_stall_end(); 533 return ndetected; 534 } 535 536 /* 537 * Check that the list of blocked tasks for the newly completed grace 538 * period is in fact empty. It is a serious bug to complete a grace 539 * period that still has RCU readers blocked! This function must be 540 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock 541 * must be held by the caller. 542 * 543 * Also, if there are blocked tasks on the list, they automatically 544 * block the newly created grace period, so set up ->gp_tasks accordingly. 
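 *
 * For example (illustrative sketch), with two tasks already queued when
 * the new grace period starts:
 *
 *	before:	blkd_tasks -> taskA -> taskB,	gp_tasks == NULL
 *	after:	blkd_tasks -> taskA -> taskB,	gp_tasks == &taskA->rcu_node_entry
 *
 * so that both queued tasks are treated as blocking the new grace period.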
545 */ 546 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 547 { 548 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); 549 if (!list_empty(&rnp->blkd_tasks)) 550 rnp->gp_tasks = rnp->blkd_tasks.next; 551 WARN_ON_ONCE(rnp->qsmask); 552 } 553 554 #ifdef CONFIG_HOTPLUG_CPU 555 556 /* 557 * Handle tasklist migration for case in which all CPUs covered by the 558 * specified rcu_node have gone offline. Move them up to the root 559 * rcu_node. The reason for not just moving them to the immediate 560 * parent is to remove the need for rcu_read_unlock_special() to 561 * make more than two attempts to acquire the target rcu_node's lock. 562 * Returns true if there were tasks blocking the current RCU grace 563 * period. 564 * 565 * Returns 1 if there was previously a task blocking the current grace 566 * period on the specified rcu_node structure. 567 * 568 * The caller must hold rnp->lock with irqs disabled. 569 */ 570 static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 571 struct rcu_node *rnp, 572 struct rcu_data *rdp) 573 { 574 struct list_head *lp; 575 struct list_head *lp_root; 576 int retval = 0; 577 struct rcu_node *rnp_root = rcu_get_root(rsp); 578 struct task_struct *t; 579 580 if (rnp == rnp_root) { 581 WARN_ONCE(1, "Last CPU thought to be offlined?"); 582 return 0; /* Shouldn't happen: at least one CPU online. */ 583 } 584 585 /* If we are on an internal node, complain bitterly. */ 586 WARN_ON_ONCE(rnp != rdp->mynode); 587 588 /* 589 * Move tasks up to root rcu_node. Don't try to get fancy for 590 * this corner-case operation -- just put this node's tasks 591 * at the head of the root node's list, and update the root node's 592 * ->gp_tasks and ->exp_tasks pointers to those of this node's, 593 * if non-NULL. This might result in waiting for more tasks than 594 * absolutely necessary, but this is a good performance/complexity 595 * tradeoff. 596 */ 597 if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) 598 retval |= RCU_OFL_TASKS_NORM_GP; 599 if (rcu_preempted_readers_exp(rnp)) 600 retval |= RCU_OFL_TASKS_EXP_GP; 601 lp = &rnp->blkd_tasks; 602 lp_root = &rnp_root->blkd_tasks; 603 while (!list_empty(lp)) { 604 t = list_entry(lp->next, typeof(*t), rcu_node_entry); 605 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 606 smp_mb__after_unlock_lock(); 607 list_del(&t->rcu_node_entry); 608 t->rcu_blocked_node = rnp_root; 609 list_add(&t->rcu_node_entry, lp_root); 610 if (&t->rcu_node_entry == rnp->gp_tasks) 611 rnp_root->gp_tasks = rnp->gp_tasks; 612 if (&t->rcu_node_entry == rnp->exp_tasks) 613 rnp_root->exp_tasks = rnp->exp_tasks; 614 #ifdef CONFIG_RCU_BOOST 615 if (&t->rcu_node_entry == rnp->boost_tasks) 616 rnp_root->boost_tasks = rnp->boost_tasks; 617 #endif /* #ifdef CONFIG_RCU_BOOST */ 618 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 619 } 620 621 rnp->gp_tasks = NULL; 622 rnp->exp_tasks = NULL; 623 #ifdef CONFIG_RCU_BOOST 624 rnp->boost_tasks = NULL; 625 /* 626 * In case root is being boosted and leaf was not. Make sure 627 * that we boost the tasks blocking the current grace period 628 * in this case. 
629 */ 630 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ 631 smp_mb__after_unlock_lock(); 632 if (rnp_root->boost_tasks != NULL && 633 rnp_root->boost_tasks != rnp_root->gp_tasks && 634 rnp_root->boost_tasks != rnp_root->exp_tasks) 635 rnp_root->boost_tasks = rnp_root->gp_tasks; 636 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ 637 #endif /* #ifdef CONFIG_RCU_BOOST */ 638 639 return retval; 640 } 641 642 #endif /* #ifdef CONFIG_HOTPLUG_CPU */ 643 644 /* 645 * Check for a quiescent state from the current CPU. When a task blocks, 646 * the task is recorded in the corresponding CPU's rcu_node structure, 647 * which is checked elsewhere. 648 * 649 * Caller must disable hard irqs. 650 */ 651 static void rcu_preempt_check_callbacks(int cpu) 652 { 653 struct task_struct *t = current; 654 655 if (t->rcu_read_lock_nesting == 0) { 656 rcu_preempt_qs(cpu); 657 return; 658 } 659 if (t->rcu_read_lock_nesting > 0 && 660 per_cpu(rcu_preempt_data, cpu).qs_pending) 661 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 662 } 663 664 #ifdef CONFIG_RCU_BOOST 665 666 static void rcu_preempt_do_callbacks(void) 667 { 668 rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); 669 } 670 671 #endif /* #ifdef CONFIG_RCU_BOOST */ 672 673 /* 674 * Queue a preemptible-RCU callback for invocation after a grace period. 675 */ 676 void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 677 { 678 __call_rcu(head, func, &rcu_preempt_state, -1, 0); 679 } 680 EXPORT_SYMBOL_GPL(call_rcu); 681 682 /** 683 * synchronize_rcu - wait until a grace period has elapsed. 684 * 685 * Control will return to the caller some time after a full grace 686 * period has elapsed, in other words after all currently executing RCU 687 * read-side critical sections have completed. Note, however, that 688 * upon return from synchronize_rcu(), the caller might well be executing 689 * concurrently with new RCU read-side critical sections that began while 690 * synchronize_rcu() was waiting. RCU read-side critical sections are 691 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. 692 * 693 * See the description of synchronize_sched() for more detailed information 694 * on memory ordering guarantees. 695 */ 696 void synchronize_rcu(void) 697 { 698 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 699 !lock_is_held(&rcu_lock_map) && 700 !lock_is_held(&rcu_sched_lock_map), 701 "Illegal synchronize_rcu() in RCU read-side critical section"); 702 if (!rcu_scheduler_active) 703 return; 704 if (rcu_expedited) 705 synchronize_rcu_expedited(); 706 else 707 wait_rcu_gp(call_rcu); 708 } 709 EXPORT_SYMBOL_GPL(synchronize_rcu); 710 711 static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); 712 static unsigned long sync_rcu_preempt_exp_count; 713 static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); 714 715 /* 716 * Return non-zero if there are any tasks in RCU read-side critical 717 * sections blocking the current preemptible-RCU expedited grace period. 718 * If there is no preemptible-RCU expedited grace period currently in 719 * progress, returns zero unconditionally. 720 */ 721 static int rcu_preempted_readers_exp(struct rcu_node *rnp) 722 { 723 return rnp->exp_tasks != NULL; 724 } 725 726 /* 727 * return non-zero if there is no RCU expedited grace period in progress 728 * for the specified rcu_node structure, in other words, if all CPUs and 729 * tasks covered by the specified rcu_node structure have done their bit 730 * for the current expedited grace period. 
 * Works only for preemptible RCU -- other RCU implementations use other means.
 *
 * Caller must hold sync_rcu_preempt_exp_mutex.
 */
static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
	return !rcu_preempted_readers_exp(rnp) &&
	       ACCESS_ONCE(rnp->expmask) == 0;
}

/*
 * Report the exit from RCU read-side critical section for the last task
 * that queued itself during or before the current expedited preemptible-RCU
 * grace period.  This event is reported either to the rcu_node structure on
 * which the task was queued or to one of that rcu_node structure's ancestors,
 * recursively up the tree.  (Calm down, calm down, we do the recursion
 * iteratively!)
 *
 * Most callers will set the "wake" flag, but the task initiating the
 * expedited grace period need not wake itself.
 *
 * Caller must hold sync_rcu_preempt_exp_mutex.
 */
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
			       bool wake)
{
	unsigned long flags;
	unsigned long mask;

	raw_spin_lock_irqsave(&rnp->lock, flags);
	smp_mb__after_unlock_lock();
	for (;;) {
		if (!sync_rcu_preempt_exp_done(rnp)) {
			raw_spin_unlock_irqrestore(&rnp->lock, flags);
			break;
		}
		if (rnp->parent == NULL) {
			raw_spin_unlock_irqrestore(&rnp->lock, flags);
			if (wake) {
				smp_mb(); /* EGP done before wake_up(). */
				wake_up(&sync_rcu_preempt_exp_wq);
			}
			break;
		}
		mask = rnp->grpmask;
		raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
		rnp = rnp->parent;
		raw_spin_lock(&rnp->lock); /* irqs already disabled */
		smp_mb__after_unlock_lock();
		rnp->expmask &= ~mask;
	}
}

/*
 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
 * grace period for the specified rcu_node structure.  If there are no such
 * tasks, report it up the rcu_node hierarchy.
 *
 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
 * CPU hotplug operations.
 */
static void
sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
{
	unsigned long flags;
	int must_wait = 0;

	raw_spin_lock_irqsave(&rnp->lock, flags);
	smp_mb__after_unlock_lock();
	if (list_empty(&rnp->blkd_tasks)) {
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
	} else {
		rnp->exp_tasks = rnp->blkd_tasks.next;
		rcu_initiate_boost(rnp, flags);	/* releases rnp->lock */
		must_wait = 1;
	}
	if (!must_wait)
		rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
}

/**
 * synchronize_rcu_expedited - Brute-force RCU grace period
 *
 * Wait for an RCU-preempt grace period, but expedite it.  The basic
 * idea is to invoke synchronize_sched_expedited() to push all the tasks to
 * the ->blkd_tasks lists and wait for this list to drain.  This consumes
 * significant time on all CPUs and is unfriendly to real-time workloads,
 * and is thus not recommended for any sort of common-case code.
 * In fact, if you are using synchronize_rcu_expedited() in a loop,
 * please restructure your code to batch your updates, and then use a
 * single synchronize_rcu() instead.
 *
 * Note that it is illegal to call this function while holding any lock
 * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal
 * to call this function from a CPU-hotplug notifier.  Failing to observe
 * these restrictions will result in deadlock.
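 *
 * For example (illustrative sketch; update_element() and
 * free_old_element() are hypothetical), rather than:
 *
 *	for (i = 0; i < n; i++) {
 *		update_element(i);
 *		synchronize_rcu_expedited();
 *		free_old_element(i);
 *	}
 *
 * batch the updates and wait for a single grace period:
 *
 *	for (i = 0; i < n; i++)
 *		update_element(i);
 *	synchronize_rcu();
 *	for (i = 0; i < n; i++)
 *		free_old_element(i);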
827 */ 828 void synchronize_rcu_expedited(void) 829 { 830 unsigned long flags; 831 struct rcu_node *rnp; 832 struct rcu_state *rsp = &rcu_preempt_state; 833 unsigned long snap; 834 int trycount = 0; 835 836 smp_mb(); /* Caller's modifications seen first by other CPUs. */ 837 snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; 838 smp_mb(); /* Above access cannot bleed into critical section. */ 839 840 /* 841 * Block CPU-hotplug operations. This means that any CPU-hotplug 842 * operation that finds an rcu_node structure with tasks in the 843 * process of being boosted will know that all tasks blocking 844 * this expedited grace period will already be in the process of 845 * being boosted. This simplifies the process of moving tasks 846 * from leaf to root rcu_node structures. 847 */ 848 get_online_cpus(); 849 850 /* 851 * Acquire lock, falling back to synchronize_rcu() if too many 852 * lock-acquisition failures. Of course, if someone does the 853 * expedited grace period for us, just leave. 854 */ 855 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { 856 if (ULONG_CMP_LT(snap, 857 ACCESS_ONCE(sync_rcu_preempt_exp_count))) { 858 put_online_cpus(); 859 goto mb_ret; /* Others did our work for us. */ 860 } 861 if (trycount++ < 10) { 862 udelay(trycount * num_online_cpus()); 863 } else { 864 put_online_cpus(); 865 wait_rcu_gp(call_rcu); 866 return; 867 } 868 } 869 if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { 870 put_online_cpus(); 871 goto unlock_mb_ret; /* Others did our work for us. */ 872 } 873 874 /* force all RCU readers onto ->blkd_tasks lists. */ 875 synchronize_sched_expedited(); 876 877 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 878 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 879 raw_spin_lock_irqsave(&rnp->lock, flags); 880 smp_mb__after_unlock_lock(); 881 rnp->expmask = rnp->qsmaskinit; 882 raw_spin_unlock_irqrestore(&rnp->lock, flags); 883 } 884 885 /* Snapshot current state of ->blkd_tasks lists. */ 886 rcu_for_each_leaf_node(rsp, rnp) 887 sync_rcu_preempt_exp_init(rsp, rnp); 888 if (NUM_RCU_NODES > 1) 889 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 890 891 put_online_cpus(); 892 893 /* Wait for snapshotted ->blkd_tasks lists to drain. */ 894 rnp = rcu_get_root(rsp); 895 wait_event(sync_rcu_preempt_exp_wq, 896 sync_rcu_preempt_exp_done(rnp)); 897 898 /* Clean up and exit. */ 899 smp_mb(); /* ensure expedited GP seen before counter increment. */ 900 ACCESS_ONCE(sync_rcu_preempt_exp_count)++; 901 unlock_mb_ret: 902 mutex_unlock(&sync_rcu_preempt_exp_mutex); 903 mb_ret: 904 smp_mb(); /* ensure subsequent action seen after grace period. */ 905 } 906 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 907 908 /** 909 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. 910 * 911 * Note that this primitive does not necessarily wait for an RCU grace period 912 * to complete. For example, if there are no RCU callbacks queued anywhere 913 * in the system, then rcu_barrier() is within its rights to return 914 * immediately, without waiting for anything, much less an RCU grace period. 915 */ 916 void rcu_barrier(void) 917 { 918 _rcu_barrier(&rcu_preempt_state); 919 } 920 EXPORT_SYMBOL_GPL(rcu_barrier); 921 922 /* 923 * Initialize preemptible RCU's state structures. 924 */ 925 static void __init __rcu_init_preempt(void) 926 { 927 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); 928 } 929 930 /* 931 * Check for a task exiting while in a preemptible-RCU read-side 932 * critical section, clean up if so. 
 * No need to issue warnings, as debug_check_no_locks_held()
 * already does this if lockdep is enabled.
 */
void exit_rcu(void)
{
	struct task_struct *t = current;

	if (likely(list_empty(&current->rcu_node_entry)))
		return;
	t->rcu_read_lock_nesting = 1;
	barrier();
	t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
	__rcu_read_unlock();
}

#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */

static struct rcu_state *rcu_state_p = &rcu_sched_state;

/*
 * Tell them what RCU they are running.
 */
static void __init rcu_bootup_announce(void)
{
	pr_info("Hierarchical RCU implementation.\n");
	rcu_bootup_announce_oddness();
}

/*
 * Return the number of RCU batches processed thus far for debug & stats.
 */
long rcu_batches_completed(void)
{
	return rcu_batches_completed_sched();
}
EXPORT_SYMBOL_GPL(rcu_batches_completed);

/*
 * Because preemptible RCU does not exist, we never have to check for
 * CPUs being in quiescent states.
 */
static void rcu_preempt_note_context_switch(int cpu)
{
}

/*
 * Because preemptible RCU does not exist, there are never any preempted
 * RCU readers.
 */
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
	return 0;
}

#ifdef CONFIG_HOTPLUG_CPU

/* Because preemptible RCU does not exist, no quieting of tasks. */
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
	__releases(rnp->lock)
{
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
}

#endif /* #ifdef CONFIG_HOTPLUG_CPU */

/*
 * Because preemptible RCU does not exist, we never have to check for
 * tasks blocked within RCU read-side critical sections.
 */
static void rcu_print_detail_task_stall(struct rcu_state *rsp)
{
}

/*
 * Because preemptible RCU does not exist, we never have to check for
 * tasks blocked within RCU read-side critical sections.
 */
static int rcu_print_task_stall(struct rcu_node *rnp)
{
	return 0;
}

/*
 * Because there is no preemptible RCU, there can be no readers blocked,
 * so there is no need to check for blocked tasks.  So check only for
 * bogus qsmask values.
 */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
	WARN_ON_ONCE(rnp->qsmask);
}

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Because preemptible RCU does not exist, it never needs to migrate
 * tasks that were blocked within RCU read-side critical sections, and
 * such non-existent tasks cannot possibly have been blocking the current
 * grace period.
 */
static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
				     struct rcu_node *rnp,
				     struct rcu_data *rdp)
{
	return 0;
}

#endif /* #ifdef CONFIG_HOTPLUG_CPU */

/*
 * Because preemptible RCU does not exist, it never has any callbacks
 * to check.
 */
static void rcu_preempt_check_callbacks(int cpu)
{
}

/*
 * Wait for an rcu-preempt grace period, but make it happen quickly.
 * But because preemptible RCU does not exist, map to rcu-sched.
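 * (In this configuration rcu_read_lock() does not allow the read-side
 * critical section to be preempted, so every RCU read-side critical
 * section is also an RCU-sched read-side critical section, and an
 * expedited RCU-sched grace period therefore covers it.)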
1053 */ 1054 void synchronize_rcu_expedited(void) 1055 { 1056 synchronize_sched_expedited(); 1057 } 1058 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 1059 1060 #ifdef CONFIG_HOTPLUG_CPU 1061 1062 /* 1063 * Because preemptible RCU does not exist, there is never any need to 1064 * report on tasks preempted in RCU read-side critical sections during 1065 * expedited RCU grace periods. 1066 */ 1067 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 1068 bool wake) 1069 { 1070 } 1071 1072 #endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1073 1074 /* 1075 * Because preemptible RCU does not exist, rcu_barrier() is just 1076 * another name for rcu_barrier_sched(). 1077 */ 1078 void rcu_barrier(void) 1079 { 1080 rcu_barrier_sched(); 1081 } 1082 EXPORT_SYMBOL_GPL(rcu_barrier); 1083 1084 /* 1085 * Because preemptible RCU does not exist, it need not be initialized. 1086 */ 1087 static void __init __rcu_init_preempt(void) 1088 { 1089 } 1090 1091 /* 1092 * Because preemptible RCU does not exist, tasks cannot possibly exit 1093 * while in preemptible RCU read-side critical sections. 1094 */ 1095 void exit_rcu(void) 1096 { 1097 } 1098 1099 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1100 1101 #ifdef CONFIG_RCU_BOOST 1102 1103 #include "../locking/rtmutex_common.h" 1104 1105 #ifdef CONFIG_RCU_TRACE 1106 1107 static void rcu_initiate_boost_trace(struct rcu_node *rnp) 1108 { 1109 if (list_empty(&rnp->blkd_tasks)) 1110 rnp->n_balk_blkd_tasks++; 1111 else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) 1112 rnp->n_balk_exp_gp_tasks++; 1113 else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) 1114 rnp->n_balk_boost_tasks++; 1115 else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) 1116 rnp->n_balk_notblocked++; 1117 else if (rnp->gp_tasks != NULL && 1118 ULONG_CMP_LT(jiffies, rnp->boost_time)) 1119 rnp->n_balk_notyet++; 1120 else 1121 rnp->n_balk_nos++; 1122 } 1123 1124 #else /* #ifdef CONFIG_RCU_TRACE */ 1125 1126 static void rcu_initiate_boost_trace(struct rcu_node *rnp) 1127 { 1128 } 1129 1130 #endif /* #else #ifdef CONFIG_RCU_TRACE */ 1131 1132 static void rcu_wake_cond(struct task_struct *t, int status) 1133 { 1134 /* 1135 * If the thread is yielding, only wake it when this 1136 * is invoked from idle 1137 */ 1138 if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) 1139 wake_up_process(t); 1140 } 1141 1142 /* 1143 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1144 * or ->boost_tasks, advancing the pointer to the next task in the 1145 * ->blkd_tasks list. 1146 * 1147 * Note that irqs must be enabled: boosting the task can block. 1148 * Returns 1 if there are more tasks needing to be boosted. 1149 */ 1150 static int rcu_boost(struct rcu_node *rnp) 1151 { 1152 unsigned long flags; 1153 struct task_struct *t; 1154 struct list_head *tb; 1155 1156 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) 1157 return 0; /* Nothing left to boost. */ 1158 1159 raw_spin_lock_irqsave(&rnp->lock, flags); 1160 smp_mb__after_unlock_lock(); 1161 1162 /* 1163 * Recheck under the lock: all tasks in need of boosting 1164 * might exit their RCU read-side critical sections on their own. 1165 */ 1166 if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { 1167 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1168 return 0; 1169 } 1170 1171 /* 1172 * Preferentially boost tasks blocking expedited grace periods. 
1173 * This cannot starve the normal grace periods because a second 1174 * expedited grace period must boost all blocked tasks, including 1175 * those blocking the pre-existing normal grace period. 1176 */ 1177 if (rnp->exp_tasks != NULL) { 1178 tb = rnp->exp_tasks; 1179 rnp->n_exp_boosts++; 1180 } else { 1181 tb = rnp->boost_tasks; 1182 rnp->n_normal_boosts++; 1183 } 1184 rnp->n_tasks_boosted++; 1185 1186 /* 1187 * We boost task t by manufacturing an rt_mutex that appears to 1188 * be held by task t. We leave a pointer to that rt_mutex where 1189 * task t can find it, and task t will release the mutex when it 1190 * exits its outermost RCU read-side critical section. Then 1191 * simply acquiring this artificial rt_mutex will boost task 1192 * t's priority. (Thanks to tglx for suggesting this approach!) 1193 * 1194 * Note that task t must acquire rnp->lock to remove itself from 1195 * the ->blkd_tasks list, which it will do from exit() if from 1196 * nowhere else. We therefore are guaranteed that task t will 1197 * stay around at least until we drop rnp->lock. Note that 1198 * rnp->lock also resolves races between our priority boosting 1199 * and task t's exiting its outermost RCU read-side critical 1200 * section. 1201 */ 1202 t = container_of(tb, struct task_struct, rcu_node_entry); 1203 rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); 1204 init_completion(&rnp->boost_completion); 1205 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1206 /* Lock only for side effect: boosts task t's priority. */ 1207 rt_mutex_lock(&rnp->boost_mtx); 1208 rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ 1209 1210 /* Wait for boostee to be done w/boost_mtx before reinitializing. */ 1211 wait_for_completion(&rnp->boost_completion); 1212 1213 return ACCESS_ONCE(rnp->exp_tasks) != NULL || 1214 ACCESS_ONCE(rnp->boost_tasks) != NULL; 1215 } 1216 1217 /* 1218 * Priority-boosting kthread. One per leaf rcu_node and one for the 1219 * root rcu_node. 1220 */ 1221 static int rcu_boost_kthread(void *arg) 1222 { 1223 struct rcu_node *rnp = (struct rcu_node *)arg; 1224 int spincnt = 0; 1225 int more2boost; 1226 1227 trace_rcu_utilization(TPS("Start boost kthread@init")); 1228 for (;;) { 1229 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1230 trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); 1231 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1232 trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); 1233 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1234 more2boost = rcu_boost(rnp); 1235 if (more2boost) 1236 spincnt++; 1237 else 1238 spincnt = 0; 1239 if (spincnt > 10) { 1240 rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; 1241 trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); 1242 schedule_timeout_interruptible(2); 1243 trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); 1244 spincnt = 0; 1245 } 1246 } 1247 /* NOTREACHED */ 1248 trace_rcu_utilization(TPS("End boost kthread@notreached")); 1249 return 0; 1250 } 1251 1252 /* 1253 * Check to see if it is time to start boosting RCU readers that are 1254 * blocking the current grace period, and, if so, tell the per-rcu_node 1255 * kthread to start boosting them. If there is an expedited grace 1256 * period in progress, it is always time to boost. 1257 * 1258 * The caller must hold rnp->lock, which this function releases. 1259 * The ->boost_kthread_task is immortal, so we don't need to worry 1260 * about it going away. 
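 *
 * For reference, the boost itself (see rcu_boost() above) uses a
 * proxy-lock pattern (descriptive sketch):
 *
 *	rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
 *	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 *	rt_mutex_lock(&rnp->boost_mtx);
 *	rt_mutex_unlock(&rnp->boost_mtx);
 *
 * The rt_mutex_lock() call blocks and priority-inherits to task t until
 * t releases ->boost_mtx from rcu_read_unlock_special() upon leaving its
 * outermost RCU read-side critical section.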
1261 */ 1262 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1263 __releases(rnp->lock) 1264 { 1265 struct task_struct *t; 1266 1267 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { 1268 rnp->n_balk_exp_gp_tasks++; 1269 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1270 return; 1271 } 1272 if (rnp->exp_tasks != NULL || 1273 (rnp->gp_tasks != NULL && 1274 rnp->boost_tasks == NULL && 1275 rnp->qsmask == 0 && 1276 ULONG_CMP_GE(jiffies, rnp->boost_time))) { 1277 if (rnp->exp_tasks == NULL) 1278 rnp->boost_tasks = rnp->gp_tasks; 1279 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1280 t = rnp->boost_kthread_task; 1281 if (t) 1282 rcu_wake_cond(t, rnp->boost_kthread_status); 1283 } else { 1284 rcu_initiate_boost_trace(rnp); 1285 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1286 } 1287 } 1288 1289 /* 1290 * Wake up the per-CPU kthread to invoke RCU callbacks. 1291 */ 1292 static void invoke_rcu_callbacks_kthread(void) 1293 { 1294 unsigned long flags; 1295 1296 local_irq_save(flags); 1297 __this_cpu_write(rcu_cpu_has_work, 1); 1298 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && 1299 current != __this_cpu_read(rcu_cpu_kthread_task)) { 1300 rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), 1301 __this_cpu_read(rcu_cpu_kthread_status)); 1302 } 1303 local_irq_restore(flags); 1304 } 1305 1306 /* 1307 * Is the current CPU running the RCU-callbacks kthread? 1308 * Caller must have preemption disabled. 1309 */ 1310 static bool rcu_is_callbacks_kthread(void) 1311 { 1312 return __this_cpu_read(rcu_cpu_kthread_task) == current; 1313 } 1314 1315 #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1316 1317 /* 1318 * Do priority-boost accounting for the start of a new grace period. 1319 */ 1320 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1321 { 1322 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; 1323 } 1324 1325 /* 1326 * Create an RCU-boost kthread for the specified node if one does not 1327 * already exist. We only create this kthread for preemptible RCU. 1328 * Returns zero if all is well, a negated errno otherwise. 1329 */ 1330 static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1331 struct rcu_node *rnp) 1332 { 1333 int rnp_index = rnp - &rsp->node[0]; 1334 unsigned long flags; 1335 struct sched_param sp; 1336 struct task_struct *t; 1337 1338 if (&rcu_preempt_state != rsp) 1339 return 0; 1340 1341 if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) 1342 return 0; 1343 1344 rsp->boost = 1; 1345 if (rnp->boost_kthread_task != NULL) 1346 return 0; 1347 t = kthread_create(rcu_boost_kthread, (void *)rnp, 1348 "rcub/%d", rnp_index); 1349 if (IS_ERR(t)) 1350 return PTR_ERR(t); 1351 raw_spin_lock_irqsave(&rnp->lock, flags); 1352 smp_mb__after_unlock_lock(); 1353 rnp->boost_kthread_task = t; 1354 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1355 sp.sched_priority = RCU_BOOST_PRIO; 1356 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1357 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. 
*/ 1358 return 0; 1359 } 1360 1361 static void rcu_kthread_do_work(void) 1362 { 1363 rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); 1364 rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); 1365 rcu_preempt_do_callbacks(); 1366 } 1367 1368 static void rcu_cpu_kthread_setup(unsigned int cpu) 1369 { 1370 struct sched_param sp; 1371 1372 sp.sched_priority = RCU_KTHREAD_PRIO; 1373 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); 1374 } 1375 1376 static void rcu_cpu_kthread_park(unsigned int cpu) 1377 { 1378 per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; 1379 } 1380 1381 static int rcu_cpu_kthread_should_run(unsigned int cpu) 1382 { 1383 return __this_cpu_read(rcu_cpu_has_work); 1384 } 1385 1386 /* 1387 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the 1388 * RCU softirq used in flavors and configurations of RCU that do not 1389 * support RCU priority boosting. 1390 */ 1391 static void rcu_cpu_kthread(unsigned int cpu) 1392 { 1393 unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); 1394 char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); 1395 int spincnt; 1396 1397 for (spincnt = 0; spincnt < 10; spincnt++) { 1398 trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); 1399 local_bh_disable(); 1400 *statusp = RCU_KTHREAD_RUNNING; 1401 this_cpu_inc(rcu_cpu_kthread_loops); 1402 local_irq_disable(); 1403 work = *workp; 1404 *workp = 0; 1405 local_irq_enable(); 1406 if (work) 1407 rcu_kthread_do_work(); 1408 local_bh_enable(); 1409 if (*workp == 0) { 1410 trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); 1411 *statusp = RCU_KTHREAD_WAITING; 1412 return; 1413 } 1414 } 1415 *statusp = RCU_KTHREAD_YIELDING; 1416 trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); 1417 schedule_timeout_interruptible(2); 1418 trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); 1419 *statusp = RCU_KTHREAD_WAITING; 1420 } 1421 1422 /* 1423 * Set the per-rcu_node kthread's affinity to cover all CPUs that are 1424 * served by the rcu_node in question. The CPU hotplug lock is still 1425 * held, so the value of rnp->qsmaskinit will be stable. 1426 * 1427 * We don't include outgoingcpu in the affinity set, use -1 if there is 1428 * no outgoing CPU. If there are no CPUs left in the affinity set, 1429 * this function allows the kthread to execute on any CPU. 1430 */ 1431 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1432 { 1433 struct task_struct *t = rnp->boost_kthread_task; 1434 unsigned long mask = rnp->qsmaskinit; 1435 cpumask_var_t cm; 1436 int cpu; 1437 1438 if (!t) 1439 return; 1440 if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) 1441 return; 1442 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) 1443 if ((mask & 0x1) && cpu != outgoingcpu) 1444 cpumask_set_cpu(cpu, cm); 1445 if (cpumask_weight(cm) == 0) { 1446 cpumask_setall(cm); 1447 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) 1448 cpumask_clear_cpu(cpu, cm); 1449 WARN_ON_ONCE(cpumask_weight(cm) == 0); 1450 } 1451 set_cpus_allowed_ptr(t, cm); 1452 free_cpumask_var(cm); 1453 } 1454 1455 static struct smp_hotplug_thread rcu_cpu_thread_spec = { 1456 .store = &rcu_cpu_kthread_task, 1457 .thread_should_run = rcu_cpu_kthread_should_run, 1458 .thread_fn = rcu_cpu_kthread, 1459 .thread_comm = "rcuc/%u", 1460 .setup = rcu_cpu_kthread_setup, 1461 .park = rcu_cpu_kthread_park, 1462 }; 1463 1464 /* 1465 * Spawn all kthreads -- called as soon as the scheduler is running. 
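 *
 * For reference (descriptive sketch): smpboot_register_percpu_thread()
 * creates one "rcuc/%u" kthread per CPU from rcu_cpu_thread_spec above
 * and parks/unparks them across CPU-hotplug events, while
 * rcu_spawn_one_boost_kthread() creates the per-rcu_node "rcub/%d" boost
 * kthreads (one per leaf rcu_node plus one for the root).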
1466 */ 1467 static int __init rcu_spawn_kthreads(void) 1468 { 1469 struct rcu_node *rnp; 1470 int cpu; 1471 1472 rcu_scheduler_fully_active = 1; 1473 for_each_possible_cpu(cpu) 1474 per_cpu(rcu_cpu_has_work, cpu) = 0; 1475 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1476 rnp = rcu_get_root(rcu_state_p); 1477 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); 1478 if (NUM_RCU_NODES > 1) { 1479 rcu_for_each_leaf_node(rcu_state_p, rnp) 1480 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); 1481 } 1482 return 0; 1483 } 1484 early_initcall(rcu_spawn_kthreads); 1485 1486 static void rcu_prepare_kthreads(int cpu) 1487 { 1488 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); 1489 struct rcu_node *rnp = rdp->mynode; 1490 1491 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ 1492 if (rcu_scheduler_fully_active) 1493 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); 1494 } 1495 1496 #else /* #ifdef CONFIG_RCU_BOOST */ 1497 1498 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1499 __releases(rnp->lock) 1500 { 1501 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1502 } 1503 1504 static void invoke_rcu_callbacks_kthread(void) 1505 { 1506 WARN_ON_ONCE(1); 1507 } 1508 1509 static bool rcu_is_callbacks_kthread(void) 1510 { 1511 return false; 1512 } 1513 1514 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1515 { 1516 } 1517 1518 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1519 { 1520 } 1521 1522 static int __init rcu_scheduler_really_started(void) 1523 { 1524 rcu_scheduler_fully_active = 1; 1525 return 0; 1526 } 1527 early_initcall(rcu_scheduler_really_started); 1528 1529 static void rcu_prepare_kthreads(int cpu) 1530 { 1531 } 1532 1533 #endif /* #else #ifdef CONFIG_RCU_BOOST */ 1534 1535 #if !defined(CONFIG_RCU_FAST_NO_HZ) 1536 1537 /* 1538 * Check to see if any future RCU-related work will need to be done 1539 * by the current CPU, even if none need be done immediately, returning 1540 * 1 if so. This function is part of the RCU implementation; it is -not- 1541 * an exported member of the RCU API. 1542 * 1543 * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs 1544 * any flavor of RCU. 1545 */ 1546 #ifndef CONFIG_RCU_NOCB_CPU_ALL 1547 int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1548 { 1549 *delta_jiffies = ULONG_MAX; 1550 return rcu_cpu_has_callbacks(cpu, NULL); 1551 } 1552 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 1553 1554 /* 1555 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up 1556 * after it. 1557 */ 1558 static void rcu_cleanup_after_idle(int cpu) 1559 { 1560 } 1561 1562 /* 1563 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, 1564 * is nothing. 1565 */ 1566 static void rcu_prepare_for_idle(int cpu) 1567 { 1568 } 1569 1570 /* 1571 * Don't bother keeping a running count of the number of RCU callbacks 1572 * posted because CONFIG_RCU_FAST_NO_HZ=n. 1573 */ 1574 static void rcu_idle_count_callbacks_posted(void) 1575 { 1576 } 1577 1578 #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1579 1580 /* 1581 * This code is invoked when a CPU goes idle, at which point we want 1582 * to have the CPU do everything required for RCU so that it can enter 1583 * the energy-efficient dyntick-idle mode. This is handled by a 1584 * state machine implemented by rcu_prepare_for_idle() below. 
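 *
 * (As a concrete example of the timescales involved: with the default
 * values defined below and HZ=1000, a CPU with non-lazy callbacks
 * pending may sleep for roughly four milliseconds, whereas a CPU with
 * only lazy callbacks pending may sleep for roughly six seconds; both
 * defaults can be adjusted at runtime via the rcu_idle_gp_delay and
 * rcu_idle_lazy_gp_delay module parameters.)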
1585 * 1586 * The following three proprocessor symbols control this state machine: 1587 * 1588 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted 1589 * to sleep in dyntick-idle mode with RCU callbacks pending. This 1590 * is sized to be roughly one RCU grace period. Those energy-efficiency 1591 * benchmarkers who might otherwise be tempted to set this to a large 1592 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your 1593 * system. And if you are -that- concerned about energy efficiency, 1594 * just power the system down and be done with it! 1595 * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is 1596 * permitted to sleep in dyntick-idle mode with only lazy RCU 1597 * callbacks pending. Setting this too high can OOM your system. 1598 * 1599 * The values below work well in practice. If future workloads require 1600 * adjustment, they can be converted into kernel config parameters, though 1601 * making the state machine smarter might be a better option. 1602 */ 1603 #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ 1604 #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1605 1606 static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; 1607 module_param(rcu_idle_gp_delay, int, 0644); 1608 static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; 1609 module_param(rcu_idle_lazy_gp_delay, int, 0644); 1610 1611 extern int tick_nohz_active; 1612 1613 /* 1614 * Try to advance callbacks for all flavors of RCU on the current CPU, but 1615 * only if it has been awhile since the last time we did so. Afterwards, 1616 * if there are any callbacks ready for immediate invocation, return true. 1617 */ 1618 static bool __maybe_unused rcu_try_advance_all_cbs(void) 1619 { 1620 bool cbs_ready = false; 1621 struct rcu_data *rdp; 1622 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 1623 struct rcu_node *rnp; 1624 struct rcu_state *rsp; 1625 1626 /* Exit early if we advanced recently. */ 1627 if (jiffies == rdtp->last_advance_all) 1628 return 0; 1629 rdtp->last_advance_all = jiffies; 1630 1631 for_each_rcu_flavor(rsp) { 1632 rdp = this_cpu_ptr(rsp->rda); 1633 rnp = rdp->mynode; 1634 1635 /* 1636 * Don't bother checking unless a grace period has 1637 * completed since we last checked and there are 1638 * callbacks not yet ready to invoke. 1639 */ 1640 if (rdp->completed != rnp->completed && 1641 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) 1642 note_gp_changes(rsp, rdp); 1643 1644 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1645 cbs_ready = true; 1646 } 1647 return cbs_ready; 1648 } 1649 1650 /* 1651 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready 1652 * to invoke. If the CPU has callbacks, try to advance them. Tell the 1653 * caller to set the timeout based on whether or not there are non-lazy 1654 * callbacks. 1655 * 1656 * The caller must have disabled interrupts. 1657 */ 1658 #ifndef CONFIG_RCU_NOCB_CPU_ALL 1659 int rcu_needs_cpu(int cpu, unsigned long *dj) 1660 { 1661 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1662 1663 /* Snapshot to detect later posting of non-lazy callback. */ 1664 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; 1665 1666 /* If no callbacks, RCU doesn't need the CPU. */ 1667 if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { 1668 *dj = ULONG_MAX; 1669 return 0; 1670 } 1671 1672 /* Attempt to advance callbacks. */ 1673 if (rcu_try_advance_all_cbs()) { 1674 /* Some ready to invoke, so initiate later invocation. 
*/ 1675 invoke_rcu_core(); 1676 return 1; 1677 } 1678 rdtp->last_accelerate = jiffies; 1679 1680 /* Request timer delay depending on laziness, and round. */ 1681 if (!rdtp->all_lazy) { 1682 *dj = round_up(rcu_idle_gp_delay + jiffies, 1683 rcu_idle_gp_delay) - jiffies; 1684 } else { 1685 *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; 1686 } 1687 return 0; 1688 } 1689 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 1690 1691 /* 1692 * Prepare a CPU for idle from an RCU perspective. The first major task 1693 * is to sense whether nohz mode has been enabled or disabled via sysfs. 1694 * The second major task is to check to see if a non-lazy callback has 1695 * arrived at a CPU that previously had only lazy callbacks. The third 1696 * major task is to accelerate (that is, assign grace-period numbers to) 1697 * any recently arrived callbacks. 1698 * 1699 * The caller must have disabled interrupts. 1700 */ 1701 static void rcu_prepare_for_idle(int cpu) 1702 { 1703 #ifndef CONFIG_RCU_NOCB_CPU_ALL 1704 bool needwake; 1705 struct rcu_data *rdp; 1706 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1707 struct rcu_node *rnp; 1708 struct rcu_state *rsp; 1709 int tne; 1710 1711 /* Handle nohz enablement switches conservatively. */ 1712 tne = ACCESS_ONCE(tick_nohz_active); 1713 if (tne != rdtp->tick_nohz_enabled_snap) { 1714 if (rcu_cpu_has_callbacks(cpu, NULL)) 1715 invoke_rcu_core(); /* force nohz to see update. */ 1716 rdtp->tick_nohz_enabled_snap = tne; 1717 return; 1718 } 1719 if (!tne) 1720 return; 1721 1722 /* If this is a no-CBs CPU, no callbacks, just return. */ 1723 if (rcu_is_nocb_cpu(cpu)) 1724 return; 1725 1726 /* 1727 * If a non-lazy callback arrived at a CPU having only lazy 1728 * callbacks, invoke RCU core for the side-effect of recalculating 1729 * idle duration on re-entry to idle. 1730 */ 1731 if (rdtp->all_lazy && 1732 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { 1733 rdtp->all_lazy = false; 1734 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; 1735 invoke_rcu_core(); 1736 return; 1737 } 1738 1739 /* 1740 * If we have not yet accelerated this jiffy, accelerate all 1741 * callbacks on this CPU. 1742 */ 1743 if (rdtp->last_accelerate == jiffies) 1744 return; 1745 rdtp->last_accelerate = jiffies; 1746 for_each_rcu_flavor(rsp) { 1747 rdp = per_cpu_ptr(rsp->rda, cpu); 1748 if (!*rdp->nxttail[RCU_DONE_TAIL]) 1749 continue; 1750 rnp = rdp->mynode; 1751 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1752 smp_mb__after_unlock_lock(); 1753 needwake = rcu_accelerate_cbs(rsp, rnp, rdp); 1754 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1755 if (needwake) 1756 rcu_gp_kthread_wake(rsp); 1757 } 1758 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 1759 } 1760 1761 /* 1762 * Clean up for exit from idle. Attempt to advance callbacks based on 1763 * any grace periods that elapsed while the CPU was idle, and if any 1764 * callbacks are now ready to invoke, initiate invocation. 1765 */ 1766 static void rcu_cleanup_after_idle(int cpu) 1767 { 1768 #ifndef CONFIG_RCU_NOCB_CPU_ALL 1769 if (rcu_is_nocb_cpu(cpu)) 1770 return; 1771 if (rcu_try_advance_all_cbs()) 1772 invoke_rcu_core(); 1773 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 1774 } 1775 1776 /* 1777 * Keep a running count of the number of non-lazy callbacks posted 1778 * on this CPU. This running counter (which is never decremented) allows 1779 * rcu_prepare_for_idle() to detect when something out of the idle loop 1780 * posts a callback, even if an equal number of callbacks are invoked. 
1781 * Of course, callbacks should only be posted from within a trace event 1782 * designed to be called from idle or from within RCU_NONIDLE(). 1783 */ 1784 static void rcu_idle_count_callbacks_posted(void) 1785 { 1786 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); 1787 } 1788 1789 /* 1790 * Data for flushing lazy RCU callbacks at OOM time. 1791 */ 1792 static atomic_t oom_callback_count; 1793 static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); 1794 1795 /* 1796 * RCU OOM callback -- decrement the outstanding count and deliver the 1797 * wake-up if we are the last one. 1798 */ 1799 static void rcu_oom_callback(struct rcu_head *rhp) 1800 { 1801 if (atomic_dec_and_test(&oom_callback_count)) 1802 wake_up(&oom_callback_wq); 1803 } 1804 1805 /* 1806 * Post an rcu_oom_notify callback on the current CPU if it has at 1807 * least one lazy callback. This will unnecessarily post callbacks 1808 * to CPUs that already have a non-lazy callback at the end of their 1809 * callback list, but this is an infrequent operation, so accept some 1810 * extra overhead to keep things simple. 1811 */ 1812 static void rcu_oom_notify_cpu(void *unused) 1813 { 1814 struct rcu_state *rsp; 1815 struct rcu_data *rdp; 1816 1817 for_each_rcu_flavor(rsp) { 1818 rdp = raw_cpu_ptr(rsp->rda); 1819 if (rdp->qlen_lazy != 0) { 1820 atomic_inc(&oom_callback_count); 1821 rsp->call(&rdp->oom_head, rcu_oom_callback); 1822 } 1823 } 1824 } 1825 1826 /* 1827 * If low on memory, ensure that each CPU has a non-lazy callback. 1828 * This will wake up CPUs that have only lazy callbacks, in turn 1829 * ensuring that they free up the corresponding memory in a timely manner. 1830 * Because an uncertain amount of memory will be freed in some uncertain 1831 * timeframe, we do not claim to have freed anything. 1832 */ 1833 static int rcu_oom_notify(struct notifier_block *self, 1834 unsigned long notused, void *nfreed) 1835 { 1836 int cpu; 1837 1838 /* Wait for callbacks from earlier instance to complete. */ 1839 wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); 1840 smp_mb(); /* Ensure callback reuse happens after callback invocation. */ 1841 1842 /* 1843 * Prevent premature wakeup: ensure that all increments happen 1844 * before there is a chance of the counter reaching zero. 1845 */ 1846 atomic_set(&oom_callback_count, 1); 1847 1848 get_online_cpus(); 1849 for_each_online_cpu(cpu) { 1850 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); 1851 cond_resched(); 1852 } 1853 put_online_cpus(); 1854 1855 /* Unconditionally decrement: no need to wake ourselves up. */ 1856 atomic_dec(&oom_callback_count); 1857 1858 return NOTIFY_OK; 1859 } 1860 1861 static struct notifier_block rcu_oom_nb = { 1862 .notifier_call = rcu_oom_notify 1863 }; 1864 1865 static int __init rcu_register_oom_notifier(void) 1866 { 1867 register_oom_notifier(&rcu_oom_nb); 1868 return 0; 1869 } 1870 early_initcall(rcu_register_oom_notifier); 1871 1872 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1873 1874 #ifdef CONFIG_RCU_CPU_STALL_INFO 1875 1876 #ifdef CONFIG_RCU_FAST_NO_HZ 1877 1878 static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 1879 { 1880 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1881 unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; 1882 1883 sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", 1884 rdtp->last_accelerate & 0xffff, jiffies & 0xffff, 1885 ulong2long(nlpd), 1886 rdtp->all_lazy ? 'L' : '.', 1887 rdtp->tick_nohz_enabled_snap ? '.' 
: 'D'); 1888 } 1889 1890 #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 1891 1892 static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 1893 { 1894 *cp = '\0'; 1895 } 1896 1897 #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ 1898 1899 /* Initiate the stall-info list. */ 1900 static void print_cpu_stall_info_begin(void) 1901 { 1902 pr_cont("\n"); 1903 } 1904 1905 /* 1906 * Print out diagnostic information for the specified stalled CPU. 1907 * 1908 * If the specified CPU is aware of the current RCU grace period 1909 * (flavor specified by rsp), then print the number of scheduling 1910 * clock interrupts the CPU has taken during the time that it has 1911 * been aware. Otherwise, print the number of RCU grace periods 1912 * that this CPU is ignorant of, for example, "1" if the CPU was 1913 * aware of the previous grace period. 1914 * 1915 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. 1916 */ 1917 static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) 1918 { 1919 char fast_no_hz[72]; 1920 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1921 struct rcu_dynticks *rdtp = rdp->dynticks; 1922 char *ticks_title; 1923 unsigned long ticks_value; 1924 1925 if (rsp->gpnum == rdp->gpnum) { 1926 ticks_title = "ticks this GP"; 1927 ticks_value = rdp->ticks_this_gp; 1928 } else { 1929 ticks_title = "GPs behind"; 1930 ticks_value = rsp->gpnum - rdp->gpnum; 1931 } 1932 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1933 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", 1934 cpu, ticks_value, ticks_title, 1935 atomic_read(&rdtp->dynticks) & 0xfff, 1936 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1937 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), 1938 fast_no_hz); 1939 } 1940 1941 /* Terminate the stall-info list. */ 1942 static void print_cpu_stall_info_end(void) 1943 { 1944 pr_err("\t"); 1945 } 1946 1947 /* Zero ->ticks_this_gp for all flavors of RCU. */ 1948 static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1949 { 1950 rdp->ticks_this_gp = 0; 1951 rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); 1952 } 1953 1954 /* Increment ->ticks_this_gp for all flavors of RCU. */ 1955 static void increment_cpu_stall_ticks(void) 1956 { 1957 struct rcu_state *rsp; 1958 1959 for_each_rcu_flavor(rsp) 1960 raw_cpu_inc(rsp->rda->ticks_this_gp); 1961 } 1962 1963 #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 1964 1965 static void print_cpu_stall_info_begin(void) 1966 { 1967 pr_cont(" {"); 1968 } 1969 1970 static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) 1971 { 1972 pr_cont(" %d", cpu); 1973 } 1974 1975 static void print_cpu_stall_info_end(void) 1976 { 1977 pr_cont("} "); 1978 } 1979 1980 static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1981 { 1982 } 1983 1984 static void increment_cpu_stall_ticks(void) 1985 { 1986 } 1987 1988 #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ 1989 1990 #ifdef CONFIG_RCU_NOCB_CPU 1991 1992 /* 1993 * Offload callback processing from the boot-time-specified set of CPUs 1994 * specified by rcu_nocb_mask. For each CPU in the set, there is a 1995 * kthread created that pulls the callbacks from the corresponding CPU, 1996 * waits for a grace period to elapse, and invokes the callbacks. 1997 * The no-CBs CPUs do a wake_up() on their kthread when they insert 1998 * a callback into any empty list, unless the rcu_nocb_poll boot parameter 1999 * has been specified, in which case each kthread actively polls its 2000 * CPU. 
(Which isn't so great for energy efficiency, but which does 2001 * reduce RCU's overhead on that CPU.) 2002 * 2003 * This is intended to be used in conjunction with Frederic Weisbecker's 2004 * adaptive-idle work, which would seriously reduce OS jitter on CPUs 2005 * running CPU-bound user-mode computations. 2006 * 2007 * Offloading of callback processing could also in theory be used as 2008 * an energy-efficiency measure because CPUs with no RCU callbacks 2009 * queued are more aggressive about entering dyntick-idle mode. 2010 */ 2011 2012 2013 /* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ 2014 static int __init rcu_nocb_setup(char *str) 2015 { 2016 alloc_bootmem_cpumask_var(&rcu_nocb_mask); 2017 have_rcu_nocb_mask = true; 2018 cpulist_parse(str, rcu_nocb_mask); 2019 return 1; 2020 } 2021 __setup("rcu_nocbs=", rcu_nocb_setup); 2022 2023 static int __init parse_rcu_nocb_poll(char *arg) 2024 { 2025 rcu_nocb_poll = 1; 2026 return 0; 2027 } 2028 early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 2029 2030 /* 2031 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended 2032 * grace period. 2033 */ 2034 static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2035 { 2036 wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); 2037 } 2038 2039 /* 2040 * Set the root rcu_node structure's ->need_future_gp field 2041 * based on the sum of those of all rcu_node structures. This does 2042 * double-count the root rcu_node structure's requests, but this 2043 * is necessary to handle the possibility of a rcu_nocb_kthread() 2044 * having awakened during the time that the rcu_node structures 2045 * were being updated for the end of the previous grace period. 2046 */ 2047 static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) 2048 { 2049 rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; 2050 } 2051 2052 static void rcu_init_one_nocb(struct rcu_node *rnp) 2053 { 2054 init_waitqueue_head(&rnp->nocb_gp_wq[0]); 2055 init_waitqueue_head(&rnp->nocb_gp_wq[1]); 2056 } 2057 2058 #ifndef CONFIG_RCU_NOCB_CPU_ALL 2059 /* Is the specified CPU a no-CBs CPU? */ 2060 bool rcu_is_nocb_cpu(int cpu) 2061 { 2062 if (have_rcu_nocb_mask) 2063 return cpumask_test_cpu(cpu, rcu_nocb_mask); 2064 return false; 2065 } 2066 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ 2067 2068 /* 2069 * Kick the leader kthread for this NOCB group. 2070 */ 2071 static void wake_nocb_leader(struct rcu_data *rdp, bool force) 2072 { 2073 struct rcu_data *rdp_leader = rdp->nocb_leader; 2074 2075 if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) 2076 return; 2077 if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) { 2078 /* Prior xchg orders against prior callback enqueue. */ 2079 ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true; 2080 wake_up(&rdp_leader->nocb_wq); 2081 } 2082 } 2083 2084 /* 2085 * Enqueue the specified string of rcu_head structures onto the specified 2086 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the 2087 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy 2088 * counts are supplied by rhcount and rhcount_lazy. 2089 * 2090 * If warranted, also wake up the kthread servicing this CPUs queues. 2091 */ 2092 static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, 2093 struct rcu_head *rhp, 2094 struct rcu_head **rhtp, 2095 int rhcount, int rhcount_lazy, 2096 unsigned long flags) 2097 { 2098 int len; 2099 struct rcu_head **old_rhpp; 2100 struct task_struct *t; 2101 2102 /* Enqueue the callback on the nocb list and update counts. 
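 * This is a lock-free append: the xchg() below atomically claims the old
 * tail pointer while publishing the new one, and the store through that
 * old tail pointer then links the previous last element to the new
 * callbacks. Between the two steps the list momentarily appears to end
 * early; rcu_nocb_kthread() copes with that window by waiting for ->next
 * to become non-NULL (the "WaitQueue"/"WokeQueue" trace events below).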
*/ 2103 old_rhpp = xchg(&rdp->nocb_tail, rhtp); 2104 ACCESS_ONCE(*old_rhpp) = rhp; 2105 atomic_long_add(rhcount, &rdp->nocb_q_count); 2106 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); 2107 2108 /* If we are not being polled and there is a kthread, awaken it ... */ 2109 t = ACCESS_ONCE(rdp->nocb_kthread); 2110 if (rcu_nocb_poll || !t) { 2111 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2112 TPS("WakeNotPoll")); 2113 return; 2114 } 2115 len = atomic_long_read(&rdp->nocb_q_count); 2116 if (old_rhpp == &rdp->nocb_head) { 2117 if (!irqs_disabled_flags(flags)) { 2118 /* ... if queue was empty ... */ 2119 wake_nocb_leader(rdp, false); 2120 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2121 TPS("WakeEmpty")); 2122 } else { 2123 rdp->nocb_defer_wakeup = true; 2124 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2125 TPS("WakeEmptyIsDeferred")); 2126 } 2127 rdp->qlen_last_fqs_check = 0; 2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2129 /* ... or if many callbacks queued. */ 2130 wake_nocb_leader(rdp, true); 2131 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2132 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); 2133 } else { 2134 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); 2135 } 2136 return; 2137 } 2138 2139 /* 2140 * This is a helper for __call_rcu(), which invokes this when the normal 2141 * callback queue is inoperable. If this is not a no-CBs CPU, this 2142 * function returns failure back to __call_rcu(), which can complain 2143 * appropriately. 2144 * 2145 * Otherwise, this function queues the callback where the corresponding 2146 * "rcuo" kthread can find it. 2147 */ 2148 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2149 bool lazy, unsigned long flags) 2150 { 2151 2152 if (!rcu_is_nocb_cpu(rdp->cpu)) 2153 return 0; 2154 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); 2155 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2156 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2157 (unsigned long)rhp->func, 2158 -atomic_long_read(&rdp->nocb_q_count_lazy), 2159 -atomic_long_read(&rdp->nocb_q_count)); 2160 else 2161 trace_rcu_callback(rdp->rsp->name, rhp, 2162 -atomic_long_read(&rdp->nocb_q_count_lazy), 2163 -atomic_long_read(&rdp->nocb_q_count)); 2164 return 1; 2165 } 2166 2167 /* 2168 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is 2169 * not a no-CBs CPU. 2170 */ 2171 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2172 struct rcu_data *rdp, 2173 unsigned long flags) 2174 { 2175 long ql = rsp->qlen; 2176 long qll = rsp->qlen_lazy; 2177 2178 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ 2179 if (!rcu_is_nocb_cpu(smp_processor_id())) 2180 return 0; 2181 rsp->qlen = 0; 2182 rsp->qlen_lazy = 0; 2183 2184 /* First, enqueue the donelist, if any. This preserves CB ordering. */ 2185 if (rsp->orphan_donelist != NULL) { 2186 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, 2187 rsp->orphan_donetail, ql, qll, flags); 2188 ql = qll = 0; 2189 rsp->orphan_donelist = NULL; 2190 rsp->orphan_donetail = &rsp->orphan_donelist; 2191 } 2192 if (rsp->orphan_nxtlist != NULL) { 2193 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, 2194 rsp->orphan_nxttail, ql, qll, flags); 2195 ql = qll = 0; 2196 rsp->orphan_nxtlist = NULL; 2197 rsp->orphan_nxttail = &rsp->orphan_nxtlist; 2198 } 2199 return 1; 2200 } 2201 2202 /* 2203 * If necessary, kick off a new grace period, and either way wait 2204 * for a subsequent grace period to complete. 
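 *
 * The protocol here is that rcu_start_future_gp() stores in "c" the number
 * of the grace period that will satisfy the callbacks, the kthread then
 * sleeps on rnp->nocb_gp_wq[c & 0x1], and the wait loop rechecks
 * ->completed against "c" each time rcu_nocb_gp_cleanup() wakes that queue
 * at the end of a grace period. As an illustrative example, if c == 42
 * the kthread sleeps on nocb_gp_wq[0] until ->completed reaches 42.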
2205 */ 2206 static void rcu_nocb_wait_gp(struct rcu_data *rdp) 2207 { 2208 unsigned long c; 2209 bool d; 2210 unsigned long flags; 2211 bool needwake; 2212 struct rcu_node *rnp = rdp->mynode; 2213 2214 raw_spin_lock_irqsave(&rnp->lock, flags); 2215 smp_mb__after_unlock_lock(); 2216 needwake = rcu_start_future_gp(rnp, rdp, &c); 2217 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2218 if (needwake) 2219 rcu_gp_kthread_wake(rdp->rsp); 2220 2221 /* 2222 * Wait for the grace period. Do so interruptibly to avoid messing 2223 * up the load average. 2224 */ 2225 trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); 2226 for (;;) { 2227 wait_event_interruptible( 2228 rnp->nocb_gp_wq[c & 0x1], 2229 (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); 2230 if (likely(d)) 2231 break; 2232 flush_signals(current); 2233 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); 2234 } 2235 trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); 2236 smp_mb(); /* Ensure that CB invocation happens after GP end. */ 2237 } 2238 2239 /* 2240 * Leaders come here to wait for additional callbacks to show up. 2241 * This function does not return until callbacks appear. 2242 */ 2243 static void nocb_leader_wait(struct rcu_data *my_rdp) 2244 { 2245 bool firsttime = true; 2246 bool gotcbs; 2247 struct rcu_data *rdp; 2248 struct rcu_head **tail; 2249 2250 wait_again: 2251 2252 /* Wait for callbacks to appear. */ 2253 if (!rcu_nocb_poll) { 2254 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); 2255 wait_event_interruptible(my_rdp->nocb_wq, 2256 ACCESS_ONCE(my_rdp->nocb_leader_wake)); 2257 /* Memory barrier handled by smp_mb() calls below and repoll. */ 2258 } else if (firsttime) { 2259 firsttime = false; /* Don't drown trace log with "Poll"! */ 2260 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); 2261 } 2262 2263 /* 2264 * Each pass through the following loop checks a follower for CBs. 2265 * We are our own first follower. Any CBs found are moved to 2266 * nocb_gp_head, where they await a grace period. 2267 */ 2268 gotcbs = false; 2269 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { 2270 rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head); 2271 if (!rdp->nocb_gp_head) 2272 continue; /* No CBs here, try next follower. */ 2273 2274 /* Move callbacks to wait-for-GP list, which is empty. */ 2275 ACCESS_ONCE(rdp->nocb_head) = NULL; 2276 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); 2277 rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0); 2278 rdp->nocb_gp_count_lazy = 2279 atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); 2280 gotcbs = true; 2281 } 2282 2283 /* 2284 * If there were no callbacks, sleep a bit, rescan after a 2285 * memory barrier, and go retry. 2286 */ 2287 if (unlikely(!gotcbs)) { 2288 if (!rcu_nocb_poll) 2289 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, 2290 "WokeEmpty"); 2291 flush_signals(current); 2292 schedule_timeout_interruptible(1); 2293 2294 /* Rescan in case we were a victim of memory ordering. */ 2295 my_rdp->nocb_leader_wake = false; 2296 smp_mb(); /* Ensure _wake false before scan. */ 2297 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) 2298 if (ACCESS_ONCE(rdp->nocb_head)) { 2299 /* Found CB, so short-circuit next wait. */ 2300 my_rdp->nocb_leader_wake = true; 2301 break; 2302 } 2303 goto wait_again; 2304 } 2305 2306 /* Wait for one grace period. */ 2307 rcu_nocb_wait_gp(my_rdp); 2308 2309 /* 2310 * We left ->nocb_leader_wake set to reduce cache thrashing. 
2311 * We clear it now, but recheck for new callbacks while 2312 * traversing our follower list. 2313 */ 2314 my_rdp->nocb_leader_wake = false; 2315 smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */ 2316 2317 /* Each pass through the following loop wakes a follower, if needed. */ 2318 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { 2319 if (ACCESS_ONCE(rdp->nocb_head)) 2320 my_rdp->nocb_leader_wake = true; /* No need to wait. */ 2321 if (!rdp->nocb_gp_head) 2322 continue; /* No CBs, so no need to wake follower. */ 2323 2324 /* Append callbacks to follower's "done" list. */ 2325 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); 2326 *tail = rdp->nocb_gp_head; 2327 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); 2328 atomic_long_add(rdp->nocb_gp_count_lazy, 2329 &rdp->nocb_follower_count_lazy); 2330 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 2331 /* 2332 * List was empty, wake up the follower. 2333 * Memory barriers supplied by atomic_long_add(). 2334 */ 2335 wake_up(&rdp->nocb_wq); 2336 } 2337 } 2338 2339 /* If we (the leader) don't have CBs, go wait some more. */ 2340 if (!my_rdp->nocb_follower_head) 2341 goto wait_again; 2342 } 2343 2344 /* 2345 * Followers come here to wait for additional callbacks to show up. 2346 * This function does not return until callbacks appear. 2347 */ 2348 static void nocb_follower_wait(struct rcu_data *rdp) 2349 { 2350 bool firsttime = true; 2351 2352 for (;;) { 2353 if (!rcu_nocb_poll) { 2354 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2355 "FollowerSleep"); 2356 wait_event_interruptible(rdp->nocb_wq, 2357 ACCESS_ONCE(rdp->nocb_follower_head)); 2358 } else if (firsttime) { 2359 /* Don't drown trace log with "Poll"! */ 2360 firsttime = false; 2361 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); 2362 } 2363 if (smp_load_acquire(&rdp->nocb_follower_head)) { 2364 /* ^^^ Ensure CB invocation follows _head test. */ 2365 return; 2366 } 2367 if (!rcu_nocb_poll) 2368 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2369 "WokeEmpty"); 2370 flush_signals(current); 2371 schedule_timeout_interruptible(1); 2372 } 2373 } 2374 2375 /* 2376 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes 2377 * callbacks queued by the corresponding no-CBs CPU, however, there is 2378 * an optional leader-follower relationship so that the grace-period 2379 * kthreads don't have to do quite so many wakeups. 2380 */ 2381 static int rcu_nocb_kthread(void *arg) 2382 { 2383 int c, cl; 2384 struct rcu_head *list; 2385 struct rcu_head *next; 2386 struct rcu_head **tail; 2387 struct rcu_data *rdp = arg; 2388 2389 /* Each pass through this loop invokes one batch of callbacks */ 2390 for (;;) { 2391 /* Wait for callbacks. */ 2392 if (rdp->nocb_leader == rdp) 2393 nocb_leader_wait(rdp); 2394 else 2395 nocb_follower_wait(rdp); 2396 2397 /* Pull the ready-to-invoke callbacks onto local list. */ 2398 list = ACCESS_ONCE(rdp->nocb_follower_head); 2399 BUG_ON(!list); 2400 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); 2401 ACCESS_ONCE(rdp->nocb_follower_head) = NULL; 2402 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); 2403 c = atomic_long_xchg(&rdp->nocb_follower_count, 0); 2404 cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0); 2405 rdp->nocb_p_count += c; 2406 rdp->nocb_p_count_lazy += cl; 2407 2408 /* Each pass through the following loop invokes a callback. 
*/ 2409 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2410 c = cl = 0; 2411 while (list) { 2412 next = list->next; 2413 /* Wait for enqueuing to complete, if needed. */ 2414 while (next == NULL && &list->next != tail) { 2415 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2416 TPS("WaitQueue")); 2417 schedule_timeout_interruptible(1); 2418 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2419 TPS("WokeQueue")); 2420 next = list->next; 2421 } 2422 debug_rcu_head_unqueue(list); 2423 local_bh_disable(); 2424 if (__rcu_reclaim(rdp->rsp->name, list)) 2425 cl++; 2426 c++; 2427 local_bh_enable(); 2428 list = next; 2429 } 2430 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); 2431 ACCESS_ONCE(rdp->nocb_p_count) -= c; 2432 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; 2433 rdp->n_nocbs_invoked += c; 2434 } 2435 return 0; 2436 } 2437 2438 /* Is a deferred wakeup of rcu_nocb_kthread() required? */ 2439 static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) 2440 { 2441 return ACCESS_ONCE(rdp->nocb_defer_wakeup); 2442 } 2443 2444 /* Do a deferred wakeup of rcu_nocb_kthread(). */ 2445 static void do_nocb_deferred_wakeup(struct rcu_data *rdp) 2446 { 2447 if (!rcu_nocb_need_deferred_wakeup(rdp)) 2448 return; 2449 ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; 2450 wake_nocb_leader(rdp, false); 2451 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); 2452 } 2453 2454 /* Initialize per-rcu_data variables for no-CBs CPUs. */ 2455 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2456 { 2457 rdp->nocb_tail = &rdp->nocb_head; 2458 init_waitqueue_head(&rdp->nocb_wq); 2459 rdp->nocb_follower_tail = &rdp->nocb_follower_head; 2460 } 2461 2462 /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ 2463 static int rcu_nocb_leader_stride = -1; 2464 module_param(rcu_nocb_leader_stride, int, 0444); 2465 2466 /* 2467 * Create a kthread for each RCU flavor for each no-CBs CPU. 2468 * Also initialize leader-follower relationships. 2469 */ 2470 static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2471 { 2472 int cpu; 2473 int ls = rcu_nocb_leader_stride; 2474 int nl = 0; /* Next leader. */ 2475 struct rcu_data *rdp; 2476 struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ 2477 struct rcu_data *rdp_prev = NULL; 2478 struct task_struct *t; 2479 2480 if (rcu_nocb_mask == NULL) 2481 return; 2482 #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) 2483 if (tick_nohz_full_running) 2484 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); 2485 #endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */ 2486 if (ls == -1) { 2487 ls = int_sqrt(nr_cpu_ids); 2488 rcu_nocb_leader_stride = ls; 2489 } 2490 2491 /* 2492 * Each pass through this loop sets up one rcu_data structure and 2493 * spawns one rcu_nocb_kthread(). 2494 */ 2495 for_each_cpu(cpu, rcu_nocb_mask) { 2496 rdp = per_cpu_ptr(rsp->rda, cpu); 2497 if (rdp->cpu >= nl) { 2498 /* New leader, set up for followers & next leader. */ 2499 nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; 2500 rdp->nocb_leader = rdp; 2501 rdp_leader = rdp; 2502 } else { 2503 /* Another follower, link to previous leader. */ 2504 rdp->nocb_leader = rdp_leader; 2505 rdp_prev->nocb_next_follower = rdp; 2506 } 2507 rdp_prev = rdp; 2508 2509 /* Spawn the kthread for this CPU. 
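 * As a worked example (illustrative numbers only): with nr_cpu_ids = 16,
 * every CPU in rcu_nocb_mask, and rcu_nocb_leader_stride left at its
 * default of -1, the code above computes ls = int_sqrt(16) = 4, so CPUs
 * 0, 4, 8, and 12 become leaders (nl advances to 4, 8, 12, and then 16)
 * and the remaining CPUs are chained behind their leader as followers
 * via ->nocb_next_follower.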
*/
2510                 t = kthread_run(rcu_nocb_kthread, rdp,
2511                                 "rcuo%c/%d", rsp->abbr, cpu);
2512                 BUG_ON(IS_ERR(t));
2513                 ACCESS_ONCE(rdp->nocb_kthread) = t;
2514         }
2515 }
2516
2517 /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2518 static bool init_nocb_callback_list(struct rcu_data *rdp)
2519 {
2520         if (rcu_nocb_mask == NULL ||
2521             !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2522                 return false;
2523         rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2524         return true;
2525 }
2526
2527 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
2528
2529 static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2530 {
2531 }
2532
2533 static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2534 {
2535 }
2536
2537 static void rcu_init_one_nocb(struct rcu_node *rnp)
2538 {
2539 }
2540
2541 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2542                             bool lazy, unsigned long flags)
2543 {
2544         return 0;
2545 }
2546
2547 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2548                                                      struct rcu_data *rdp,
2549                                                      unsigned long flags)
2550 {
2551         return 0;
2552 }
2553
2554 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2555 {
2556 }
2557
2558 static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2559 {
2560         return false;
2561 }
2562
2563 static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2564 {
2565 }
2566
2567 static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2568 {
2569 }
2570
2571 static bool init_nocb_callback_list(struct rcu_data *rdp)
2572 {
2573         return false;
2574 }
2575
2576 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
2577
2578 /*
2579  * An adaptive-ticks CPU can potentially execute in kernel mode for an
2580  * arbitrarily long period of time with the scheduling-clock tick turned
2581  * off. RCU will be paying attention to this CPU because it is in the
2582  * kernel, but the CPU cannot be guaranteed to be executing the RCU state
2583  * machine because the scheduling-clock tick has been disabled. Therefore,
2584  * if an adaptive-ticks CPU is failing to respond to the current grace
2585  * period and has not been idle from an RCU perspective, kick it.
2586  */
2587 static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
2588 {
2589 #ifdef CONFIG_NO_HZ_FULL
2590         if (tick_nohz_full_cpu(cpu))
2591                 smp_send_reschedule(cpu);
2592 #endif /* #ifdef CONFIG_NO_HZ_FULL */
2593 }
2594
2595
2596 #ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2597
2598 /*
2599  * Define RCU flavor that holds sysidle state. This needs to be the
2600  * most active flavor of RCU.
2601  */
2602 #ifdef CONFIG_PREEMPT_RCU
2603 static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
2604 #else /* #ifdef CONFIG_PREEMPT_RCU */
2605 static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
2606 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
2607
2608 static int full_sysidle_state;          /* Current system-idle state. */
2609 #define RCU_SYSIDLE_NOT         0       /* Some CPU is not idle. */
2610 #define RCU_SYSIDLE_SHORT       1       /* All CPUs idle for brief period. */
2611 #define RCU_SYSIDLE_LONG        2       /* All CPUs idle for long enough. */
2612 #define RCU_SYSIDLE_FULL        3       /* All CPUs idle, ready for sysidle. */
2613 #define RCU_SYSIDLE_FULL_NOTED  4       /* Actually entered sysidle state. */
2614
2615 /*
2616  * Invoked to note exit from irq or task transition to idle. Note that
2617  * usermode execution does -not- count as idle here! After all, we want
2618  * to detect full-system idle states, not RCU quiescent states and grace
2619  * periods. The caller must have disabled interrupts.
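 *
 * The ->dynticks_idle counter uses the same even/odd convention as the
 * regular ->dynticks counter: it is incremented on each transition, so an
 * even value means the CPU is fully idle for sysidle purposes and an odd
 * value means it is not. The WARN_ON_ONCE() below and the one in
 * rcu_sysidle_exit() check that invariant, and rcu_sysidle_check_cpu()
 * tests the low-order bit to decide whether a remote CPU is idle.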
2620 */ 2621 static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) 2622 { 2623 unsigned long j; 2624 2625 /* Adjust nesting, check for fully idle. */ 2626 if (irq) { 2627 rdtp->dynticks_idle_nesting--; 2628 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); 2629 if (rdtp->dynticks_idle_nesting != 0) 2630 return; /* Still not fully idle. */ 2631 } else { 2632 if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == 2633 DYNTICK_TASK_NEST_VALUE) { 2634 rdtp->dynticks_idle_nesting = 0; 2635 } else { 2636 rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; 2637 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); 2638 return; /* Still not fully idle. */ 2639 } 2640 } 2641 2642 /* Record start of fully idle period. */ 2643 j = jiffies; 2644 ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; 2645 smp_mb__before_atomic(); 2646 atomic_inc(&rdtp->dynticks_idle); 2647 smp_mb__after_atomic(); 2648 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); 2649 } 2650 2651 /* 2652 * Unconditionally force exit from full system-idle state. This is 2653 * invoked when a normal CPU exits idle, but must be called separately 2654 * for the timekeeping CPU (tick_do_timer_cpu). The reason for this 2655 * is that the timekeeping CPU is permitted to take scheduling-clock 2656 * interrupts while the system is in system-idle state, and of course 2657 * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock 2658 * interrupt from any other type of interrupt. 2659 */ 2660 void rcu_sysidle_force_exit(void) 2661 { 2662 int oldstate = ACCESS_ONCE(full_sysidle_state); 2663 int newoldstate; 2664 2665 /* 2666 * Each pass through the following loop attempts to exit full 2667 * system-idle state. If contention proves to be a problem, 2668 * a trylock-based contention tree could be used here. 2669 */ 2670 while (oldstate > RCU_SYSIDLE_SHORT) { 2671 newoldstate = cmpxchg(&full_sysidle_state, 2672 oldstate, RCU_SYSIDLE_NOT); 2673 if (oldstate == newoldstate && 2674 oldstate == RCU_SYSIDLE_FULL_NOTED) { 2675 rcu_kick_nohz_cpu(tick_do_timer_cpu); 2676 return; /* We cleared it, done! */ 2677 } 2678 oldstate = newoldstate; 2679 } 2680 smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ 2681 } 2682 2683 /* 2684 * Invoked to note entry to irq or task transition from idle. Note that 2685 * usermode execution does -not- count as idle here! The caller must 2686 * have disabled interrupts. 2687 */ 2688 static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 2689 { 2690 /* Adjust nesting, check for already non-idle. */ 2691 if (irq) { 2692 rdtp->dynticks_idle_nesting++; 2693 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); 2694 if (rdtp->dynticks_idle_nesting != 1) 2695 return; /* Already non-idle. */ 2696 } else { 2697 /* 2698 * Allow for irq misnesting. Yes, it really is possible 2699 * to enter an irq handler then never leave it, and maybe 2700 * also vice versa. Handle both possibilities. 2701 */ 2702 if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { 2703 rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; 2704 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); 2705 return; /* Already non-idle. */ 2706 } else { 2707 rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; 2708 } 2709 } 2710 2711 /* Record end of idle period. 
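         * As on the idle-entry side, the atomic_inc() is bracketed by
         * memory barriers so that the counter transition is ordered
         * against the accesses on either side of the idle period.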
*/
2712         smp_mb__before_atomic();
2713         atomic_inc(&rdtp->dynticks_idle);
2714         smp_mb__after_atomic();
2715         WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
2716
2717         /*
2718          * If we are the timekeeping CPU, we are permitted to be non-idle
2719          * during a system-idle state. This must be the case, because
2720          * the timekeeping CPU has to take scheduling-clock interrupts
2721          * during the time that the system is transitioning to full
2722          * system-idle state. This means that the timekeeping CPU must
2723          * invoke rcu_sysidle_force_exit() directly if it does anything
2724          * more than take a scheduling-clock interrupt.
2725          */
2726         if (smp_processor_id() == tick_do_timer_cpu)
2727                 return;
2728
2729         /* Update system-idle state: We are clearly no longer fully idle! */
2730         rcu_sysidle_force_exit();
2731 }
2732
2733 /*
2734  * Check to see if the current CPU is idle. Note that usermode execution
2735  * does not count as idle. The caller must have disabled interrupts.
2736  */
2737 static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2738                                   unsigned long *maxj)
2739 {
2740         int cur;
2741         unsigned long j;
2742         struct rcu_dynticks *rdtp = rdp->dynticks;
2743
2744         /*
2745          * If some other CPU has already reported non-idle, if this is
2746          * not the flavor of RCU that tracks sysidle state, or if this
2747          * is an offline or the timekeeping CPU, nothing to do.
2748          */
2749         if (!*isidle || rdp->rsp != rcu_sysidle_state ||
2750             cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2751                 return;
2752         if (rcu_gp_in_progress(rdp->rsp))
2753                 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
2754
2755         /* Pick up current idle and NMI-nesting counter and check. */
2756         cur = atomic_read(&rdtp->dynticks_idle);
2757         if (cur & 0x1) {
2758                 *isidle = false; /* We are not idle! */
2759                 return;
2760         }
2761         smp_mb(); /* Read counters before timestamps. */
2762
2763         /* Pick up timestamps. */
2764         j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
2765         /* If this CPU entered idle more recently, update maxj timestamp. */
2766         if (ULONG_CMP_LT(*maxj, j))
2767                 *maxj = j;
2768 }
2769
2770 /*
2771  * Is this the flavor of RCU that is handling full-system idle?
2772  */
2773 static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2774 {
2775         return rsp == rcu_sysidle_state;
2776 }
2777
2778 /*
2779  * Return a delay in jiffies based on the number of CPUs, rcu_node
2780  * leaf fanout, and jiffies tick rate. The idea is to allow larger
2781  * systems more time to transition to full-idle state in order to
2782  * avoid the cache thrashing that would otherwise occur on the state variable.
2783  * Really small systems (fewer than a couple of tens of CPUs) should
2784  * instead use a single global atomically incremented counter, and later
2785  * versions of this will automatically reconfigure themselves accordingly.
2786  */
2787 static unsigned long rcu_sysidle_delay(void)
2788 {
2789         if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2790                 return 0;
2791         return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
2792 }
2793
2794 /*
2795  * Advance the full-system-idle state. This is invoked when all of
2796  * the non-timekeeping CPUs are idle.
2797  */
2798 static void rcu_sysidle(unsigned long j)
2799 {
2800         /* Check the current state. */
2801         switch (ACCESS_ONCE(full_sysidle_state)) {
2802         case RCU_SYSIDLE_NOT:
2803
2804                 /* First time all are idle, so note a short idle period.
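                 * The overall progression is NOT -> SHORT (immediately,
                 * here) -> LONG -> FULL (each after a further
                 * rcu_sysidle_delay() jiffies, in the cases below), and
                 * finally FULL -> FULL_NOTED once rcu_sys_is_idle()
                 * observes the fully idle state from the timekeeping CPU.
                 * Any non-idle CPU resets the state to NOT via
                 * rcu_sysidle_cancel(). As an illustrative configuration,
                 * with nr_cpu_ids = 64 (above CONFIG_NO_HZ_FULL_SYSIDLE_SMALL),
                 * HZ = 1000, and rcu_fanout_leaf = 16, rcu_sysidle_delay()
                 * returns DIV_ROUND_UP(64 * 1000, 16 * 1000) = 4 jiffies.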
*/ 2805 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; 2806 break; 2807 2808 case RCU_SYSIDLE_SHORT: 2809 2810 /* 2811 * Idle for a bit, time to advance to next state? 2812 * cmpxchg failure means race with non-idle, let them win. 2813 */ 2814 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) 2815 (void)cmpxchg(&full_sysidle_state, 2816 RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); 2817 break; 2818 2819 case RCU_SYSIDLE_LONG: 2820 2821 /* 2822 * Do an additional check pass before advancing to full. 2823 * cmpxchg failure means race with non-idle, let them win. 2824 */ 2825 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) 2826 (void)cmpxchg(&full_sysidle_state, 2827 RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); 2828 break; 2829 2830 default: 2831 break; 2832 } 2833 } 2834 2835 /* 2836 * Found a non-idle non-timekeeping CPU, so kick the system-idle state 2837 * back to the beginning. 2838 */ 2839 static void rcu_sysidle_cancel(void) 2840 { 2841 smp_mb(); 2842 if (full_sysidle_state > RCU_SYSIDLE_SHORT) 2843 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; 2844 } 2845 2846 /* 2847 * Update the sysidle state based on the results of a force-quiescent-state 2848 * scan of the CPUs' dyntick-idle state. 2849 */ 2850 static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, 2851 unsigned long maxj, bool gpkt) 2852 { 2853 if (rsp != rcu_sysidle_state) 2854 return; /* Wrong flavor, ignore. */ 2855 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) 2856 return; /* Running state machine from timekeeping CPU. */ 2857 if (isidle) 2858 rcu_sysidle(maxj); /* More idle! */ 2859 else 2860 rcu_sysidle_cancel(); /* Idle is over. */ 2861 } 2862 2863 /* 2864 * Wrapper for rcu_sysidle_report() when called from the grace-period 2865 * kthread's context. 2866 */ 2867 static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 2868 unsigned long maxj) 2869 { 2870 rcu_sysidle_report(rsp, isidle, maxj, true); 2871 } 2872 2873 /* Callback and function for forcing an RCU grace period. */ 2874 struct rcu_sysidle_head { 2875 struct rcu_head rh; 2876 int inuse; 2877 }; 2878 2879 static void rcu_sysidle_cb(struct rcu_head *rhp) 2880 { 2881 struct rcu_sysidle_head *rshp; 2882 2883 /* 2884 * The following memory barrier is needed to replace the 2885 * memory barriers that would normally be in the memory 2886 * allocator. 2887 */ 2888 smp_mb(); /* grace period precedes setting inuse. */ 2889 2890 rshp = container_of(rhp, struct rcu_sysidle_head, rh); 2891 ACCESS_ONCE(rshp->inuse) = 0; 2892 } 2893 2894 /* 2895 * Check to see if the system is fully idle, other than the timekeeping CPU. 2896 * The caller must have disabled interrupts. 2897 */ 2898 bool rcu_sys_is_idle(void) 2899 { 2900 static struct rcu_sysidle_head rsh; 2901 int rss = ACCESS_ONCE(full_sysidle_state); 2902 2903 if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) 2904 return false; 2905 2906 /* Handle small-system case by doing a full scan of CPUs. */ 2907 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { 2908 int oldrss = rss - 1; 2909 2910 /* 2911 * One pass to advance to each state up to _FULL. 2912 * Give up if any pass fails to advance the state. 2913 */ 2914 while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { 2915 int cpu; 2916 bool isidle = true; 2917 unsigned long maxj = jiffies - ULONG_MAX / 4; 2918 struct rcu_data *rdp; 2919 2920 /* Scan all the CPUs looking for nonidle CPUs. 
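 * Each pass also computes maxj, the most recent idle-entry timestamp among
 * the scanned CPUs (its initial value of jiffies - ULONG_MAX / 4 is simply
 * "far in the past" in wraparound-safe arithmetic). rcu_sysidle(), reached
 * via rcu_sysidle_report(), then requires that rcu_sysidle_delay() jiffies
 * have elapsed since maxj before advancing the state machine.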
*/ 2921 for_each_possible_cpu(cpu) { 2922 rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); 2923 rcu_sysidle_check_cpu(rdp, &isidle, &maxj); 2924 if (!isidle) 2925 break; 2926 } 2927 rcu_sysidle_report(rcu_sysidle_state, 2928 isidle, maxj, false); 2929 oldrss = rss; 2930 rss = ACCESS_ONCE(full_sysidle_state); 2931 } 2932 } 2933 2934 /* If this is the first observation of an idle period, record it. */ 2935 if (rss == RCU_SYSIDLE_FULL) { 2936 rss = cmpxchg(&full_sysidle_state, 2937 RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); 2938 return rss == RCU_SYSIDLE_FULL; 2939 } 2940 2941 smp_mb(); /* ensure rss load happens before later caller actions. */ 2942 2943 /* If already fully idle, tell the caller (in case of races). */ 2944 if (rss == RCU_SYSIDLE_FULL_NOTED) 2945 return true; 2946 2947 /* 2948 * If we aren't there yet, and a grace period is not in flight, 2949 * initiate a grace period. Either way, tell the caller that 2950 * we are not there yet. We use an xchg() rather than an assignment 2951 * to make up for the memory barriers that would otherwise be 2952 * provided by the memory allocator. 2953 */ 2954 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && 2955 !rcu_gp_in_progress(rcu_sysidle_state) && 2956 !rsh.inuse && xchg(&rsh.inuse, 1) == 0) 2957 call_rcu(&rsh.rh, rcu_sysidle_cb); 2958 return false; 2959 } 2960 2961 /* 2962 * Initialize dynticks sysidle state for CPUs coming online. 2963 */ 2964 static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) 2965 { 2966 rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; 2967 } 2968 2969 #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 2970 2971 static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) 2972 { 2973 } 2974 2975 static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 2976 { 2977 } 2978 2979 static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, 2980 unsigned long *maxj) 2981 { 2982 } 2983 2984 static bool is_sysidle_rcu_state(struct rcu_state *rsp) 2985 { 2986 return false; 2987 } 2988 2989 static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 2990 unsigned long maxj) 2991 { 2992 } 2993 2994 static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) 2995 { 2996 } 2997 2998 #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 2999 3000 /* 3001 * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the 3002 * grace-period kthread will do force_quiescent_state() processing? 3003 * The idea is to avoid waking up RCU core processing on such a 3004 * CPU unless the grace period has extended for too long. 3005 * 3006 * This code relies on the fact that all NO_HZ_FULL CPUs are also 3007 * CONFIG_RCU_NOCB_CPU CPUs. 3008 */ 3009 static bool rcu_nohz_full_cpu(struct rcu_state *rsp) 3010 { 3011 #ifdef CONFIG_NO_HZ_FULL 3012 if (tick_nohz_full_cpu(smp_processor_id()) && 3013 (!rcu_gp_in_progress(rsp) || 3014 ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ))) 3015 return 1; 3016 #endif /* #ifdef CONFIG_NO_HZ_FULL */ 3017 return 0; 3018 } 3019 3020 /* 3021 * Bind the grace-period kthread for the sysidle flavor of RCU to the 3022 * timekeeping CPU. 
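 *
 * When full-system-idle tracking is not configured, fall back to keeping
 * the kthread on the housekeeping CPUs so that it does not perturb
 * nohz_full CPUs; rcu_nohz_full_cpu() above provides the corresponding
 * check on the force-quiescent-state path.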
3023 */ 3024 static void rcu_bind_gp_kthread(void) 3025 { 3026 int __maybe_unused cpu; 3027 3028 if (!tick_nohz_full_enabled()) 3029 return; 3030 #ifdef CONFIG_NO_HZ_FULL_SYSIDLE 3031 cpu = tick_do_timer_cpu; 3032 if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) 3033 set_cpus_allowed_ptr(current, cpumask_of(cpu)); 3034 #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3035 if (!is_housekeeping_cpu(raw_smp_processor_id())) 3036 housekeeping_affine(current); 3037 #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3038 } 3039
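/*
 * The ULONG_CMP_GE()/ULONG_CMP_LT() comparisons used above (for example on
 * ->completed in rcu_nocb_wait_gp() and on idle-entry timestamps in
 * rcu_sysidle_check_cpu()) rely on unsigned wraparound. The stand-alone
 * userspace program below is an illustrative sketch of that idiom; it is
 * not part of the kernel, and the DEMO_CMP_GE() macro is defined here only
 * for the example:
 *
 *      #include <limits.h>
 *      #include <stdio.h>
 *
 *      // "a is at or after b", even if the counter wrapped in between.
 *      #define DEMO_CMP_GE(a, b)       (ULONG_MAX / 2 >= (a) - (b))
 *
 *      int main(void)
 *      {
 *              unsigned long c = ULONG_MAX - 1;        // target grace period
 *              unsigned long completed = ULONG_MAX - 3;
 *
 *              printf("%d\n", DEMO_CMP_GE(completed, c));      // 0: not yet reached
 *              completed += 4;                                 // wraps past zero
 *              printf("%d\n", DEMO_CMP_GE(completed, c));      // 1: reached despite wrap
 *              return 0;
 *      }
 */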