/*
 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
 * Internal non-public definitions that provide either classic
 * or preemptible semantics.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright Red Hat, 2009
 * Copyright IBM Corporation, 2009
 *
 * Author: Ingo Molnar <mingo@elte.hu>
 *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com>
 */

#include <linux/delay.h>
#include <linux/gfp.h>
#include <linux/oom.h>
#include <linux/smpboot.h>
#include "../time/tick-internal.h"

#define RCU_KTHREAD_PRIO 1

#ifdef CONFIG_RCU_BOOST
#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
#else
#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
#endif

#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool have_rcu_nocb_mask;	    /* Was rcu_nocb_mask allocated? */
static bool __read_mostly rcu_nocb_poll;    /* Offload kthreads are to poll. */
static char __initdata nocb_buf[NR_CPUS * 5];
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */

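/*
 * Illustrative note (not part of the original file): when
 * CONFIG_RCU_NOCB_CPU=y, the set of offloaded CPUs above is normally
 * chosen on the kernel command line.  A hypothetical boot line might be:
 *
 *	rcu_nocbs=1-7 rcu_nocb_poll
 *
 * which requests callback offloading for CPUs 1-7 and asks the offload
 * kthreads to poll for callbacks rather than waiting to be awakened.
 */
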
/*
 * Check the RCU kernel configuration parameters and print informative
 * messages about anything out of the ordinary.  If you like #ifdef, you
 * will love this function.
 */
static void __init rcu_bootup_announce_oddness(void)
{
#ifdef CONFIG_RCU_TRACE
	pr_info("\tRCU debugfs-based tracing is enabled.\n");
#endif
#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
	pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
		CONFIG_RCU_FANOUT);
#endif
#ifdef CONFIG_RCU_FANOUT_EXACT
	pr_info("\tHierarchical RCU autobalancing is disabled.\n");
#endif
#ifdef CONFIG_RCU_FAST_NO_HZ
	pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
#endif
#ifdef CONFIG_PROVE_RCU
	pr_info("\tRCU lockdep checking is enabled.\n");
#endif
#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
	pr_info("\tRCU torture testing starts during boot.\n");
#endif
#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
	pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
#endif
#if defined(CONFIG_RCU_CPU_STALL_INFO)
	pr_info("\tAdditional per-CPU info printed with stalls.\n");
#endif
#if NUM_RCU_LVL_4 != 0
	pr_info("\tFour-level hierarchy is enabled.\n");
#endif
	if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
		pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
	if (nr_cpu_ids != NR_CPUS)
		pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_NOCB_CPU
#ifndef CONFIG_RCU_NOCB_CPU_NONE
	if (!have_rcu_nocb_mask) {
		zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
		have_rcu_nocb_mask = true;
	}
#ifdef CONFIG_RCU_NOCB_CPU_ZERO
	pr_info("\tOffload RCU callbacks from CPU 0\n");
	cpumask_set_cpu(0, rcu_nocb_mask);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
#ifdef CONFIG_RCU_NOCB_CPU_ALL
	pr_info("\tOffload RCU callbacks from all CPUs\n");
	cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
	if (have_rcu_nocb_mask) {
		if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
			pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
			cpumask_and(rcu_nocb_mask, cpu_possible_mask,
				    rcu_nocb_mask);
		}
		cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
		pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
		if (rcu_nocb_poll)
			pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
	}
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
}

#ifdef CONFIG_TREE_PREEMPT_RCU

RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
static struct rcu_state *rcu_state = &rcu_preempt_state;

static int rcu_preempted_readers_exp(struct rcu_node *rnp);

/*
 * Tell them what RCU they are running.
 */
static void __init rcu_bootup_announce(void)
{
	pr_info("Preemptible hierarchical RCU implementation.\n");
	rcu_bootup_announce_oddness();
}

/*
 * Return the number of RCU-preempt batches processed thus far
 * for debug and statistics.
 */
long rcu_batches_completed_preempt(void)
{
	return rcu_preempt_state.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);

/*
 * Return the number of RCU batches processed thus far for debug & stats.
 */
long rcu_batches_completed(void)
{
	return rcu_batches_completed_preempt();
}
EXPORT_SYMBOL_GPL(rcu_batches_completed);

/*
 * Force a quiescent state for preemptible RCU.
 */
void rcu_force_quiescent_state(void)
{
	force_quiescent_state(&rcu_preempt_state);
}
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);

/*
 * Record a preemptible-RCU quiescent state for the specified CPU.  Note
 * that this just means that the task currently running on the CPU is
 * not in an RCU read-side critical section.  There might be any number
 * of tasks blocked while in an RCU read-side critical section.
 *
 * Unlike the other rcu_*_qs() functions, callers to this function
 * must disable irqs in order to protect the assignment to
 * ->rcu_read_unlock_special.
 */
static void rcu_preempt_qs(int cpu)
{
	struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);

	if (rdp->passed_quiesce == 0)
		trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs"));
	rdp->passed_quiesce = 1;
	current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
}

/*
 * We have entered the scheduler, and the current task might soon be
 * context-switched away from.  If this task is in an RCU read-side
 * critical section, we will no longer be able to rely on the CPU to
 * record that fact, so we enqueue the task on the blkd_tasks list.
 * The task will dequeue itself when it exits the outermost enclosing
 * RCU read-side critical section.  Therefore, the current grace period
 * cannot be permitted to complete until the blkd_tasks list entries
 * predating the current grace period drain, in other words, until
 * rnp->gp_tasks becomes NULL.
 *
 * Caller must disable preemption.
 */
static void rcu_preempt_note_context_switch(int cpu)
{
	struct task_struct *t = current;
	unsigned long flags;
	struct rcu_data *rdp;
	struct rcu_node *rnp;

	if (t->rcu_read_lock_nesting > 0 &&
	    (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {

		/* Possibly blocking in an RCU read-side critical section. */
		rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
		rnp = rdp->mynode;
		raw_spin_lock_irqsave(&rnp->lock, flags);
		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
		t->rcu_blocked_node = rnp;

		/*
		 * If this CPU has already checked in, then this task
		 * will hold up the next grace period rather than the
		 * current grace period.  Queue the task accordingly.
		 * If the task is queued for the current grace period
		 * (i.e., this CPU has not yet passed through a quiescent
		 * state for the current grace period), then as long
		 * as that task remains queued, the current grace period
		 * cannot end.  Note that there is some uncertainty as
		 * to exactly when the current grace period started.
		 * We take a conservative approach, which can result
		 * in unnecessarily waiting on tasks that started very
		 * slightly after the current grace period began.  C'est
		 * la vie!!!
		 *
		 * But first, note that the current CPU must still be
		 * on line!
		 */
		WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
		WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
		if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
			list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
			rnp->gp_tasks = &t->rcu_node_entry;
#ifdef CONFIG_RCU_BOOST
			if (rnp->boost_tasks != NULL)
				rnp->boost_tasks = rnp->gp_tasks;
#endif /* #ifdef CONFIG_RCU_BOOST */
		} else {
			list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
			if (rnp->qsmask & rdp->grpmask)
				rnp->gp_tasks = &t->rcu_node_entry;
		}
		trace_rcu_preempt_task(rdp->rsp->name,
				       t->pid,
				       (rnp->qsmask & rdp->grpmask)
				       ? rnp->gpnum
				       : rnp->gpnum + 1);
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
	} else if (t->rcu_read_lock_nesting < 0 &&
		   t->rcu_read_unlock_special) {

		/*
		 * Complete exit from RCU read-side critical section on
		 * behalf of preempted instance of __rcu_read_unlock().
		 */
		rcu_read_unlock_special(t);
	}

	/*
	 * Either we were not in an RCU read-side critical section to
	 * begin with, or we have now recorded that critical section
	 * globally.  Either way, we can now note a quiescent state
	 * for this CPU.  Again, if we were in an RCU read-side critical
	 * section, and if that critical section was blocking the current
	 * grace period, then the fact that the task has been enqueued
	 * means that we continue to block the current grace period.
	 */
	local_irq_save(flags);
	rcu_preempt_qs(cpu);
	local_irq_restore(flags);
}

/*
 * Check for preempted RCU readers blocking the current grace period
 * for the specified rcu_node structure.  If the caller needs a reliable
 * answer, it must hold the rcu_node's ->lock.
 */
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
	return rnp->gp_tasks != NULL;
}

/*
 * Record a quiescent state for all tasks that were previously queued
 * on the specified rcu_node structure and that were blocking the current
 * RCU grace period.  The caller must hold the specified rnp->lock with
 * irqs disabled, and this lock is released upon return, but irqs remain
 * disabled.
 */
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
	__releases(rnp->lock)
{
	unsigned long mask;
	struct rcu_node *rnp_p;

	if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
		return;  /* Still need more quiescent states! */
	}

	rnp_p = rnp->parent;
	if (rnp_p == NULL) {
		/*
		 * Either there is only one rcu_node in the tree,
		 * or tasks were kicked up to root rcu_node due to
		 * CPUs going offline.
		 */
		rcu_report_qs_rsp(&rcu_preempt_state, flags);
		return;
	}

	/* Report up the rest of the hierarchy. */
	mask = rnp->grpmask;
	raw_spin_unlock(&rnp->lock);	/* irqs remain disabled. */
	raw_spin_lock(&rnp_p->lock);	/* irqs already disabled. */
	rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
}

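/*
 * Illustrative sketch (not part of the original file): a minimal
 * preemptible-RCU reader.  If the task is preempted between
 * rcu_read_lock() and rcu_read_unlock(), rcu_preempt_note_context_switch()
 * above queues it on the relevant rcu_node's ->blkd_tasks list, and the
 * task removes itself via rcu_read_unlock_special() below.  "gp" and
 * "struct foo" are hypothetical.
 *
 *	struct foo { int data; };
 *	struct foo __rcu *gp;
 *
 *	int read_foo_data(void)
 *	{
 *		struct foo *p;
 *		int val = -1;
 *
 *		rcu_read_lock();
 *		p = rcu_dereference(gp);
 *		if (p)
 *			val = p->data;
 *		rcu_read_unlock();
 *		return val;
 *	}
 */
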
/*
 * Advance a ->blkd_tasks-list pointer to the next entry, instead
 * returning NULL if at the end of the list.
 */
static struct list_head *rcu_next_node_entry(struct task_struct *t,
					     struct rcu_node *rnp)
{
	struct list_head *np;

	np = t->rcu_node_entry.next;
	if (np == &rnp->blkd_tasks)
		np = NULL;
	return np;
}

/*
 * Handle special cases during rcu_read_unlock(), such as needing to
 * notify RCU core processing or task having blocked during the RCU
 * read-side critical section.
 */
void rcu_read_unlock_special(struct task_struct *t)
{
	int empty;
	int empty_exp;
	int empty_exp_now;
	unsigned long flags;
	struct list_head *np;
#ifdef CONFIG_RCU_BOOST
	struct rt_mutex *rbmp = NULL;
#endif /* #ifdef CONFIG_RCU_BOOST */
	struct rcu_node *rnp;
	int special;

	/* NMI handlers cannot block and cannot safely manipulate state. */
	if (in_nmi())
		return;

	local_irq_save(flags);

	/*
	 * If RCU core is waiting for this CPU to exit critical section,
	 * let it know that we have done so.
	 */
	special = t->rcu_read_unlock_special;
	if (special & RCU_READ_UNLOCK_NEED_QS) {
		rcu_preempt_qs(smp_processor_id());
	}

	/* Hardware IRQ handlers cannot block. */
	if (in_irq() || in_serving_softirq()) {
		local_irq_restore(flags);
		return;
	}

	/* Clean up if blocked during RCU read-side critical section. */
	if (special & RCU_READ_UNLOCK_BLOCKED) {
		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;

		/*
		 * Remove this task from the list it blocked on.  The
		 * task can migrate while we acquire the lock, but at
		 * most one time.  So at most two passes through loop.
		 */
		for (;;) {
			rnp = t->rcu_blocked_node;
			raw_spin_lock(&rnp->lock);  /* irqs already disabled. */
			if (rnp == t->rcu_blocked_node)
				break;
			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
		}
		empty = !rcu_preempt_blocked_readers_cgp(rnp);
		empty_exp = !rcu_preempted_readers_exp(rnp);
		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
		np = rcu_next_node_entry(t, rnp);
		list_del_init(&t->rcu_node_entry);
		t->rcu_blocked_node = NULL;
		trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
						rnp->gpnum, t->pid);
		if (&t->rcu_node_entry == rnp->gp_tasks)
			rnp->gp_tasks = np;
		if (&t->rcu_node_entry == rnp->exp_tasks)
			rnp->exp_tasks = np;
#ifdef CONFIG_RCU_BOOST
		if (&t->rcu_node_entry == rnp->boost_tasks)
			rnp->boost_tasks = np;
		/* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
		if (t->rcu_boost_mutex) {
			rbmp = t->rcu_boost_mutex;
			t->rcu_boost_mutex = NULL;
		}
#endif /* #ifdef CONFIG_RCU_BOOST */

		/*
		 * If this was the last task on the current list, and if
		 * we aren't waiting on any CPUs, report the quiescent state.
		 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
		 * so we must take a snapshot of the expedited state.
		 */
		empty_exp_now = !rcu_preempted_readers_exp(rnp);
		if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
			trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
							 rnp->gpnum,
							 0, rnp->qsmask,
							 rnp->level,
							 rnp->grplo,
							 rnp->grphi,
							 !!rnp->gp_tasks);
			rcu_report_unblock_qs_rnp(rnp, flags);
		} else {
			raw_spin_unlock_irqrestore(&rnp->lock, flags);
		}

#ifdef CONFIG_RCU_BOOST
		/* Unboost if we were boosted. */
		if (rbmp)
			rt_mutex_unlock(rbmp);
#endif /* #ifdef CONFIG_RCU_BOOST */

		/*
		 * If this was the last task on the expedited lists,
		 * then we need to report up the rcu_node hierarchy.
		 */
		if (!empty_exp && empty_exp_now)
			rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
	} else {
		local_irq_restore(flags);
	}
}

#ifdef CONFIG_RCU_CPU_STALL_VERBOSE

/*
 * Dump detailed information for all tasks blocking the current RCU
 * grace period on the specified rcu_node structure.
 */
static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
{
	unsigned long flags;
	struct task_struct *t;

	raw_spin_lock_irqsave(&rnp->lock, flags);
	if (!rcu_preempt_blocked_readers_cgp(rnp)) {
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
		return;
	}
	t = list_entry(rnp->gp_tasks,
		       struct task_struct, rcu_node_entry);
	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
		sched_show_task(t);
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
}

/*
 * Dump detailed information for all tasks blocking the current RCU
 * grace period.
 */
static void rcu_print_detail_task_stall(struct rcu_state *rsp)
{
	struct rcu_node *rnp = rcu_get_root(rsp);

	rcu_print_detail_task_stall_rnp(rnp);
	rcu_for_each_leaf_node(rsp, rnp)
		rcu_print_detail_task_stall_rnp(rnp);
}

#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */

static void rcu_print_detail_task_stall(struct rcu_state *rsp)
{
}

#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */

#ifdef CONFIG_RCU_CPU_STALL_INFO

static void rcu_print_task_stall_begin(struct rcu_node *rnp)
{
	pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
	       rnp->level, rnp->grplo, rnp->grphi);
}

static void rcu_print_task_stall_end(void)
{
	pr_cont("\n");
}

#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */

static void rcu_print_task_stall_begin(struct rcu_node *rnp)
{
}

static void rcu_print_task_stall_end(void)
{
}

#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */

/*
 * Scan the current list of tasks blocked within RCU read-side critical
 * sections, printing out the tid of each.
 */
static int rcu_print_task_stall(struct rcu_node *rnp)
{
	struct task_struct *t;
	int ndetected = 0;

	if (!rcu_preempt_blocked_readers_cgp(rnp))
		return 0;
	rcu_print_task_stall_begin(rnp);
	t = list_entry(rnp->gp_tasks,
		       struct task_struct, rcu_node_entry);
	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
		pr_cont(" P%d", t->pid);
		ndetected++;
	}
	rcu_print_task_stall_end();
	return ndetected;
}

/*
 * Check that the list of blocked tasks for the newly completed grace
 * period is in fact empty.  It is a serious bug to complete a grace
 * period that still has RCU readers blocked!  This function must be
 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
 * must be held by the caller.
 *
 * Also, if there are blocked tasks on the list, they automatically
 * block the newly created grace period, so set up ->gp_tasks accordingly.
 */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
	WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
	if (!list_empty(&rnp->blkd_tasks))
		rnp->gp_tasks = rnp->blkd_tasks.next;
	WARN_ON_ONCE(rnp->qsmask);
}

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Handle tasklist migration for case in which all CPUs covered by the
 * specified rcu_node have gone offline.  Move them up to the root
 * rcu_node.  The reason for not just moving them to the immediate
 * parent is to remove the need for rcu_read_unlock_special() to
 * make more than two attempts to acquire the target rcu_node's lock.
 *
 * Returns 1 if there was previously a task blocking the current grace
 * period on the specified rcu_node structure.
 *
 * The caller must hold rnp->lock with irqs disabled.
 */
static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
				     struct rcu_node *rnp,
				     struct rcu_data *rdp)
{
	struct list_head *lp;
	struct list_head *lp_root;
	int retval = 0;
	struct rcu_node *rnp_root = rcu_get_root(rsp);
	struct task_struct *t;

	if (rnp == rnp_root) {
		WARN_ONCE(1, "Last CPU thought to be offlined?");
		return 0;  /* Shouldn't happen: at least one CPU online. */
	}

	/* If we are on an internal node, complain bitterly. */
	WARN_ON_ONCE(rnp != rdp->mynode);

	/*
	 * Move tasks up to root rcu_node.  Don't try to get fancy for
	 * this corner-case operation -- just put this node's tasks
	 * at the head of the root node's list, and update the root node's
	 * ->gp_tasks and ->exp_tasks pointers to those of this node's,
	 * if non-NULL.  This might result in waiting for more tasks than
	 * absolutely necessary, but this is a good performance/complexity
	 * tradeoff.
	 */
	if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
		retval |= RCU_OFL_TASKS_NORM_GP;
	if (rcu_preempted_readers_exp(rnp))
		retval |= RCU_OFL_TASKS_EXP_GP;
	lp = &rnp->blkd_tasks;
	lp_root = &rnp_root->blkd_tasks;
	while (!list_empty(lp)) {
		t = list_entry(lp->next, typeof(*t), rcu_node_entry);
		raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
		list_del(&t->rcu_node_entry);
		t->rcu_blocked_node = rnp_root;
		list_add(&t->rcu_node_entry, lp_root);
		if (&t->rcu_node_entry == rnp->gp_tasks)
			rnp_root->gp_tasks = rnp->gp_tasks;
		if (&t->rcu_node_entry == rnp->exp_tasks)
			rnp_root->exp_tasks = rnp->exp_tasks;
#ifdef CONFIG_RCU_BOOST
		if (&t->rcu_node_entry == rnp->boost_tasks)
			rnp_root->boost_tasks = rnp->boost_tasks;
#endif /* #ifdef CONFIG_RCU_BOOST */
		raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
	}

	rnp->gp_tasks = NULL;
	rnp->exp_tasks = NULL;
#ifdef CONFIG_RCU_BOOST
	rnp->boost_tasks = NULL;
	/*
	 * In case root is being boosted and leaf was not.  Make sure
	 * that we boost the tasks blocking the current grace period
	 * in this case.
	 */
	raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
	if (rnp_root->boost_tasks != NULL &&
	    rnp_root->boost_tasks != rnp_root->gp_tasks &&
	    rnp_root->boost_tasks != rnp_root->exp_tasks)
		rnp_root->boost_tasks = rnp_root->gp_tasks;
	raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
#endif /* #ifdef CONFIG_RCU_BOOST */

	return retval;
}

#endif /* #ifdef CONFIG_HOTPLUG_CPU */

/*
 * Check for a quiescent state from the current CPU.  When a task blocks,
 * the task is recorded in the corresponding CPU's rcu_node structure,
 * which is checked elsewhere.
 *
 * Caller must disable hard irqs.
 */
static void rcu_preempt_check_callbacks(int cpu)
{
	struct task_struct *t = current;

	if (t->rcu_read_lock_nesting == 0) {
		rcu_preempt_qs(cpu);
		return;
	}
	if (t->rcu_read_lock_nesting > 0 &&
	    per_cpu(rcu_preempt_data, cpu).qs_pending)
		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
}

#ifdef CONFIG_RCU_BOOST

static void rcu_preempt_do_callbacks(void)
{
	rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
}

#endif /* #ifdef CONFIG_RCU_BOOST */

/*
 * Queue a preemptible-RCU callback for invocation after a grace period.
 */
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
	__call_rcu(head, func, &rcu_preempt_state, -1, 0);
}
EXPORT_SYMBOL_GPL(call_rcu);

/*
 * Queue an RCU callback for lazy invocation after a grace period.
 * This will likely be later named something like "call_rcu_lazy()",
 * but this change will require some way of tagging the lazy RCU
 * callbacks in the list of pending callbacks.  Until then, this
 * function may only be called from __kfree_rcu().
 */
void kfree_call_rcu(struct rcu_head *head,
		    void (*func)(struct rcu_head *rcu))
{
	__call_rcu(head, func, &rcu_preempt_state, -1, 1);
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);

/**
 * synchronize_rcu - wait until a grace period has elapsed.
 *
 * Control will return to the caller some time after a full grace
 * period has elapsed, in other words after all currently executing RCU
 * read-side critical sections have completed.  Note, however, that
 * upon return from synchronize_rcu(), the caller might well be executing
 * concurrently with new RCU read-side critical sections that began while
 * synchronize_rcu() was waiting.  RCU read-side critical sections are
 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
 *
 * See the description of synchronize_sched() for more detailed information
 * on memory ordering guarantees.
 */
void synchronize_rcu(void)
{
	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
			   !lock_is_held(&rcu_lock_map) &&
			   !lock_is_held(&rcu_sched_lock_map),
			   "Illegal synchronize_rcu() in RCU read-side critical section");
	if (!rcu_scheduler_active)
		return;
	if (rcu_expedited)
		synchronize_rcu_expedited();
	else
		wait_rcu_gp(call_rcu);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);

static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
static unsigned long sync_rcu_preempt_exp_count;
static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);

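/*
 * Illustrative sketch (not part of the original file): typical updater
 * use of the call_rcu()/synchronize_rcu() primitives defined above.
 * "gp", "gp_lock", "struct foo", and free_foo_rcu() are hypothetical.
 *
 *	struct foo { int data; struct rcu_head rh; };
 *	struct foo __rcu *gp;
 *	DEFINE_SPINLOCK(gp_lock);
 *
 *	static void free_foo_rcu(struct rcu_head *rh)
 *	{
 *		kfree(container_of(rh, struct foo, rh));
 *	}
 *
 *	void update_foo(struct foo *newp)
 *	{
 *		struct foo *oldp;
 *
 *		spin_lock(&gp_lock);
 *		oldp = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
 *		rcu_assign_pointer(gp, newp);
 *		spin_unlock(&gp_lock);
 *		call_rcu(&oldp->rh, free_foo_rcu);	(asynchronous)
 *		-- or, synchronously --
 *		synchronize_rcu();
 *		kfree(oldp);
 *	}
 */
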
/*
 * Return non-zero if there are any tasks in RCU read-side critical
 * sections blocking the current preemptible-RCU expedited grace period.
 * If there is no preemptible-RCU expedited grace period currently in
 * progress, returns zero unconditionally.
 */
static int rcu_preempted_readers_exp(struct rcu_node *rnp)
{
	return rnp->exp_tasks != NULL;
}

/*
 * Return non-zero if there is no RCU expedited grace period in progress
 * for the specified rcu_node structure, in other words, if all CPUs and
 * tasks covered by the specified rcu_node structure have done their bit
 * for the current expedited grace period.  Works only for preemptible
 * RCU -- other RCU implementations use other means.
 *
 * Caller must hold sync_rcu_preempt_exp_mutex.
 */
static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
	return !rcu_preempted_readers_exp(rnp) &&
	       ACCESS_ONCE(rnp->expmask) == 0;
}

/*
 * Report the exit from RCU read-side critical section for the last task
 * that queued itself during or before the current expedited preemptible-RCU
 * grace period.  This event is reported either to the rcu_node structure on
 * which the task was queued or to one of that rcu_node structure's ancestors,
 * recursively up the tree.  (Calm down, calm down, we do the recursion
 * iteratively!)
 *
 * Most callers will set the "wake" flag, but the task initiating the
 * expedited grace period need not wake itself.
 *
 * Caller must hold sync_rcu_preempt_exp_mutex.
 */
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
			       bool wake)
{
	unsigned long flags;
	unsigned long mask;

	raw_spin_lock_irqsave(&rnp->lock, flags);
	for (;;) {
		if (!sync_rcu_preempt_exp_done(rnp)) {
			raw_spin_unlock_irqrestore(&rnp->lock, flags);
			break;
		}
		if (rnp->parent == NULL) {
			raw_spin_unlock_irqrestore(&rnp->lock, flags);
			if (wake)
				wake_up(&sync_rcu_preempt_exp_wq);
			break;
		}
		mask = rnp->grpmask;
		raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
		rnp = rnp->parent;
		raw_spin_lock(&rnp->lock); /* irqs already disabled */
		rnp->expmask &= ~mask;
	}
}

/*
 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
 * grace period for the specified rcu_node structure.  If there are no such
 * tasks, report it up the rcu_node hierarchy.
 *
 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
 * CPU hotplug operations.
 */
static void
sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
{
	unsigned long flags;
	int must_wait = 0;

	raw_spin_lock_irqsave(&rnp->lock, flags);
	if (list_empty(&rnp->blkd_tasks)) {
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
	} else {
		rnp->exp_tasks = rnp->blkd_tasks.next;
		rcu_initiate_boost(rnp, flags);  /* releases rnp->lock */
		must_wait = 1;
	}
	if (!must_wait)
		rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
}

/**
 * synchronize_rcu_expedited - Brute-force RCU grace period
 *
 * Wait for an RCU-preempt grace period, but expedite it.  The basic
 * idea is to invoke synchronize_sched_expedited() to push all the tasks to
 * the ->blkd_tasks lists and wait for this list to drain.  This consumes
 * significant time on all CPUs and is unfriendly to real-time workloads,
 * so is thus not recommended for any sort of common-case code.
 * In fact, if you are using synchronize_rcu_expedited() in a loop,
 * please restructure your code to batch your updates, and then use a
 * single synchronize_rcu() instead.
 *
 * Note that it is illegal to call this function while holding any lock
 * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal
 * to call this function from a CPU-hotplug notifier.  Failing to observe
 * these restrictions will result in deadlock.
 */
void synchronize_rcu_expedited(void)
{
	unsigned long flags;
	struct rcu_node *rnp;
	struct rcu_state *rsp = &rcu_preempt_state;
	unsigned long snap;
	int trycount = 0;

	smp_mb(); /* Caller's modifications seen first by other CPUs. */
	snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
	smp_mb(); /* Above access cannot bleed into critical section. */

	/*
	 * Block CPU-hotplug operations.  This means that any CPU-hotplug
	 * operation that finds an rcu_node structure with tasks in the
	 * process of being boosted will know that all tasks blocking
	 * this expedited grace period will already be in the process of
	 * being boosted.  This simplifies the process of moving tasks
	 * from leaf to root rcu_node structures.
	 */
	get_online_cpus();

	/*
	 * Acquire lock, falling back to synchronize_rcu() if too many
	 * lock-acquisition failures.  Of course, if someone does the
	 * expedited grace period for us, just leave.
	 */
	while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
		if (ULONG_CMP_LT(snap,
		    ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
			put_online_cpus();
			goto mb_ret; /* Others did our work for us. */
		}
		if (trycount++ < 10) {
			udelay(trycount * num_online_cpus());
		} else {
			put_online_cpus();
			wait_rcu_gp(call_rcu);
			return;
		}
	}
	if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
		put_online_cpus();
		goto unlock_mb_ret; /* Others did our work for us. */
	}

	/* force all RCU readers onto ->blkd_tasks lists. */
	synchronize_sched_expedited();

	/* Initialize ->expmask for all non-leaf rcu_node structures. */
	rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
		raw_spin_lock_irqsave(&rnp->lock, flags);
		rnp->expmask = rnp->qsmaskinit;
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
	}

	/* Snapshot current state of ->blkd_tasks lists. */
	rcu_for_each_leaf_node(rsp, rnp)
		sync_rcu_preempt_exp_init(rsp, rnp);
	if (NUM_RCU_NODES > 1)
		sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));

	put_online_cpus();

	/* Wait for snapshotted ->blkd_tasks lists to drain. */
	rnp = rcu_get_root(rsp);
	wait_event(sync_rcu_preempt_exp_wq,
		   sync_rcu_preempt_exp_done(rnp));

	/* Clean up and exit. */
	smp_mb(); /* ensure expedited GP seen before counter increment. */
	ACCESS_ONCE(sync_rcu_preempt_exp_count)++;
unlock_mb_ret:
	mutex_unlock(&sync_rcu_preempt_exp_mutex);
mb_ret:
	smp_mb(); /* ensure subsequent action seen after grace period. */
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);

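/*
 * Illustrative sketch (not part of the original file) of the batching
 * advice in the synchronize_rcu_expedited() header comment above.
 * update_one_item() is hypothetical.  Instead of:
 *
 *	for (i = 0; i < n; i++) {
 *		update_one_item(i);
 *		synchronize_rcu_expedited();
 *	}
 *
 * prefer a single grace-period wait for the whole batch:
 *
 *	for (i = 0; i < n; i++)
 *		update_one_item(i);
 *	synchronize_rcu();
 */
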
/**
 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
 *
 * Note that this primitive does not necessarily wait for an RCU grace period
 * to complete.  For example, if there are no RCU callbacks queued anywhere
 * in the system, then rcu_barrier() is within its rights to return
 * immediately, without waiting for anything, much less an RCU grace period.
 */
void rcu_barrier(void)
{
	_rcu_barrier(&rcu_preempt_state);
}
EXPORT_SYMBOL_GPL(rcu_barrier);

/*
 * Initialize preemptible RCU's state structures.
 */
static void __init __rcu_init_preempt(void)
{
	rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
}

/*
 * Check for a task exiting while in a preemptible-RCU read-side
 * critical section, clean up if so.  No need to issue warnings,
 * as debug_check_no_locks_held() already does this if lockdep
 * is enabled.
 */
void exit_rcu(void)
{
	struct task_struct *t = current;

	if (likely(list_empty(&current->rcu_node_entry)))
		return;
	t->rcu_read_lock_nesting = 1;
	barrier();
	t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
	__rcu_read_unlock();
}

#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */

static struct rcu_state *rcu_state = &rcu_sched_state;

/*
 * Tell them what RCU they are running.
 */
static void __init rcu_bootup_announce(void)
{
	pr_info("Hierarchical RCU implementation.\n");
	rcu_bootup_announce_oddness();
}

/*
 * Return the number of RCU batches processed thus far for debug & stats.
 */
long rcu_batches_completed(void)
{
	return rcu_batches_completed_sched();
}
EXPORT_SYMBOL_GPL(rcu_batches_completed);

/*
 * Force a quiescent state for RCU, which, because there is no preemptible
 * RCU, becomes the same as rcu-sched.
 */
void rcu_force_quiescent_state(void)
{
	rcu_sched_force_quiescent_state();
}
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);

/*
 * Because preemptible RCU does not exist, we never have to check for
 * CPUs being in quiescent states.
 */
static void rcu_preempt_note_context_switch(int cpu)
{
}

/*
 * Because preemptible RCU does not exist, there are never any preempted
 * RCU readers.
 */
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
	return 0;
}

#ifdef CONFIG_HOTPLUG_CPU

/* Because preemptible RCU does not exist, no quieting of tasks. */
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
{
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
}

#endif /* #ifdef CONFIG_HOTPLUG_CPU */

/*
 * Because preemptible RCU does not exist, we never have to check for
 * tasks blocked within RCU read-side critical sections.
 */
static void rcu_print_detail_task_stall(struct rcu_state *rsp)
{
}

/*
 * Because preemptible RCU does not exist, we never have to check for
 * tasks blocked within RCU read-side critical sections.
 */
static int rcu_print_task_stall(struct rcu_node *rnp)
{
	return 0;
}

/*
 * Because there is no preemptible RCU, there can be no readers blocked,
 * so there is no need to check for blocked tasks.  So check only for
 * bogus qsmask values.
 */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
	WARN_ON_ONCE(rnp->qsmask);
}

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Because preemptible RCU does not exist, it never needs to migrate
 * tasks that were blocked within RCU read-side critical sections, and
 * such non-existent tasks cannot possibly have been blocking the current
 * grace period.
 */
static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
				     struct rcu_node *rnp,
				     struct rcu_data *rdp)
{
	return 0;
}

#endif /* #ifdef CONFIG_HOTPLUG_CPU */

/*
 * Because preemptible RCU does not exist, it never has any callbacks
 * to check.
 */
static void rcu_preempt_check_callbacks(int cpu)
{
}

/*
 * Queue an RCU callback for lazy invocation after a grace period.
 * This will likely be later named something like "call_rcu_lazy()",
 * but this change will require some way of tagging the lazy RCU
 * callbacks in the list of pending callbacks.  Until then, this
 * function may only be called from __kfree_rcu().
 *
 * Because there is no preemptible RCU, we use RCU-sched instead.
 */
void kfree_call_rcu(struct rcu_head *head,
		    void (*func)(struct rcu_head *rcu))
{
	__call_rcu(head, func, &rcu_sched_state, -1, 1);
}
EXPORT_SYMBOL_GPL(kfree_call_rcu);

/*
 * Wait for an rcu-preempt grace period, but make it happen quickly.
 * But because preemptible RCU does not exist, map to rcu-sched.
 */
void synchronize_rcu_expedited(void)
{
	synchronize_sched_expedited();
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Because preemptible RCU does not exist, there is never any need to
 * report on tasks preempted in RCU read-side critical sections during
 * expedited RCU grace periods.
 */
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
			       bool wake)
{
}

#endif /* #ifdef CONFIG_HOTPLUG_CPU */

/*
 * Because preemptible RCU does not exist, rcu_barrier() is just
 * another name for rcu_barrier_sched().
 */
void rcu_barrier(void)
{
	rcu_barrier_sched();
}
EXPORT_SYMBOL_GPL(rcu_barrier);

/*
 * Because preemptible RCU does not exist, it need not be initialized.
 */
static void __init __rcu_init_preempt(void)
{
}

/*
 * Because preemptible RCU does not exist, tasks cannot possibly exit
 * while in preemptible RCU read-side critical sections.
 */
void exit_rcu(void)
{
}

#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */

#ifdef CONFIG_RCU_BOOST

#include "../locking/rtmutex_common.h"

#ifdef CONFIG_RCU_TRACE

static void rcu_initiate_boost_trace(struct rcu_node *rnp)
{
	if (list_empty(&rnp->blkd_tasks))
		rnp->n_balk_blkd_tasks++;
	else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
		rnp->n_balk_exp_gp_tasks++;
	else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
		rnp->n_balk_boost_tasks++;
	else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
		rnp->n_balk_notblocked++;
	else if (rnp->gp_tasks != NULL &&
		 ULONG_CMP_LT(jiffies, rnp->boost_time))
		rnp->n_balk_notyet++;
	else
		rnp->n_balk_nos++;
}

#else /* #ifdef CONFIG_RCU_TRACE */

static void rcu_initiate_boost_trace(struct rcu_node *rnp)
{
}

#endif /* #else #ifdef CONFIG_RCU_TRACE */

static void rcu_wake_cond(struct task_struct *t, int status)
{
	/*
	 * If the thread is yielding, only wake it when this
	 * is invoked from idle
	 */
	if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
		wake_up_process(t);
}

/*
 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
 * or ->boost_tasks, advancing the pointer to the next task in the
 * ->blkd_tasks list.
 *
 * Note that irqs must be enabled: boosting the task can block.
 * Returns 1 if there are more tasks needing to be boosted.
 */
static int rcu_boost(struct rcu_node *rnp)
{
	unsigned long flags;
	struct rt_mutex mtx;
	struct task_struct *t;
	struct list_head *tb;

	if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
		return 0;  /* Nothing left to boost. */

	raw_spin_lock_irqsave(&rnp->lock, flags);

	/*
	 * Recheck under the lock: all tasks in need of boosting
	 * might exit their RCU read-side critical sections on their own.
	 */
	if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
		return 0;
	}

	/*
	 * Preferentially boost tasks blocking expedited grace periods.
	 * This cannot starve the normal grace periods because a second
	 * expedited grace period must boost all blocked tasks, including
	 * those blocking the pre-existing normal grace period.
	 */
	if (rnp->exp_tasks != NULL) {
		tb = rnp->exp_tasks;
		rnp->n_exp_boosts++;
	} else {
		tb = rnp->boost_tasks;
		rnp->n_normal_boosts++;
	}
	rnp->n_tasks_boosted++;

	/*
	 * We boost task t by manufacturing an rt_mutex that appears to
	 * be held by task t.  We leave a pointer to that rt_mutex where
	 * task t can find it, and task t will release the mutex when it
	 * exits its outermost RCU read-side critical section.  Then
	 * simply acquiring this artificial rt_mutex will boost task
	 * t's priority.  (Thanks to tglx for suggesting this approach!)
	 *
	 * Note that task t must acquire rnp->lock to remove itself from
	 * the ->blkd_tasks list, which it will do from exit() if from
	 * nowhere else.  We therefore are guaranteed that task t will
	 * stay around at least until we drop rnp->lock.  Note that
	 * rnp->lock also resolves races between our priority boosting
	 * and task t's exiting its outermost RCU read-side critical
	 * section.
	 */
	t = container_of(tb, struct task_struct, rcu_node_entry);
	rt_mutex_init_proxy_locked(&mtx, t);
	t->rcu_boost_mutex = &mtx;
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
	rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */
	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */

	return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
	       ACCESS_ONCE(rnp->boost_tasks) != NULL;
}

/*
 * Priority-boosting kthread.  One per leaf rcu_node and one for the
 * root rcu_node.
 */
static int rcu_boost_kthread(void *arg)
{
	struct rcu_node *rnp = (struct rcu_node *)arg;
	int spincnt = 0;
	int more2boost;

	trace_rcu_utilization(TPS("Start boost kthread@init"));
	for (;;) {
		rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
		trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
		rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
		trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
		rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
		more2boost = rcu_boost(rnp);
		if (more2boost)
			spincnt++;
		else
			spincnt = 0;
		if (spincnt > 10) {
			rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
			trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
			schedule_timeout_interruptible(2);
			trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
			spincnt = 0;
		}
	}
	/* NOTREACHED */
	trace_rcu_utilization(TPS("End boost kthread@notreached"));
	return 0;
}

/*
 * Check to see if it is time to start boosting RCU readers that are
 * blocking the current grace period, and, if so, tell the per-rcu_node
 * kthread to start boosting them.  If there is an expedited grace
 * period in progress, it is always time to boost.
 *
 * The caller must hold rnp->lock, which this function releases.
 * The ->boost_kthread_task is immortal, so we don't need to worry
 * about it going away.
 */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
{
	struct task_struct *t;

	if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
		rnp->n_balk_exp_gp_tasks++;
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
		return;
	}
	if (rnp->exp_tasks != NULL ||
	    (rnp->gp_tasks != NULL &&
	     rnp->boost_tasks == NULL &&
	     rnp->qsmask == 0 &&
	     ULONG_CMP_GE(jiffies, rnp->boost_time))) {
		if (rnp->exp_tasks == NULL)
			rnp->boost_tasks = rnp->gp_tasks;
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
		t = rnp->boost_kthread_task;
		if (t)
			rcu_wake_cond(t, rnp->boost_kthread_status);
	} else {
		rcu_initiate_boost_trace(rnp);
		raw_spin_unlock_irqrestore(&rnp->lock, flags);
	}
}

/*
 * Wake up the per-CPU kthread to invoke RCU callbacks.
 */
static void invoke_rcu_callbacks_kthread(void)
{
	unsigned long flags;

	local_irq_save(flags);
	__this_cpu_write(rcu_cpu_has_work, 1);
	if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
	    current != __this_cpu_read(rcu_cpu_kthread_task)) {
		rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
			      __this_cpu_read(rcu_cpu_kthread_status));
	}
	local_irq_restore(flags);
}

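/*
 * Illustrative arithmetic (not part of the original file): the
 * RCU_BOOST_DELAY_JIFFIES definition below converts the millisecond
 * value of CONFIG_RCU_BOOST_DELAY to jiffies, rounding up.  Assuming
 * (hypothetically) CONFIG_RCU_BOOST_DELAY=500 and HZ=250:
 *
 *	DIV_ROUND_UP(500 * 250, 1000) == DIV_ROUND_UP(125000, 1000) == 125
 *
 * so boosting would not begin until 125 jiffies (500 ms) into the
 * grace period.
 */
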
/*
 * Is the current CPU running the RCU-callbacks kthread?
 * Caller must have preemption disabled.
 */
static bool rcu_is_callbacks_kthread(void)
{
	return __this_cpu_read(rcu_cpu_kthread_task) == current;
}

#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)

/*
 * Do priority-boost accounting for the start of a new grace period.
 */
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
	rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
}

/*
 * Create an RCU-boost kthread for the specified node if one does not
 * already exist.  We only create this kthread for preemptible RCU.
 * Returns zero if all is well, a negated errno otherwise.
 */
static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
				       struct rcu_node *rnp)
{
	int rnp_index = rnp - &rsp->node[0];
	unsigned long flags;
	struct sched_param sp;
	struct task_struct *t;

	if (&rcu_preempt_state != rsp)
		return 0;

	if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
		return 0;

	rsp->boost = 1;
	if (rnp->boost_kthread_task != NULL)
		return 0;
	t = kthread_create(rcu_boost_kthread, (void *)rnp,
			   "rcub/%d", rnp_index);
	if (IS_ERR(t))
		return PTR_ERR(t);
	raw_spin_lock_irqsave(&rnp->lock, flags);
	rnp->boost_kthread_task = t;
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
	sp.sched_priority = RCU_BOOST_PRIO;
	sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
	wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
	return 0;
}

static void rcu_kthread_do_work(void)
{
	rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
	rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
	rcu_preempt_do_callbacks();
}

static void rcu_cpu_kthread_setup(unsigned int cpu)
{
	struct sched_param sp;

	sp.sched_priority = RCU_KTHREAD_PRIO;
	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
}

static void rcu_cpu_kthread_park(unsigned int cpu)
{
	per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
}

static int rcu_cpu_kthread_should_run(unsigned int cpu)
{
	return __this_cpu_read(rcu_cpu_has_work);
}

/*
 * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
 * RCU softirq used in flavors and configurations of RCU that do not
 * support RCU priority boosting.
 */
static void rcu_cpu_kthread(unsigned int cpu)
{
	unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
	char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
	int spincnt;

	for (spincnt = 0; spincnt < 10; spincnt++) {
		trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
		local_bh_disable();
		*statusp = RCU_KTHREAD_RUNNING;
		this_cpu_inc(rcu_cpu_kthread_loops);
		local_irq_disable();
		work = *workp;
		*workp = 0;
		local_irq_enable();
		if (work)
			rcu_kthread_do_work();
		local_bh_enable();
		if (*workp == 0) {
			trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
			*statusp = RCU_KTHREAD_WAITING;
			return;
		}
	}
	*statusp = RCU_KTHREAD_YIELDING;
	trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
	schedule_timeout_interruptible(2);
	trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
	*statusp = RCU_KTHREAD_WAITING;
}

/*
 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
 * served by the rcu_node in question.  The CPU hotplug lock is still
 * held, so the value of rnp->qsmaskinit will be stable.
 *
 * We don't include outgoingcpu in the affinity set, use -1 if there is
 * no outgoing CPU.  If there are no CPUs left in the affinity set,
 * this function allows the kthread to execute on any CPU.
 */
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
	struct task_struct *t = rnp->boost_kthread_task;
	unsigned long mask = rnp->qsmaskinit;
	cpumask_var_t cm;
	int cpu;

	if (!t)
		return;
	if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
		return;
	for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
		if ((mask & 0x1) && cpu != outgoingcpu)
			cpumask_set_cpu(cpu, cm);
	if (cpumask_weight(cm) == 0) {
		cpumask_setall(cm);
		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
			cpumask_clear_cpu(cpu, cm);
		WARN_ON_ONCE(cpumask_weight(cm) == 0);
	}
	set_cpus_allowed_ptr(t, cm);
	free_cpumask_var(cm);
}

static struct smp_hotplug_thread rcu_cpu_thread_spec = {
	.store			= &rcu_cpu_kthread_task,
	.thread_should_run	= rcu_cpu_kthread_should_run,
	.thread_fn		= rcu_cpu_kthread,
	.thread_comm		= "rcuc/%u",
	.setup			= rcu_cpu_kthread_setup,
	.park			= rcu_cpu_kthread_park,
};

/*
 * Spawn all kthreads -- called as soon as the scheduler is running.
 */
static int __init rcu_spawn_kthreads(void)
{
	struct rcu_node *rnp;
	int cpu;

	rcu_scheduler_fully_active = 1;
	for_each_possible_cpu(cpu)
		per_cpu(rcu_cpu_has_work, cpu) = 0;
	BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
	rnp = rcu_get_root(rcu_state);
	(void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
	if (NUM_RCU_NODES > 1) {
		rcu_for_each_leaf_node(rcu_state, rnp)
			(void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
	}
	return 0;
}
early_initcall(rcu_spawn_kthreads);

static void rcu_prepare_kthreads(int cpu)
{
	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
	struct rcu_node *rnp = rdp->mynode;

	/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
	if (rcu_scheduler_fully_active)
		(void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
}

#else /* #ifdef CONFIG_RCU_BOOST */

static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
{
	raw_spin_unlock_irqrestore(&rnp->lock, flags);
}

static void invoke_rcu_callbacks_kthread(void)
{
	WARN_ON_ONCE(1);
}

static bool rcu_is_callbacks_kthread(void)
{
	return false;
}

static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}

static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}

static int __init rcu_scheduler_really_started(void)
{
	rcu_scheduler_fully_active = 1;
	return 0;
}
early_initcall(rcu_scheduler_really_started);

static void rcu_prepare_kthreads(int cpu)
{
}

#endif /* #else #ifdef CONFIG_RCU_BOOST */

#if !defined(CONFIG_RCU_FAST_NO_HZ)

/*
 * Check to see if any future RCU-related work will need to be done
 * by the current CPU, even if none need be done immediately, returning
 * 1 if so.  This function is part of the RCU implementation; it is -not-
 * an exported member of the RCU API.
 *
 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
 * any flavor of RCU.
 */
int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
{
	*delta_jiffies = ULONG_MAX;
	return rcu_cpu_has_callbacks(cpu, NULL);
}

/*
 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
 * after it.
 */
static void rcu_cleanup_after_idle(int cpu)
{
}

/*
 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
 * is nothing.
 */
static void rcu_prepare_for_idle(int cpu)
{
}

/*
 * Don't bother keeping a running count of the number of RCU callbacks
 * posted because CONFIG_RCU_FAST_NO_HZ=n.
 */
static void rcu_idle_count_callbacks_posted(void)
{
}

#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */

/*
 * This code is invoked when a CPU goes idle, at which point we want
 * to have the CPU do everything required for RCU so that it can enter
 * the energy-efficient dyntick-idle mode.  This is handled by a
 * state machine implemented by rcu_prepare_for_idle() below.
 *
 * The following two preprocessor symbols control this state machine:
 *
 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
 *	to sleep in dyntick-idle mode with RCU callbacks pending.  This
 *	is sized to be roughly one RCU grace period.  Those energy-efficiency
 *	benchmarkers who might otherwise be tempted to set this to a large
 *	number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
 *	system.  And if you are -that- concerned about energy efficiency,
 *	just power the system down and be done with it!
 * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
 *	permitted to sleep in dyntick-idle mode with only lazy RCU
 *	callbacks pending.  Setting this too high can OOM your system.
 *
 * The values below work well in practice.  If future workloads require
 * adjustment, they can be converted into kernel config parameters, though
 * making the state machine smarter might be a better option.
 */
#define RCU_IDLE_GP_DELAY 4		/* Roughly one grace period. */
#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ)	/* Roughly six seconds. */

static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
module_param(rcu_idle_gp_delay, int, 0644);
static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
module_param(rcu_idle_lazy_gp_delay, int, 0644);

extern int tick_nohz_active;

/*
 * Try to advance callbacks for all flavors of RCU on the current CPU, but
 * only if it has been awhile since the last time we did so.  Afterwards,
 * if there are any callbacks ready for immediate invocation, return true.
 */
static bool rcu_try_advance_all_cbs(void)
{
	bool cbs_ready = false;
	struct rcu_data *rdp;
	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
	struct rcu_node *rnp;
	struct rcu_state *rsp;

	/* Exit early if we advanced recently. */
	if (jiffies == rdtp->last_advance_all)
		return 0;
	rdtp->last_advance_all = jiffies;

	for_each_rcu_flavor(rsp) {
		rdp = this_cpu_ptr(rsp->rda);
		rnp = rdp->mynode;

		/*
		 * Don't bother checking unless a grace period has
		 * completed since we last checked and there are
		 * callbacks not yet ready to invoke.
		 */
		if (rdp->completed != rnp->completed &&
		    rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
			note_gp_changes(rsp, rdp);

		if (cpu_has_callbacks_ready_to_invoke(rdp))
			cbs_ready = true;
	}
	return cbs_ready;
}

/*
 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
 * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
 * caller to set the timeout based on whether or not there are non-lazy
 * callbacks.
 *
 * The caller must have disabled interrupts.
 */
int rcu_needs_cpu(int cpu, unsigned long *dj)
{
	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);

	/* Snapshot to detect later posting of non-lazy callback. */
	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;

	/* If no callbacks, RCU doesn't need the CPU. */
	if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
		*dj = ULONG_MAX;
		return 0;
	}

	/* Attempt to advance callbacks. */
	if (rcu_try_advance_all_cbs()) {
		/* Some ready to invoke, so initiate later invocation. */
		invoke_rcu_core();
		return 1;
	}
	rdtp->last_accelerate = jiffies;

	/* Request timer delay depending on laziness, and round. */
	if (!rdtp->all_lazy) {
		*dj = round_up(rcu_idle_gp_delay + jiffies,
			       rcu_idle_gp_delay) - jiffies;
	} else {
		*dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
	}
	return 0;
}

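/*
 * Illustrative arithmetic (not part of the original file): the non-lazy
 * case in rcu_needs_cpu() above rounds the requested wakeup so that it
 * lands on a jiffies value that is a multiple of rcu_idle_gp_delay,
 * which tends to batch wakeups across CPUs.  Assuming (hypothetically)
 * rcu_idle_gp_delay=4 and jiffies=1000006:
 *
 *	round_up(4 + 1000006, 4) - 1000006 == 1000012 - 1000006 == 6
 *
 * so the CPU asks for a 6-jiffy timeout, waking at jiffies=1000012,
 * a multiple of 4.
 */
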
1722 */ 1723 static void rcu_prepare_for_idle(int cpu) 1724 { 1725 struct rcu_data *rdp; 1726 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1727 struct rcu_node *rnp; 1728 struct rcu_state *rsp; 1729 int tne; 1730 1731 /* Handle nohz enablement switches conservatively. */ 1732 tne = ACCESS_ONCE(tick_nohz_active); 1733 if (tne != rdtp->tick_nohz_enabled_snap) { 1734 if (rcu_cpu_has_callbacks(cpu, NULL)) 1735 invoke_rcu_core(); /* force nohz to see update. */ 1736 rdtp->tick_nohz_enabled_snap = tne; 1737 return; 1738 } 1739 if (!tne) 1740 return; 1741 1742 /* If this is a no-CBs CPU, no callbacks, just return. */ 1743 if (rcu_is_nocb_cpu(cpu)) 1744 return; 1745 1746 /* 1747 * If a non-lazy callback arrived at a CPU having only lazy 1748 * callbacks, invoke RCU core for the side-effect of recalculating 1749 * idle duration on re-entry to idle. 1750 */ 1751 if (rdtp->all_lazy && 1752 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { 1753 rdtp->all_lazy = false; 1754 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; 1755 invoke_rcu_core(); 1756 return; 1757 } 1758 1759 /* 1760 * If we have not yet accelerated this jiffy, accelerate all 1761 * callbacks on this CPU. 1762 */ 1763 if (rdtp->last_accelerate == jiffies) 1764 return; 1765 rdtp->last_accelerate = jiffies; 1766 for_each_rcu_flavor(rsp) { 1767 rdp = per_cpu_ptr(rsp->rda, cpu); 1768 if (!*rdp->nxttail[RCU_DONE_TAIL]) 1769 continue; 1770 rnp = rdp->mynode; 1771 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1772 rcu_accelerate_cbs(rsp, rnp, rdp); 1773 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1774 } 1775 } 1776 1777 /* 1778 * Clean up for exit from idle. Attempt to advance callbacks based on 1779 * any grace periods that elapsed while the CPU was idle, and if any 1780 * callbacks are now ready to invoke, initiate invocation. 1781 */ 1782 static void rcu_cleanup_after_idle(int cpu) 1783 { 1784 1785 if (rcu_is_nocb_cpu(cpu)) 1786 return; 1787 if (rcu_try_advance_all_cbs()) 1788 invoke_rcu_core(); 1789 } 1790 1791 /* 1792 * Keep a running count of the number of non-lazy callbacks posted 1793 * on this CPU. This running counter (which is never decremented) allows 1794 * rcu_prepare_for_idle() to detect when something out of the idle loop 1795 * posts a callback, even if an equal number of callbacks are invoked. 1796 * Of course, callbacks should only be posted from within a trace event 1797 * designed to be called from idle or from within RCU_NONIDLE(). 1798 */ 1799 static void rcu_idle_count_callbacks_posted(void) 1800 { 1801 __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); 1802 } 1803 1804 /* 1805 * Data for flushing lazy RCU callbacks at OOM time. 1806 */ 1807 static atomic_t oom_callback_count; 1808 static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); 1809 1810 /* 1811 * RCU OOM callback -- decrement the outstanding count and deliver the 1812 * wake-up if we are the last one. 1813 */ 1814 static void rcu_oom_callback(struct rcu_head *rhp) 1815 { 1816 if (atomic_dec_and_test(&oom_callback_count)) 1817 wake_up(&oom_callback_wq); 1818 } 1819 1820 /* 1821 * Post an rcu_oom_notify callback on the current CPU if it has at 1822 * least one lazy callback. This will unnecessarily post callbacks 1823 * to CPUs that already have a non-lazy callback at the end of their 1824 * callback list, but this is an infrequent operation, so accept some 1825 * extra overhead to keep things simple. 
1826 */ 1827 static void rcu_oom_notify_cpu(void *unused) 1828 { 1829 struct rcu_state *rsp; 1830 struct rcu_data *rdp; 1831 1832 for_each_rcu_flavor(rsp) { 1833 rdp = __this_cpu_ptr(rsp->rda); 1834 if (rdp->qlen_lazy != 0) { 1835 atomic_inc(&oom_callback_count); 1836 rsp->call(&rdp->oom_head, rcu_oom_callback); 1837 } 1838 } 1839 } 1840 1841 /* 1842 * If low on memory, ensure that each CPU has a non-lazy callback. 1843 * This will wake up CPUs that have only lazy callbacks, in turn 1844 * ensuring that they free up the corresponding memory in a timely manner. 1845 * Because an uncertain amount of memory will be freed in some uncertain 1846 * timeframe, we do not claim to have freed anything. 1847 */ 1848 static int rcu_oom_notify(struct notifier_block *self, 1849 unsigned long notused, void *nfreed) 1850 { 1851 int cpu; 1852 1853 /* Wait for callbacks from earlier instance to complete. */ 1854 wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); 1855 1856 /* 1857 * Prevent premature wakeup: ensure that all increments happen 1858 * before there is a chance of the counter reaching zero. 1859 */ 1860 atomic_set(&oom_callback_count, 1); 1861 1862 get_online_cpus(); 1863 for_each_online_cpu(cpu) { 1864 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); 1865 cond_resched(); 1866 } 1867 put_online_cpus(); 1868 1869 /* Unconditionally decrement: no need to wake ourselves up. */ 1870 atomic_dec(&oom_callback_count); 1871 1872 return NOTIFY_OK; 1873 } 1874 1875 static struct notifier_block rcu_oom_nb = { 1876 .notifier_call = rcu_oom_notify 1877 }; 1878 1879 static int __init rcu_register_oom_notifier(void) 1880 { 1881 register_oom_notifier(&rcu_oom_nb); 1882 return 0; 1883 } 1884 early_initcall(rcu_register_oom_notifier); 1885 1886 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1887 1888 #ifdef CONFIG_RCU_CPU_STALL_INFO 1889 1890 #ifdef CONFIG_RCU_FAST_NO_HZ 1891 1892 static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 1893 { 1894 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1895 unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; 1896 1897 sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", 1898 rdtp->last_accelerate & 0xffff, jiffies & 0xffff, 1899 ulong2long(nlpd), 1900 rdtp->all_lazy ? 'L' : '.', 1901 rdtp->tick_nohz_enabled_snap ? '.' : 'D'); 1902 } 1903 1904 #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 1905 1906 static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 1907 { 1908 *cp = '\0'; 1909 } 1910 1911 #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ 1912 1913 /* Initiate the stall-info list. */ 1914 static void print_cpu_stall_info_begin(void) 1915 { 1916 pr_cont("\n"); 1917 } 1918 1919 /* 1920 * Print out diagnostic information for the specified stalled CPU. 1921 * 1922 * If the specified CPU is aware of the current RCU grace period 1923 * (flavor specified by rsp), then print the number of scheduling 1924 * clock interrupts the CPU has taken during the time that it has 1925 * been aware. Otherwise, print the number of RCU grace periods 1926 * that this CPU is ignorant of, for example, "1" if the CPU was 1927 * aware of the previous grace period. 1928 * 1929 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. 
1930 */ 1931 static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) 1932 { 1933 char fast_no_hz[72]; 1934 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1935 struct rcu_dynticks *rdtp = rdp->dynticks; 1936 char *ticks_title; 1937 unsigned long ticks_value; 1938 1939 if (rsp->gpnum == rdp->gpnum) { 1940 ticks_title = "ticks this GP"; 1941 ticks_value = rdp->ticks_this_gp; 1942 } else { 1943 ticks_title = "GPs behind"; 1944 ticks_value = rsp->gpnum - rdp->gpnum; 1945 } 1946 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1947 pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", 1948 cpu, ticks_value, ticks_title, 1949 atomic_read(&rdtp->dynticks) & 0xfff, 1950 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1951 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), 1952 fast_no_hz); 1953 } 1954 1955 /* Terminate the stall-info list. */ 1956 static void print_cpu_stall_info_end(void) 1957 { 1958 pr_err("\t"); 1959 } 1960 1961 /* Zero ->ticks_this_gp for all flavors of RCU. */ 1962 static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1963 { 1964 rdp->ticks_this_gp = 0; 1965 rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); 1966 } 1967 1968 /* Increment ->ticks_this_gp for all flavors of RCU. */ 1969 static void increment_cpu_stall_ticks(void) 1970 { 1971 struct rcu_state *rsp; 1972 1973 for_each_rcu_flavor(rsp) 1974 __this_cpu_ptr(rsp->rda)->ticks_this_gp++; 1975 } 1976 1977 #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 1978 1979 static void print_cpu_stall_info_begin(void) 1980 { 1981 pr_cont(" {"); 1982 } 1983 1984 static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) 1985 { 1986 pr_cont(" %d", cpu); 1987 } 1988 1989 static void print_cpu_stall_info_end(void) 1990 { 1991 pr_cont("} "); 1992 } 1993 1994 static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1995 { 1996 } 1997 1998 static void increment_cpu_stall_ticks(void) 1999 { 2000 } 2001 2002 #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ 2003 2004 #ifdef CONFIG_RCU_NOCB_CPU 2005 2006 /* 2007 * Offload callback processing from the boot-time-specified set of CPUs 2008 * specified by rcu_nocb_mask. For each CPU in the set, there is a 2009 * kthread created that pulls the callbacks from the corresponding CPU, 2010 * waits for a grace period to elapse, and invokes the callbacks. 2011 * The no-CBs CPUs do a wake_up() on their kthread when they insert 2012 * a callback into any empty list, unless the rcu_nocb_poll boot parameter 2013 * has been specified, in which case each kthread actively polls its 2014 * CPU. (Which isn't so great for energy efficiency, but which does 2015 * reduce RCU's overhead on that CPU.) 2016 * 2017 * This is intended to be used in conjunction with Frederic Weisbecker's 2018 * adaptive-idle work, which would seriously reduce OS jitter on CPUs 2019 * running CPU-bound user-mode computations. 2020 * 2021 * Offloading of callback processing could also in theory be used as 2022 * an energy-efficiency measure because CPUs with no RCU callbacks 2023 * queued are more aggressive about entering dyntick-idle mode. 2024 */ 2025 2026 2027 /* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. 
*/ 2028 static int __init rcu_nocb_setup(char *str) 2029 { 2030 alloc_bootmem_cpumask_var(&rcu_nocb_mask); 2031 have_rcu_nocb_mask = true; 2032 cpulist_parse(str, rcu_nocb_mask); 2033 return 1; 2034 } 2035 __setup("rcu_nocbs=", rcu_nocb_setup); 2036 2037 static int __init parse_rcu_nocb_poll(char *arg) 2038 { 2039 rcu_nocb_poll = 1; 2040 return 0; 2041 } 2042 early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 2043 2044 /* 2045 * Do any no-CBs CPUs need another grace period? 2046 * 2047 * Interrupts must be disabled. If the caller does not hold the root 2048 * rcu_node structure's ->lock, the results are advisory only. 2049 */ 2050 static int rcu_nocb_needs_gp(struct rcu_state *rsp) 2051 { 2052 struct rcu_node *rnp = rcu_get_root(rsp); 2053 2054 return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; 2055 } 2056 2057 /* 2058 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended 2059 * grace period. 2060 */ 2061 static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2062 { 2063 wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); 2064 } 2065 2066 /* 2067 * Set the root rcu_node structure's ->need_future_gp field 2068 * based on the sum of those of all rcu_node structures. This does 2069 * double-count the root rcu_node structure's requests, but this 2070 * is necessary to handle the possibility of a rcu_nocb_kthread() 2071 * having awakened during the time that the rcu_node structures 2072 * were being updated for the end of the previous grace period. 2073 */ 2074 static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) 2075 { 2076 rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; 2077 } 2078 2079 static void rcu_init_one_nocb(struct rcu_node *rnp) 2080 { 2081 init_waitqueue_head(&rnp->nocb_gp_wq[0]); 2082 init_waitqueue_head(&rnp->nocb_gp_wq[1]); 2083 } 2084 2085 /* Is the specified CPU a no-CBs CPU? */ 2086 bool rcu_is_nocb_cpu(int cpu) 2087 { 2088 if (have_rcu_nocb_mask) 2089 return cpumask_test_cpu(cpu, rcu_nocb_mask); 2090 return false; 2091 } 2092 2093 /* 2094 * Enqueue the specified string of rcu_head structures onto the specified 2095 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the 2096 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy 2097 * counts are supplied by rhcount and rhcount_lazy. 2098 * 2099 * If warranted, also wake up the kthread servicing this CPU's queues. 2100 */ 2101 static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, 2102 struct rcu_head *rhp, 2103 struct rcu_head **rhtp, 2104 int rhcount, int rhcount_lazy) 2105 { 2106 int len; 2107 struct rcu_head **old_rhpp; 2108 struct task_struct *t; 2109 2110 /* Enqueue the callback on the nocb list and update counts. */ 2111 old_rhpp = xchg(&rdp->nocb_tail, rhtp); 2112 ACCESS_ONCE(*old_rhpp) = rhp; 2113 atomic_long_add(rhcount, &rdp->nocb_q_count); 2114 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); 2115 2116 /* If we are not being polled and there is a kthread, awaken it ... */ 2117 t = ACCESS_ONCE(rdp->nocb_kthread); 2118 if (rcu_nocb_poll || !t) { 2119 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2120 TPS("WakeNotPoll")); 2121 return; 2122 } 2123 len = atomic_long_read(&rdp->nocb_q_count); 2124 if (old_rhpp == &rdp->nocb_head) { 2125 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ 2126 rdp->qlen_last_fqs_check = 0; 2127 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); 2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2129 wake_up_process(t); /* ...
or if many callbacks queued. */ 2130 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2131 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); 2132 } else { 2133 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); 2134 } 2135 return; 2136 } 2137 2138 /* 2139 * This is a helper for __call_rcu(), which invokes this when the normal 2140 * callback queue is inoperable. If this is not a no-CBs CPU, this 2141 * function returns failure back to __call_rcu(), which can complain 2142 * appropriately. 2143 * 2144 * Otherwise, this function queues the callback where the corresponding 2145 * "rcuo" kthread can find it. 2146 */ 2147 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2148 bool lazy) 2149 { 2150 2151 if (!rcu_is_nocb_cpu(rdp->cpu)) 2152 return 0; 2153 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); 2154 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2155 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2156 (unsigned long)rhp->func, 2157 -atomic_long_read(&rdp->nocb_q_count_lazy), 2158 -atomic_long_read(&rdp->nocb_q_count)); 2159 else 2160 trace_rcu_callback(rdp->rsp->name, rhp, 2161 -atomic_long_read(&rdp->nocb_q_count_lazy), 2162 -atomic_long_read(&rdp->nocb_q_count)); 2163 return 1; 2164 } 2165 2166 /* 2167 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is 2168 * not a no-CBs CPU. 2169 */ 2170 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2171 struct rcu_data *rdp) 2172 { 2173 long ql = rsp->qlen; 2174 long qll = rsp->qlen_lazy; 2175 2176 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ 2177 if (!rcu_is_nocb_cpu(smp_processor_id())) 2178 return 0; 2179 rsp->qlen = 0; 2180 rsp->qlen_lazy = 0; 2181 2182 /* First, enqueue the donelist, if any. This preserves CB ordering. */ 2183 if (rsp->orphan_donelist != NULL) { 2184 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, 2185 rsp->orphan_donetail, ql, qll); 2186 ql = qll = 0; 2187 rsp->orphan_donelist = NULL; 2188 rsp->orphan_donetail = &rsp->orphan_donelist; 2189 } 2190 if (rsp->orphan_nxtlist != NULL) { 2191 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, 2192 rsp->orphan_nxttail, ql, qll); 2193 ql = qll = 0; 2194 rsp->orphan_nxtlist = NULL; 2195 rsp->orphan_nxttail = &rsp->orphan_nxtlist; 2196 } 2197 return 1; 2198 } 2199 2200 /* 2201 * If necessary, kick off a new grace period, and either way wait 2202 * for a subsequent grace period to complete. 2203 */ 2204 static void rcu_nocb_wait_gp(struct rcu_data *rdp) 2205 { 2206 unsigned long c; 2207 bool d; 2208 unsigned long flags; 2209 struct rcu_node *rnp = rdp->mynode; 2210 2211 raw_spin_lock_irqsave(&rnp->lock, flags); 2212 c = rcu_start_future_gp(rnp, rdp); 2213 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2214 2215 /* 2216 * Wait for the grace period. Do so interruptibly to avoid messing 2217 * up the load average. 2218 */ 2219 trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); 2220 for (;;) { 2221 wait_event_interruptible( 2222 rnp->nocb_gp_wq[c & 0x1], 2223 (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); 2224 if (likely(d)) 2225 break; 2226 flush_signals(current); 2227 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); 2228 } 2229 trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); 2230 smp_mb(); /* Ensure that CB invocation happens after GP end. */ 2231 } 2232 2233 /* 2234 * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes 2235 * callbacks queued by the corresponding no-CBs CPU. 
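 *
 * In outline: unless rcu_nocb_poll is set, the kthread sleeps on
 * ->nocb_wq until __call_rcu_nocb_enqueue() wakes it, which happens
 * when a callback lands on an empty queue or when the queue grows past
 * ->qlen_last_fqs_check + qhimark. The kthread then detaches the whole
 * ->nocb_head list, waits for a grace period via rcu_nocb_wait_gp(),
 * and invokes the detached callbacks one at a time with softirqs
 * disabled.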
2236 */ 2237 static int rcu_nocb_kthread(void *arg) 2238 { 2239 int c, cl; 2240 bool firsttime = 1; 2241 struct rcu_head *list; 2242 struct rcu_head *next; 2243 struct rcu_head **tail; 2244 struct rcu_data *rdp = arg; 2245 2246 /* Each pass through this loop invokes one batch of callbacks */ 2247 for (;;) { 2248 /* If not polling, wait for next batch of callbacks. */ 2249 if (!rcu_nocb_poll) { 2250 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2251 TPS("Sleep")); 2252 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); 2253 } else if (firsttime) { 2254 firsttime = 0; 2255 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2256 TPS("Poll")); 2257 } 2258 list = ACCESS_ONCE(rdp->nocb_head); 2259 if (!list) { 2260 if (!rcu_nocb_poll) 2261 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2262 TPS("WokeEmpty")); 2263 schedule_timeout_interruptible(1); 2264 flush_signals(current); 2265 continue; 2266 } 2267 firsttime = 1; 2268 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2269 TPS("WokeNonEmpty")); 2270 2271 /* 2272 * Extract queued callbacks, update counts, and wait 2273 * for a grace period to elapse. 2274 */ 2275 ACCESS_ONCE(rdp->nocb_head) = NULL; 2276 tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); 2277 c = atomic_long_xchg(&rdp->nocb_q_count, 0); 2278 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); 2279 ACCESS_ONCE(rdp->nocb_p_count) += c; 2280 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; 2281 rcu_nocb_wait_gp(rdp); 2282 2283 /* Each pass through the following loop invokes a callback. */ 2284 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2285 c = cl = 0; 2286 while (list) { 2287 next = list->next; 2288 /* Wait for enqueuing to complete, if needed. */ 2289 while (next == NULL && &list->next != tail) { 2290 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2291 TPS("WaitQueue")); 2292 schedule_timeout_interruptible(1); 2293 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2294 TPS("WokeQueue")); 2295 next = list->next; 2296 } 2297 debug_rcu_head_unqueue(list); 2298 local_bh_disable(); 2299 if (__rcu_reclaim(rdp->rsp->name, list)) 2300 cl++; 2301 c++; 2302 local_bh_enable(); 2303 list = next; 2304 } 2305 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); 2306 ACCESS_ONCE(rdp->nocb_p_count) -= c; 2307 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; 2308 rdp->n_nocbs_invoked += c; 2309 } 2310 return 0; 2311 } 2312 2313 /* Initialize per-rcu_data variables for no-CBs CPUs. */ 2314 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2315 { 2316 rdp->nocb_tail = &rdp->nocb_head; 2317 init_waitqueue_head(&rdp->nocb_wq); 2318 } 2319 2320 /* Create a kthread for each RCU flavor for each no-CBs CPU. 
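 *
 * For example, with rcu_nocbs=2 on a CONFIG_TREE_PREEMPT_RCU kernel,
 * the "rcuo%c/%d" name below typically produces one kthread per flavor
 * for CPU 2, named something like rcuop/2, rcuob/2, and rcuos/2, the
 * flavor letter coming from rsp->abbr.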
*/ 2321 static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2322 { 2323 int cpu; 2324 struct rcu_data *rdp; 2325 struct task_struct *t; 2326 2327 if (rcu_nocb_mask == NULL) 2328 return; 2329 for_each_cpu(cpu, rcu_nocb_mask) { 2330 rdp = per_cpu_ptr(rsp->rda, cpu); 2331 t = kthread_run(rcu_nocb_kthread, rdp, 2332 "rcuo%c/%d", rsp->abbr, cpu); 2333 BUG_ON(IS_ERR(t)); 2334 ACCESS_ONCE(rdp->nocb_kthread) = t; 2335 } 2336 } 2337 2338 /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ 2339 static bool init_nocb_callback_list(struct rcu_data *rdp) 2340 { 2341 if (rcu_nocb_mask == NULL || 2342 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) 2343 return false; 2344 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2345 return true; 2346 } 2347 2348 #else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2349 2350 static int rcu_nocb_needs_gp(struct rcu_state *rsp) 2351 { 2352 return 0; 2353 } 2354 2355 static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2356 { 2357 } 2358 2359 static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) 2360 { 2361 } 2362 2363 static void rcu_init_one_nocb(struct rcu_node *rnp) 2364 { 2365 } 2366 2367 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2368 bool lazy) 2369 { 2370 return 0; 2371 } 2372 2373 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2374 struct rcu_data *rdp) 2375 { 2376 return 0; 2377 } 2378 2379 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2380 { 2381 } 2382 2383 static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2384 { 2385 } 2386 2387 static bool init_nocb_callback_list(struct rcu_data *rdp) 2388 { 2389 return false; 2390 } 2391 2392 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 2393 2394 /* 2395 * An adaptive-ticks CPU can potentially execute in kernel mode for an 2396 * arbitrarily long period of time with the scheduling-clock tick turned 2397 * off. RCU will be paying attention to this CPU because it is in the 2398 * kernel, but the CPU cannot be guaranteed to be executing the RCU state 2399 * machine because the scheduling-clock tick has been disabled. Therefore, 2400 * if an adaptive-ticks CPU is failing to respond to the current grace 2401 * period and has not been idle from an RCU perspective, kick it. 2402 */ 2403 static void rcu_kick_nohz_cpu(int cpu) 2404 { 2405 #ifdef CONFIG_NO_HZ_FULL 2406 if (tick_nohz_full_cpu(cpu)) 2407 smp_send_reschedule(cpu); 2408 #endif /* #ifdef CONFIG_NO_HZ_FULL */ 2409 } 2410 2411 2412 #ifdef CONFIG_NO_HZ_FULL_SYSIDLE 2413 2414 /* 2415 * Define RCU flavor that holds sysidle state. This needs to be the 2416 * most active flavor of RCU. 2417 */ 2418 #ifdef CONFIG_PREEMPT_RCU 2419 static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; 2420 #else /* #ifdef CONFIG_PREEMPT_RCU */ 2421 static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; 2422 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ 2423 2424 static int full_sysidle_state; /* Current system-idle state. */ 2425 #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ 2426 #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ 2427 #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ 2428 #define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ 2429 #define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ 2430 2431 /* 2432 * Invoked to note exit from irq or task transition to idle. Note that 2433 * usermode execution does -not- count as idle here!
After all, we want 2434 * to detect full-system idle states, not RCU quiescent states and grace 2435 * periods. The caller must have disabled interrupts. 2436 */ 2437 static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) 2438 { 2439 unsigned long j; 2440 2441 /* Adjust nesting, check for fully idle. */ 2442 if (irq) { 2443 rdtp->dynticks_idle_nesting--; 2444 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); 2445 if (rdtp->dynticks_idle_nesting != 0) 2446 return; /* Still not fully idle. */ 2447 } else { 2448 if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == 2449 DYNTICK_TASK_NEST_VALUE) { 2450 rdtp->dynticks_idle_nesting = 0; 2451 } else { 2452 rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; 2453 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); 2454 return; /* Still not fully idle. */ 2455 } 2456 } 2457 2458 /* Record start of fully idle period. */ 2459 j = jiffies; 2460 ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; 2461 smp_mb__before_atomic_inc(); 2462 atomic_inc(&rdtp->dynticks_idle); 2463 smp_mb__after_atomic_inc(); 2464 WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); 2465 } 2466 2467 /* 2468 * Unconditionally force exit from full system-idle state. This is 2469 * invoked when a normal CPU exits idle, but must be called separately 2470 * for the timekeeping CPU (tick_do_timer_cpu). The reason for this 2471 * is that the timekeeping CPU is permitted to take scheduling-clock 2472 * interrupts while the system is in system-idle state, and of course 2473 * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock 2474 * interrupt from any other type of interrupt. 2475 */ 2476 void rcu_sysidle_force_exit(void) 2477 { 2478 int oldstate = ACCESS_ONCE(full_sysidle_state); 2479 int newoldstate; 2480 2481 /* 2482 * Each pass through the following loop attempts to exit full 2483 * system-idle state. If contention proves to be a problem, 2484 * a trylock-based contention tree could be used here. 2485 */ 2486 while (oldstate > RCU_SYSIDLE_SHORT) { 2487 newoldstate = cmpxchg(&full_sysidle_state, 2488 oldstate, RCU_SYSIDLE_NOT); 2489 if (oldstate == newoldstate && 2490 oldstate == RCU_SYSIDLE_FULL_NOTED) { 2491 rcu_kick_nohz_cpu(tick_do_timer_cpu); 2492 return; /* We cleared it, done! */ 2493 } 2494 oldstate = newoldstate; 2495 } 2496 smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ 2497 } 2498 2499 /* 2500 * Invoked to note entry to irq or task transition from idle. Note that 2501 * usermode execution does -not- count as idle here! The caller must 2502 * have disabled interrupts. 2503 */ 2504 static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 2505 { 2506 /* Adjust nesting, check for already non-idle. */ 2507 if (irq) { 2508 rdtp->dynticks_idle_nesting++; 2509 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); 2510 if (rdtp->dynticks_idle_nesting != 1) 2511 return; /* Already non-idle. */ 2512 } else { 2513 /* 2514 * Allow for irq misnesting. Yes, it really is possible 2515 * to enter an irq handler then never leave it, and maybe 2516 * also vice versa. Handle both possibilities. 2517 */ 2518 if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { 2519 rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; 2520 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); 2521 return; /* Already non-idle. */ 2522 } else { 2523 rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; 2524 } 2525 } 2526 2527 /* Record end of idle period. 
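 * The atomic_inc() below leaves ->dynticks_idle odd, the convention
 * being that this counter is even only while the CPU is fully idle;
 * rcu_sysidle_check_cpu() relies on this when it treats an odd value
 * as "not idle".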
*/ 2528 smp_mb__before_atomic_inc(); 2529 atomic_inc(&rdtp->dynticks_idle); 2530 smp_mb__after_atomic_inc(); 2531 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); 2532 2533 /* 2534 * If we are the timekeeping CPU, we are permitted to be non-idle 2535 * during a system-idle state. This must be the case, because 2536 * the timekeeping CPU has to take scheduling-clock interrupts 2537 * during the time that the system is transitioning to full 2538 * system-idle state. This means that the timekeeping CPU must 2539 * invoke rcu_sysidle_force_exit() directly if it does anything 2540 * more than take a scheduling-clock interrupt. 2541 */ 2542 if (smp_processor_id() == tick_do_timer_cpu) 2543 return; 2544 2545 /* Update system-idle state: We are clearly no longer fully idle! */ 2546 rcu_sysidle_force_exit(); 2547 } 2548 2549 /* 2550 * Check to see if the current CPU is idle. Note that usermode execution 2551 * does not count as idle. The caller must have disabled interrupts. 2552 */ 2553 static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, 2554 unsigned long *maxj) 2555 { 2556 int cur; 2557 unsigned long j; 2558 struct rcu_dynticks *rdtp = rdp->dynticks; 2559 2560 /* 2561 * If some other CPU has already reported non-idle, if this is 2562 * not the flavor of RCU that tracks sysidle state, or if this 2563 * is an offline or the timekeeping CPU, nothing to do. 2564 */ 2565 if (!*isidle || rdp->rsp != rcu_sysidle_state || 2566 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) 2567 return; 2568 if (rcu_gp_in_progress(rdp->rsp)) 2569 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); 2570 2571 /* Pick up current idle and NMI-nesting counter and check. */ 2572 cur = atomic_read(&rdtp->dynticks_idle); 2573 if (cur & 0x1) { 2574 *isidle = false; /* We are not idle! */ 2575 return; 2576 } 2577 smp_mb(); /* Read counters before timestamps. */ 2578 2579 /* Pick up timestamps. */ 2580 j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); 2581 /* If this CPU entered idle more recently, update maxj timestamp. */ 2582 if (ULONG_CMP_LT(*maxj, j)) 2583 *maxj = j; 2584 } 2585 2586 /* 2587 * Is this the flavor of RCU that is handling full-system idle? 2588 */ 2589 static bool is_sysidle_rcu_state(struct rcu_state *rsp) 2590 { 2591 return rsp == rcu_sysidle_state; 2592 } 2593 2594 /* 2595 * Bind the grace-period kthread for the sysidle flavor of RCU to the 2596 * timekeeping CPU. 2597 */ 2598 static void rcu_bind_gp_kthread(void) 2599 { 2600 int cpu = ACCESS_ONCE(tick_do_timer_cpu); 2601 2602 if (cpu < 0 || cpu >= nr_cpu_ids) 2603 return; 2604 if (raw_smp_processor_id() != cpu) 2605 set_cpus_allowed_ptr(current, cpumask_of(cpu)); 2606 } 2607 2608 /* 2609 * Return a delay in jiffies based on the number of CPUs, rcu_node 2610 * leaf fanout, and jiffies tick rate. The idea is to allow larger 2611 * systems more time to transition to full-idle state in order to 2612 * avoid the cache thrashing that would otherwise occur on the state variable. 2613 * Really small systems (fewer than a couple of tens of CPUs) should 2614 * instead use a single global atomically incremented counter, and later 2615 * versions of this will automatically reconfigure themselves accordingly. 2616 */ 2617 static unsigned long rcu_sysidle_delay(void) 2618 { 2619 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) 2620 return 0; 2621 return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); 2622 } 2623 2624 /* 2625 * Advance the full-system-idle state. This is invoked when all of 2626 * the non-timekeeping CPUs are idle.
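 *
 * The resulting progression, together with rcu_sysidle_cancel() and
 * rcu_sys_is_idle(), is roughly:
 *
 *	RCU_SYSIDLE_NOT   -> RCU_SYSIDLE_SHORT  (first scan with all CPUs idle)
 *	RCU_SYSIDLE_SHORT -> RCU_SYSIDLE_LONG   (still idle after rcu_sysidle_delay())
 *	RCU_SYSIDLE_LONG  -> RCU_SYSIDLE_FULL   (idle for another such delay)
 *	RCU_SYSIDLE_FULL  -> RCU_SYSIDLE_FULL_NOTED  (noted by rcu_sys_is_idle())
 *	any state         -> RCU_SYSIDLE_NOT    (some CPU found to be non-idle)
 *
 * As a worked example of the delay, with hypothetical values
 * nr_cpu_ids = 4096, HZ = 1000, and rcu_fanout_leaf = 16,
 * rcu_sysidle_delay() returns DIV_ROUND_UP(4096 * 1000, 16 * 1000),
 * that is, 256 jiffies.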
2627 */ 2628 static void rcu_sysidle(unsigned long j) 2629 { 2630 /* Check the current state. */ 2631 switch (ACCESS_ONCE(full_sysidle_state)) { 2632 case RCU_SYSIDLE_NOT: 2633 2634 /* First time all are idle, so note a short idle period. */ 2635 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; 2636 break; 2637 2638 case RCU_SYSIDLE_SHORT: 2639 2640 /* 2641 * Idle for a bit, time to advance to next state? 2642 * cmpxchg failure means race with non-idle, let them win. 2643 */ 2644 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) 2645 (void)cmpxchg(&full_sysidle_state, 2646 RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); 2647 break; 2648 2649 case RCU_SYSIDLE_LONG: 2650 2651 /* 2652 * Do an additional check pass before advancing to full. 2653 * cmpxchg failure means race with non-idle, let them win. 2654 */ 2655 if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) 2656 (void)cmpxchg(&full_sysidle_state, 2657 RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); 2658 break; 2659 2660 default: 2661 break; 2662 } 2663 } 2664 2665 /* 2666 * Found a non-idle non-timekeeping CPU, so kick the system-idle state 2667 * back to the beginning. 2668 */ 2669 static void rcu_sysidle_cancel(void) 2670 { 2671 smp_mb(); 2672 ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; 2673 } 2674 2675 /* 2676 * Update the sysidle state based on the results of a force-quiescent-state 2677 * scan of the CPUs' dyntick-idle state. 2678 */ 2679 static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, 2680 unsigned long maxj, bool gpkt) 2681 { 2682 if (rsp != rcu_sysidle_state) 2683 return; /* Wrong flavor, ignore. */ 2684 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) 2685 return; /* Running state machine from timekeeping CPU. */ 2686 if (isidle) 2687 rcu_sysidle(maxj); /* More idle! */ 2688 else 2689 rcu_sysidle_cancel(); /* Idle is over. */ 2690 } 2691 2692 /* 2693 * Wrapper for rcu_sysidle_report() when called from the grace-period 2694 * kthread's context. 2695 */ 2696 static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 2697 unsigned long maxj) 2698 { 2699 rcu_sysidle_report(rsp, isidle, maxj, true); 2700 } 2701 2702 /* Callback and function for forcing an RCU grace period. */ 2703 struct rcu_sysidle_head { 2704 struct rcu_head rh; 2705 int inuse; 2706 }; 2707 2708 static void rcu_sysidle_cb(struct rcu_head *rhp) 2709 { 2710 struct rcu_sysidle_head *rshp; 2711 2712 /* 2713 * The following memory barrier is needed to replace the 2714 * memory barriers that would normally be in the memory 2715 * allocator. 2716 */ 2717 smp_mb(); /* grace period precedes setting inuse. */ 2718 2719 rshp = container_of(rhp, struct rcu_sysidle_head, rh); 2720 ACCESS_ONCE(rshp->inuse) = 0; 2721 } 2722 2723 /* 2724 * Check to see if the system is fully idle, other than the timekeeping CPU. 2725 * The caller must have disabled interrupts. 2726 */ 2727 bool rcu_sys_is_idle(void) 2728 { 2729 static struct rcu_sysidle_head rsh; 2730 int rss = ACCESS_ONCE(full_sysidle_state); 2731 2732 if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) 2733 return false; 2734 2735 /* Handle small-system case by doing a full scan of CPUs. */ 2736 if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { 2737 int oldrss = rss - 1; 2738 2739 /* 2740 * One pass to advance to each state up to _FULL. 2741 * Give up if any pass fails to advance the state. 
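 * (On such small systems rcu_sysidle_delay() returns 0, so starting
 * from RCU_SYSIDLE_NOT at most three passes are needed to walk the
 * state up through _SHORT and _LONG to _FULL, each pass scanning every
 * possible CPU.)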
2742 */ 2743 while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { 2744 int cpu; 2745 bool isidle = true; 2746 unsigned long maxj = jiffies - ULONG_MAX / 4; 2747 struct rcu_data *rdp; 2748 2749 /* Scan all the CPUs looking for nonidle CPUs. */ 2750 for_each_possible_cpu(cpu) { 2751 rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); 2752 rcu_sysidle_check_cpu(rdp, &isidle, &maxj); 2753 if (!isidle) 2754 break; 2755 } 2756 rcu_sysidle_report(rcu_sysidle_state, 2757 isidle, maxj, false); 2758 oldrss = rss; 2759 rss = ACCESS_ONCE(full_sysidle_state); 2760 } 2761 } 2762 2763 /* If this is the first observation of an idle period, record it. */ 2764 if (rss == RCU_SYSIDLE_FULL) { 2765 rss = cmpxchg(&full_sysidle_state, 2766 RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); 2767 return rss == RCU_SYSIDLE_FULL; 2768 } 2769 2770 smp_mb(); /* ensure rss load happens before later caller actions. */ 2771 2772 /* If already fully idle, tell the caller (in case of races). */ 2773 if (rss == RCU_SYSIDLE_FULL_NOTED) 2774 return true; 2775 2776 /* 2777 * If we aren't there yet, and a grace period is not in flight, 2778 * initiate a grace period. Either way, tell the caller that 2779 * we are not there yet. We use an xchg() rather than an assignment 2780 * to make up for the memory barriers that would otherwise be 2781 * provided by the memory allocator. 2782 */ 2783 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && 2784 !rcu_gp_in_progress(rcu_sysidle_state) && 2785 !rsh.inuse && xchg(&rsh.inuse, 1) == 0) 2786 call_rcu(&rsh.rh, rcu_sysidle_cb); 2787 return false; 2788 } 2789 2790 /* 2791 * Initialize dynticks sysidle state for CPUs coming online. 2792 */ 2793 static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) 2794 { 2795 rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; 2796 } 2797 2798 #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 2799 2800 static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) 2801 { 2802 } 2803 2804 static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 2805 { 2806 } 2807 2808 static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, 2809 unsigned long *maxj) 2810 { 2811 } 2812 2813 static bool is_sysidle_rcu_state(struct rcu_state *rsp) 2814 { 2815 return false; 2816 } 2817 2818 static void rcu_bind_gp_kthread(void) 2819 { 2820 } 2821 2822 static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 2823 unsigned long maxj) 2824 { 2825 } 2826 2827 static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) 2828 { 2829 } 2830 2831 #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 2832