/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
 * Internal non-public definitions that provide either classic
 * or preemptible semantics.
 *
 * Copyright Red Hat, 2009
 * Copyright IBM Corporation, 2009
 *
 * Author: Ingo Molnar <mingo@elte.hu>
 *	   Paul E. McKenney <paulmck@linux.ibm.com>
 */

#include "../locking/rtmutex_common.h"

#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */
static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
{
	return lockdep_is_held(&rdp->nocb_lock);
}

static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
{
	/* Race on early boot between thread creation and assignment */
	if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
		return true;

	if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
		if (in_task())
			return true;
	return false;
}

#else
static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
{
	return 0;
}

static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
{
	return false;
}

#endif /* #ifdef CONFIG_RCU_NOCB_CPU */

static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
{
	/*
	 * To read the offloaded state of an rdp in a safe and stable way
	 * and prevent its value from changing under us, we must either
	 * hold the barrier mutex, the cpu hotplug lock (read or write)
	 * or the nocb lock.  Local non-preemptible reads are also safe.
	 * NOCB kthreads and timers have their own means of
	 * synchronization against the offloaded state updaters.
	 */
	RCU_LOCKDEP_WARN(
		!(lockdep_is_held(&rcu_state.barrier_mutex) ||
		  (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
		  rcu_lockdep_is_held_nocb(rdp) ||
		  (rdp == this_cpu_ptr(&rcu_data) &&
		   !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
		  rcu_current_is_nocb_kthread(rdp)),
		"Unsafe read of RCU_NOCB offloaded state"
	);

	return rcu_segcblist_is_offloaded(&rdp->cblist);
}
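
/*
 * Illustrative sketch (not part of this file's code): one way for an
 * outside caller to satisfy the lockdep condition above is to hold the
 * barrier mutex across the check, for example:
 *
 *	mutex_lock(&rcu_state.barrier_mutex);
 *	offloaded = rcu_rdp_is_offloaded(rdp);
 *	mutex_unlock(&rcu_state.barrier_mutex);
 *
 * A CPU may also read its own rcu_data's state with preemption disabled,
 * as may the NOCB kthreads and timers mentioned above.
 */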

/*
 * Check the RCU kernel configuration parameters and print informative
 * messages about anything out of the ordinary.
 */
static void __init rcu_bootup_announce_oddness(void)
{
	if (IS_ENABLED(CONFIG_RCU_TRACE))
		pr_info("\tRCU event tracing is enabled.\n");
	if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
	    (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
		pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n",
			RCU_FANOUT);
	if (rcu_fanout_exact)
		pr_info("\tHierarchical RCU autobalancing is disabled.\n");
	if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
		pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
	if (IS_ENABLED(CONFIG_PROVE_RCU))
		pr_info("\tRCU lockdep checking is enabled.\n");
	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
		pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n");
	if (RCU_NUM_LVLS >= 4)
		pr_info("\tFour(or more)-level hierarchy is enabled.\n");
	if (RCU_FANOUT_LEAF != 16)
		pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
			RCU_FANOUT_LEAF);
	if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
		pr_info("\tBoot-time adjustment of leaf fanout to %d.\n",
			rcu_fanout_leaf);
	if (nr_cpu_ids != NR_CPUS)
		pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_BOOST
	pr_info("\tRCU priority boosting: priority %d delay %d ms.\n",
		kthread_prio, CONFIG_RCU_BOOST_DELAY);
#endif
	if (blimit != DEFAULT_RCU_BLIMIT)
		pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
	if (qhimark != DEFAULT_RCU_QHIMARK)
		pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
	if (qlowmark != DEFAULT_RCU_QLOMARK)
		pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
	if (qovld != DEFAULT_RCU_QOVLD)
		pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld);
	if (jiffies_till_first_fqs != ULONG_MAX)
		pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
	if (jiffies_till_next_fqs != ULONG_MAX)
		pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
	if (jiffies_till_sched_qs != ULONG_MAX)
		pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n", jiffies_till_sched_qs);
	if (rcu_kick_kthreads)
		pr_info("\tKick kthreads if too-long grace period.\n");
	if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
		pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
	if (gp_preinit_delay)
		pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
	if (gp_init_delay)
		pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
	if (gp_cleanup_delay)
		pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
	if (!use_softirq)
		pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
		pr_info("\tRCU debug extended QS entry/exit.\n");
	rcupdate_announce_bootup_oddness();
}

#ifdef CONFIG_PREEMPT_RCU

static void rcu_report_exp_rnp(struct rcu_node *rnp, bool wake);
static void rcu_read_unlock_special(struct task_struct *t);

/*
 * Tell them what RCU they are running.
 */
static void __init rcu_bootup_announce(void)
{
	pr_info("Preemptible hierarchical RCU implementation.\n");
	rcu_bootup_announce_oddness();
}

/* Flags for rcu_preempt_ctxt_queue() decision table. */
#define RCU_GP_TASKS	0x8
#define RCU_EXP_TASKS	0x4
#define RCU_GP_BLKD	0x2
#define RCU_EXP_BLKD	0x1

/*
 * Queues a task preempted within an RCU-preempt read-side critical
 * section into the appropriate location within the ->blkd_tasks list,
 * depending on the states of any ongoing normal and expedited grace
 * periods.  The ->gp_tasks pointer indicates which element the normal
 * grace period is waiting on (NULL if none), and the ->exp_tasks pointer
 * indicates which element the expedited grace period is waiting on (again,
 * NULL if none).  If a grace period is waiting on a given element in the
 * ->blkd_tasks list, it also waits on all subsequent elements.  Thus,
 * adding a task to the tail of the list blocks any grace period that is
 * already waiting on one of the elements.  In contrast, adding a task
 * to the head of the list won't block any grace period that is already
 * waiting on one of the elements.
 *
 * This queuing is imprecise, and can sometimes make an ongoing grace
 * period wait for a task that is not strictly speaking blocking it.
 * Given the choice, we needlessly block a normal grace period rather than
 * blocking an expedited grace period.
 *
 * Note that an endless sequence of expedited grace periods still cannot
 * indefinitely postpone a normal grace period.  Eventually, all of the
 * fixed number of preempted tasks blocking the normal grace period that are
 * not also blocking the expedited grace period will resume and complete
 * their RCU read-side critical sections.  At that point, the ->gp_tasks
 * pointer will equal the ->exp_tasks pointer, at which point the end of
 * the corresponding expedited grace period will also be the end of the
 * normal grace period.
 */
static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
	__releases(rnp->lock) /* But leaves rrupts disabled. */
{
	int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
			 (rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
			 (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) +
			 (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
	struct task_struct *t = current;

	raw_lockdep_assert_held_rcu_node(rnp);
	WARN_ON_ONCE(rdp->mynode != rnp);
	WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
	/* RCU better not be waiting on newly onlined CPUs! */
	WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask &
		     rdp->grpmask);

	/*
	 * Decide where to queue the newly blocked task.  In theory,
	 * this could be an if-statement.  In practice, when I tried
	 * that, it was quite messy.
	 */
	switch (blkd_state) {
	case 0:
	case RCU_EXP_TASKS:
	case RCU_EXP_TASKS + RCU_GP_BLKD:
	case RCU_GP_TASKS:
	case RCU_GP_TASKS + RCU_EXP_TASKS:

		/*
		 * Blocking neither GP, or first task blocking the normal
		 * GP but not blocking the already-waiting expedited GP.
		 * Queue at the head of the list to avoid unnecessarily
		 * blocking the already-waiting GPs.
		 */
219 */ 220 list_add(&t->rcu_node_entry, &rnp->blkd_tasks); 221 break; 222 223 case RCU_EXP_BLKD: 224 case RCU_GP_BLKD: 225 case RCU_GP_BLKD + RCU_EXP_BLKD: 226 case RCU_GP_TASKS + RCU_EXP_BLKD: 227 case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: 228 case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: 229 230 /* 231 * First task arriving that blocks either GP, or first task 232 * arriving that blocks the expedited GP (with the normal 233 * GP already waiting), or a task arriving that blocks 234 * both GPs with both GPs already waiting. Queue at the 235 * tail of the list to avoid any GP waiting on any of the 236 * already queued tasks that are not blocking it. 237 */ 238 list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks); 239 break; 240 241 case RCU_EXP_TASKS + RCU_EXP_BLKD: 242 case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: 243 case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: 244 245 /* 246 * Second or subsequent task blocking the expedited GP. 247 * The task either does not block the normal GP, or is the 248 * first task blocking the normal GP. Queue just after 249 * the first task blocking the expedited GP. 250 */ 251 list_add(&t->rcu_node_entry, rnp->exp_tasks); 252 break; 253 254 case RCU_GP_TASKS + RCU_GP_BLKD: 255 case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: 256 257 /* 258 * Second or subsequent task blocking the normal GP. 259 * The task does not block the expedited GP. Queue just 260 * after the first task blocking the normal GP. 261 */ 262 list_add(&t->rcu_node_entry, rnp->gp_tasks); 263 break; 264 265 default: 266 267 /* Yet another exercise in excessive paranoia. */ 268 WARN_ON_ONCE(1); 269 break; 270 } 271 272 /* 273 * We have now queued the task. If it was the first one to 274 * block either grace period, update the ->gp_tasks and/or 275 * ->exp_tasks pointers, respectively, to reference the newly 276 * blocked tasks. 277 */ 278 if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) { 279 WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry); 280 WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); 281 } 282 if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) 283 WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry); 284 WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != 285 !(rnp->qsmask & rdp->grpmask)); 286 WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != 287 !(rnp->expmask & rdp->grpmask)); 288 raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ 289 290 /* 291 * Report the quiescent state for the expedited GP. This expedited 292 * GP should not be able to end until we report, so there should be 293 * no need to check for a subsequent expedited GP. (Though we are 294 * still in a quiescent state in any case.) 295 */ 296 if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs) 297 rcu_report_exp_rdp(rdp); 298 else 299 WARN_ON_ONCE(rdp->exp_deferred_qs); 300 } 301 302 /* 303 * Record a preemptible-RCU quiescent state for the specified CPU. 304 * Note that this does not necessarily mean that the task currently running 305 * on the CPU is in a quiescent state: Instead, it means that the current 306 * grace period need not wait on any RCU read-side critical section that 307 * starts later on this CPU. It also means that if the current task is 308 * in an RCU read-side critical section, it has already added itself to 309 * some leaf rcu_node structure's ->blkd_tasks list. In addition to the 310 * current task, there might be any number of other tasks blocked while 311 * in an RCU read-side critical section. 312 * 313 * Callers to this function must disable preemption. 
314 */ 315 static void rcu_qs(void) 316 { 317 RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n"); 318 if (__this_cpu_read(rcu_data.cpu_no_qs.s)) { 319 trace_rcu_grace_period(TPS("rcu_preempt"), 320 __this_cpu_read(rcu_data.gp_seq), 321 TPS("cpuqs")); 322 __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); 323 barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ 324 WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false); 325 } 326 } 327 328 /* 329 * We have entered the scheduler, and the current task might soon be 330 * context-switched away from. If this task is in an RCU read-side 331 * critical section, we will no longer be able to rely on the CPU to 332 * record that fact, so we enqueue the task on the blkd_tasks list. 333 * The task will dequeue itself when it exits the outermost enclosing 334 * RCU read-side critical section. Therefore, the current grace period 335 * cannot be permitted to complete until the blkd_tasks list entries 336 * predating the current grace period drain, in other words, until 337 * rnp->gp_tasks becomes NULL. 338 * 339 * Caller must disable interrupts. 340 */ 341 void rcu_note_context_switch(bool preempt) 342 { 343 struct task_struct *t = current; 344 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 345 struct rcu_node *rnp; 346 347 trace_rcu_utilization(TPS("Start context switch")); 348 lockdep_assert_irqs_disabled(); 349 WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0); 350 if (rcu_preempt_depth() > 0 && 351 !t->rcu_read_unlock_special.b.blocked) { 352 353 /* Possibly blocking in an RCU read-side critical section. */ 354 rnp = rdp->mynode; 355 raw_spin_lock_rcu_node(rnp); 356 t->rcu_read_unlock_special.b.blocked = true; 357 t->rcu_blocked_node = rnp; 358 359 /* 360 * Verify the CPU's sanity, trace the preemption, and 361 * then queue the task as required based on the states 362 * of any ongoing and expedited grace periods. 363 */ 364 WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); 365 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 366 trace_rcu_preempt_task(rcu_state.name, 367 t->pid, 368 (rnp->qsmask & rdp->grpmask) 369 ? rnp->gp_seq 370 : rcu_seq_snap(&rnp->gp_seq)); 371 rcu_preempt_ctxt_queue(rnp, rdp); 372 } else { 373 rcu_preempt_deferred_qs(t); 374 } 375 376 /* 377 * Either we were not in an RCU read-side critical section to 378 * begin with, or we have now recorded that critical section 379 * globally. Either way, we can now note a quiescent state 380 * for this CPU. Again, if we were in an RCU read-side critical 381 * section, and if that critical section was blocking the current 382 * grace period, then the fact that the task has been enqueued 383 * means that we continue to block the current grace period. 384 */ 385 rcu_qs(); 386 if (rdp->exp_deferred_qs) 387 rcu_report_exp_rdp(rdp); 388 rcu_tasks_qs(current, preempt); 389 trace_rcu_utilization(TPS("End context switch")); 390 } 391 EXPORT_SYMBOL_GPL(rcu_note_context_switch); 392 393 /* 394 * Check for preempted RCU readers blocking the current grace period 395 * for the specified rcu_node structure. If the caller needs a reliable 396 * answer, it must hold the rcu_node's ->lock. 397 */ 398 static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) 399 { 400 return READ_ONCE(rnp->gp_tasks) != NULL; 401 } 402 403 /* limit value for ->rcu_read_lock_nesting. 
#define RCU_NEST_PMAX (INT_MAX / 2)

static void rcu_preempt_read_enter(void)
{
	current->rcu_read_lock_nesting++;
}

static int rcu_preempt_read_exit(void)
{
	return --current->rcu_read_lock_nesting;
}

static void rcu_preempt_depth_set(int val)
{
	current->rcu_read_lock_nesting = val;
}

/*
 * Preemptible RCU implementation for rcu_read_lock().
 * Just increment ->rcu_read_lock_nesting, shared state will be updated
 * if we block.
 */
void __rcu_read_lock(void)
{
	rcu_preempt_read_enter();
	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
		WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX);
	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread)
		WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true);
	barrier();  /* critical section after entry code. */
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);

/*
 * Preemptible RCU implementation for rcu_read_unlock().
 * Decrement ->rcu_read_lock_nesting.  If the result is zero (outermost
 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
 * invoke rcu_read_unlock_special() to clean up after a context switch
 * in an RCU read-side critical section and other special cases.
 */
void __rcu_read_unlock(void)
{
	struct task_struct *t = current;

	barrier();  // critical section before exit code.
	if (rcu_preempt_read_exit() == 0) {
		barrier();  // critical-section exit before .s check.
		if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
			rcu_read_unlock_special(t);
	}
	if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
		int rrln = rcu_preempt_depth();

		WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX);
	}
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);

/*
 * Advance a ->blkd_tasks-list pointer to the next entry, instead
 * returning NULL if at the end of the list.
 */
static struct list_head *rcu_next_node_entry(struct task_struct *t,
					     struct rcu_node *rnp)
{
	struct list_head *np;

	np = t->rcu_node_entry.next;
	if (np == &rnp->blkd_tasks)
		np = NULL;
	return np;
}

/*
 * Return true if the specified rcu_node structure has tasks that were
 * preempted within an RCU read-side critical section.
 */
static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
{
	return !list_empty(&rnp->blkd_tasks);
}
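
/*
 * Minimal usage sketch for the read-side primitives above (illustrative
 * only; rcu_read_lock() and rcu_read_unlock() map onto __rcu_read_lock()
 * and __rcu_read_unlock() in the preemptible-RCU configuration):
 *
 *	rcu_read_lock();
 *	rcu_read_lock();	// nesting depth 2
 *	rcu_read_unlock();	// depth back to 1, nothing special
 *	rcu_read_unlock();	// outermost: ->rcu_read_unlock_special
 *				// is checked and, if set, handled
 *
 * Only the outermost rcu_read_unlock() can end up in
 * rcu_read_unlock_special().
 */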
507 */ 508 special = t->rcu_read_unlock_special; 509 rdp = this_cpu_ptr(&rcu_data); 510 if (!special.s && !rdp->exp_deferred_qs) { 511 local_irq_restore(flags); 512 return; 513 } 514 t->rcu_read_unlock_special.s = 0; 515 if (special.b.need_qs) { 516 if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { 517 rcu_report_qs_rdp(rdp); 518 udelay(rcu_unlock_delay); 519 } else { 520 rcu_qs(); 521 } 522 } 523 524 /* 525 * Respond to a request by an expedited grace period for a 526 * quiescent state from this CPU. Note that requests from 527 * tasks are handled when removing the task from the 528 * blocked-tasks list below. 529 */ 530 if (rdp->exp_deferred_qs) 531 rcu_report_exp_rdp(rdp); 532 533 /* Clean up if blocked during RCU read-side critical section. */ 534 if (special.b.blocked) { 535 536 /* 537 * Remove this task from the list it blocked on. The task 538 * now remains queued on the rcu_node corresponding to the 539 * CPU it first blocked on, so there is no longer any need 540 * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia. 541 */ 542 rnp = t->rcu_blocked_node; 543 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 544 WARN_ON_ONCE(rnp != t->rcu_blocked_node); 545 WARN_ON_ONCE(!rcu_is_leaf_node(rnp)); 546 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); 547 WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && 548 (!empty_norm || rnp->qsmask)); 549 empty_exp = sync_rcu_exp_done(rnp); 550 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 551 np = rcu_next_node_entry(t, rnp); 552 list_del_init(&t->rcu_node_entry); 553 t->rcu_blocked_node = NULL; 554 trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), 555 rnp->gp_seq, t->pid); 556 if (&t->rcu_node_entry == rnp->gp_tasks) 557 WRITE_ONCE(rnp->gp_tasks, np); 558 if (&t->rcu_node_entry == rnp->exp_tasks) 559 WRITE_ONCE(rnp->exp_tasks, np); 560 if (IS_ENABLED(CONFIG_RCU_BOOST)) { 561 /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ 562 drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; 563 if (&t->rcu_node_entry == rnp->boost_tasks) 564 WRITE_ONCE(rnp->boost_tasks, np); 565 } 566 567 /* 568 * If this was the last task on the current list, and if 569 * we aren't waiting on any CPUs, report the quiescent state. 570 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 571 * so we must take a snapshot of the expedited state. 572 */ 573 empty_exp_now = sync_rcu_exp_done(rnp); 574 if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { 575 trace_rcu_quiescent_state_report(TPS("preempt_rcu"), 576 rnp->gp_seq, 577 0, rnp->qsmask, 578 rnp->level, 579 rnp->grplo, 580 rnp->grphi, 581 !!rnp->gp_tasks); 582 rcu_report_unblock_qs_rnp(rnp, flags); 583 } else { 584 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 585 } 586 587 /* Unboost if we were boosted. */ 588 if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) 589 rt_mutex_futex_unlock(&rnp->boost_mtx); 590 591 /* 592 * If this was the last task on the expedited lists, 593 * then we need to report up the rcu_node hierarchy. 594 */ 595 if (!empty_exp && empty_exp_now) 596 rcu_report_exp_rnp(rnp, true); 597 } else { 598 local_irq_restore(flags); 599 } 600 } 601 602 /* 603 * Is a deferred quiescent-state pending, and are we also not in 604 * an RCU read-side critical section? It is the caller's responsibility 605 * to ensure it is otherwise safe to report any deferred quiescent 606 * states. The reason for this is that it is safe to report a 607 * quiescent state during context switch even though preemption 608 * is disabled. 

/*
 * Is a deferred quiescent-state pending, and are we also not in
 * an RCU read-side critical section?  It is the caller's responsibility
 * to ensure it is otherwise safe to report any deferred quiescent
 * states.  The reason for this is that it is safe to report a
 * quiescent state during context switch even though preemption
 * is disabled.  This function cannot be expected to understand these
 * nuances, so the caller must handle them.
 */
static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
	return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
		READ_ONCE(t->rcu_read_unlock_special.s)) &&
	       rcu_preempt_depth() == 0;
}

/*
 * Report a deferred quiescent state if needed and safe to do so.
 * As with rcu_preempt_need_deferred_qs(), "safe" involves only
 * not being in an RCU read-side critical section.  The caller must
 * evaluate safety in terms of interrupt, softirq, and preemption
 * disabling.
 */
static void rcu_preempt_deferred_qs(struct task_struct *t)
{
	unsigned long flags;

	if (!rcu_preempt_need_deferred_qs(t))
		return;
	local_irq_save(flags);
	rcu_preempt_deferred_qs_irqrestore(t, flags);
}

/*
 * Minimal handler to give the scheduler a chance to re-evaluate.
 */
static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
{
	struct rcu_data *rdp;

	rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
	rdp->defer_qs_iw_pending = false;
}

/*
 * Handle special cases during rcu_read_unlock(), such as needing to
 * notify RCU core processing or task having blocked during the RCU
 * read-side critical section.
 */
static void rcu_read_unlock_special(struct task_struct *t)
{
	unsigned long flags;
	bool irqs_were_disabled;
	bool preempt_bh_were_disabled =
			!!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));

	/* NMI handlers cannot block and cannot safely manipulate state. */
	if (in_nmi())
		return;

	local_irq_save(flags);
	irqs_were_disabled = irqs_disabled_flags(flags);
	if (preempt_bh_were_disabled || irqs_were_disabled) {
		bool expboost;	// Expedited GP in flight or possible boosting.
		struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
		struct rcu_node *rnp = rdp->mynode;

		expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
			   (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
			   IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ||
			   (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
			    t->rcu_blocked_node);
		// Need to defer quiescent state until everything is enabled.
		if (use_softirq && (in_irq() || (expboost && !irqs_were_disabled))) {
			// Using softirq, safe to awaken, and either the
			// wakeup is free or there is either an expedited
			// GP in flight or a potential need to deboost.
			raise_softirq_irqoff(RCU_SOFTIRQ);
		} else {
			// Enabling BH or preempt does reschedule, so...
			// Also if no expediting and no possible deboosting,
			// slow is OK.  Plus nohz_full CPUs eventually get
			// tick enabled.
			set_tsk_need_resched(current);
			set_preempt_need_resched();
			if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
			    expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
				// Get scheduler to re-evaluate and call hooks.
				// If !IRQ_WORK, FQS scan will eventually IPI.
				init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler);
				rdp->defer_qs_iw_pending = true;
				irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
			}
		}
		local_irq_restore(flags);
		return;
	}
	rcu_preempt_deferred_qs_irqrestore(t, flags);
}
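
/*
 * Summary of the paths above (descriptive only): when invoked with BH,
 * preemption, or interrupts disabled, the quiescent state cannot be
 * reported immediately, so rcu_read_unlock_special() instead raises
 * RCU_SOFTIRQ, sets need-resched, and/or queues irq_work on this CPU so
 * that RCU's hooks run again once everything is re-enabled.  Otherwise
 * it reports the deferred quiescent state directly via
 * rcu_preempt_deferred_qs_irqrestore().
 */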

/*
 * Check that the list of blocked tasks for the newly completed grace
 * period is in fact empty.  It is a serious bug to complete a grace
 * period that still has RCU readers blocked!  This function must be
 * invoked -before- updating this rnp's ->gp_seq.
 *
 * Also, if there are blocked tasks on the list, they automatically
 * block the newly created grace period, so set up ->gp_tasks accordingly.
 */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
	struct task_struct *t;

	RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
	raw_lockdep_assert_held_rcu_node(rnp);
	if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
		dump_blkd_tasks(rnp, 10);
	if (rcu_preempt_has_tasks(rnp) &&
	    (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
		WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
		t = container_of(rnp->gp_tasks, struct task_struct,
				 rcu_node_entry);
		trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
						rnp->gp_seq, t->pid);
	}
	WARN_ON_ONCE(rnp->qsmask);
}

/*
 * Check for a quiescent state from the current CPU, including voluntary
 * context switches for Tasks RCU.  When a task blocks, the task is
 * recorded in the corresponding CPU's rcu_node structure, which is checked
 * elsewhere, hence this function need only check for quiescent states
 * related to the current CPU, not to those related to tasks.
 */
static void rcu_flavor_sched_clock_irq(int user)
{
	struct task_struct *t = current;

	lockdep_assert_irqs_disabled();
	if (user || rcu_is_cpu_rrupt_from_idle()) {
		rcu_note_voluntary_context_switch(current);
	}
	if (rcu_preempt_depth() > 0 ||
	    (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
		/* No QS, force context switch if deferred. */
		if (rcu_preempt_need_deferred_qs(t)) {
			set_tsk_need_resched(t);
			set_preempt_need_resched();
		}
	} else if (rcu_preempt_need_deferred_qs(t)) {
		rcu_preempt_deferred_qs(t); /* Report deferred QS. */
		return;
	} else if (!WARN_ON_ONCE(rcu_preempt_depth())) {
		rcu_qs(); /* Report immediate QS. */
		return;
	}

	/* If GP is oldish, ask for help from rcu_read_unlock_special(). */
	if (rcu_preempt_depth() > 0 &&
	    __this_cpu_read(rcu_data.core_needs_qs) &&
	    __this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
	    !t->rcu_read_unlock_special.b.need_qs &&
	    time_after(jiffies, rcu_state.gp_start + HZ))
		t->rcu_read_unlock_special.b.need_qs = true;
}

/*
 * Check for a task exiting while in a preemptible-RCU read-side
 * critical section, clean up if so.  No need to issue warnings, as
 * debug_check_no_locks_held() already does this if lockdep is enabled.
 * Besides, if this function does anything other than just immediately
 * return, there was a bug of some sort.  Spewing warnings from this
 * function is like as not to simply obscure important prior warnings.
 */
void exit_rcu(void)
{
	struct task_struct *t = current;

	if (unlikely(!list_empty(&current->rcu_node_entry))) {
		rcu_preempt_depth_set(1);
		barrier();
		WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
	} else if (unlikely(rcu_preempt_depth())) {
		rcu_preempt_depth_set(1);
	} else {
		return;
	}
	__rcu_read_unlock();
	rcu_preempt_deferred_qs(current);
}

/*
 * Dump the blocked-tasks state, but limit the list dump to the
 * specified number of elements.
 */
797 */ 798 static void 799 dump_blkd_tasks(struct rcu_node *rnp, int ncheck) 800 { 801 int cpu; 802 int i; 803 struct list_head *lhp; 804 bool onl; 805 struct rcu_data *rdp; 806 struct rcu_node *rnp1; 807 808 raw_lockdep_assert_held_rcu_node(rnp); 809 pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", 810 __func__, rnp->grplo, rnp->grphi, rnp->level, 811 (long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs); 812 for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent) 813 pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", 814 __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); 815 pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", 816 __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks), 817 READ_ONCE(rnp->exp_tasks)); 818 pr_info("%s: ->blkd_tasks", __func__); 819 i = 0; 820 list_for_each(lhp, &rnp->blkd_tasks) { 821 pr_cont(" %p", lhp); 822 if (++i >= ncheck) 823 break; 824 } 825 pr_cont("\n"); 826 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { 827 rdp = per_cpu_ptr(&rcu_data, cpu); 828 onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp)); 829 pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n", 830 cpu, ".o"[onl], 831 (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags, 832 (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags); 833 } 834 } 835 836 #else /* #ifdef CONFIG_PREEMPT_RCU */ 837 838 /* 839 * If strict grace periods are enabled, and if the calling 840 * __rcu_read_unlock() marks the beginning of a quiescent state, immediately 841 * report that quiescent state and, if requested, spin for a bit. 842 */ 843 void rcu_read_unlock_strict(void) 844 { 845 struct rcu_data *rdp; 846 847 if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || 848 irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) 849 return; 850 rdp = this_cpu_ptr(&rcu_data); 851 rcu_report_qs_rdp(rdp); 852 udelay(rcu_unlock_delay); 853 } 854 EXPORT_SYMBOL_GPL(rcu_read_unlock_strict); 855 856 /* 857 * Tell them what RCU they are running. 858 */ 859 static void __init rcu_bootup_announce(void) 860 { 861 pr_info("Hierarchical RCU implementation.\n"); 862 rcu_bootup_announce_oddness(); 863 } 864 865 /* 866 * Note a quiescent state for PREEMPTION=n. Because we do not need to know 867 * how many quiescent states passed, just if there was at least one since 868 * the start of the grace period, this just sets a flag. The caller must 869 * have disabled preemption. 870 */ 871 static void rcu_qs(void) 872 { 873 RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!"); 874 if (!__this_cpu_read(rcu_data.cpu_no_qs.s)) 875 return; 876 trace_rcu_grace_period(TPS("rcu_sched"), 877 __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); 878 __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); 879 if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) 880 return; 881 __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false); 882 rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); 883 } 884 885 /* 886 * Register an urgently needed quiescent state. If there is an 887 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight 888 * dyntick-idle quiescent state visible to other CPUs, which will in 889 * some cases serve for expedited as well as normal grace periods. 890 * Either way, register a lightweight quiescent state. 891 */ 892 void rcu_all_qs(void) 893 { 894 unsigned long flags; 895 896 if (!raw_cpu_read(rcu_data.rcu_urgent_qs)) 897 return; 898 preempt_disable(); 899 /* Load rcu_urgent_qs before other flags. 
	if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
		preempt_enable();
		return;
	}
	this_cpu_write(rcu_data.rcu_urgent_qs, false);
	if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
		local_irq_save(flags);
		rcu_momentary_dyntick_idle();
		local_irq_restore(flags);
	}
	rcu_qs();
	preempt_enable();
}
EXPORT_SYMBOL_GPL(rcu_all_qs);

/*
 * Note a PREEMPTION=n context switch.  The caller must have disabled interrupts.
 */
void rcu_note_context_switch(bool preempt)
{
	trace_rcu_utilization(TPS("Start context switch"));
	rcu_qs();
	/* Load rcu_urgent_qs before other flags. */
	if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs)))
		goto out;
	this_cpu_write(rcu_data.rcu_urgent_qs, false);
	if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
		rcu_momentary_dyntick_idle();
	rcu_tasks_qs(current, preempt);
out:
	trace_rcu_utilization(TPS("End context switch"));
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);

/*
 * Because preemptible RCU does not exist, there are never any preempted
 * RCU readers.
 */
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
	return 0;
}

/*
 * Because there is no preemptible RCU, there can be no readers blocked.
 */
static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
{
	return false;
}

/*
 * Because there is no preemptible RCU, there can be no deferred quiescent
 * states.
 */
static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
	return false;
}
static void rcu_preempt_deferred_qs(struct task_struct *t) { }

/*
 * Because there is no preemptible RCU, there can be no readers blocked,
 * so there is no need to check for blocked tasks.  So check only for
 * bogus qsmask values.
 */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
	WARN_ON_ONCE(rnp->qsmask);
}

/*
 * Check to see if this CPU is in a non-context-switch quiescent state,
 * namely user mode and idle loop.
 */
static void rcu_flavor_sched_clock_irq(int user)
{
	if (user || rcu_is_cpu_rrupt_from_idle()) {

		/*
		 * Get here if this CPU took its interrupt from user
		 * mode or from the idle loop, and if this is not a
		 * nested interrupt.  In this case, the CPU is in
		 * a quiescent state, so note it.
		 *
		 * No memory barrier is required here because rcu_qs()
		 * references only CPU-local variables that other CPUs
		 * neither access nor modify, at least not while the
		 * corresponding CPU is online.
		 */

		rcu_qs();
	}
}

/*
 * Because preemptible RCU does not exist, tasks cannot possibly exit
 * while in preemptible RCU read-side critical sections.
 */
void exit_rcu(void)
{
}

/*
 * Dump the guaranteed-empty blocked-tasks state.  Trust but verify.
 */
static void
dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
{
	WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
}

#endif /* #else #ifdef CONFIG_PREEMPT_RCU */

/*
 * If boosting, set rcuc kthreads to realtime priority.
 */
static void rcu_cpu_kthread_setup(unsigned int cpu)
{
#ifdef CONFIG_RCU_BOOST
	struct sched_param sp;

	sp.sched_priority = kthread_prio;
	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
#endif /* #ifdef CONFIG_RCU_BOOST */
}

#ifdef CONFIG_RCU_BOOST

/*
 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
 * or ->boost_tasks, advancing the pointer to the next task in the
 * ->blkd_tasks list.
 *
 * Note that irqs must be enabled: boosting the task can block.
 * Returns 1 if there are more tasks needing to be boosted.
 */
static int rcu_boost(struct rcu_node *rnp)
{
	unsigned long flags;
	struct task_struct *t;
	struct list_head *tb;

	if (READ_ONCE(rnp->exp_tasks) == NULL &&
	    READ_ONCE(rnp->boost_tasks) == NULL)
		return 0;  /* Nothing left to boost. */

	raw_spin_lock_irqsave_rcu_node(rnp, flags);

	/*
	 * Recheck under the lock: all tasks in need of boosting
	 * might exit their RCU read-side critical sections on their own.
	 */
	if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
		return 0;
	}

	/*
	 * Preferentially boost tasks blocking expedited grace periods.
	 * This cannot starve the normal grace periods because a second
	 * expedited grace period must boost all blocked tasks, including
	 * those blocking the pre-existing normal grace period.
	 */
	if (rnp->exp_tasks != NULL)
		tb = rnp->exp_tasks;
	else
		tb = rnp->boost_tasks;

	/*
	 * We boost task t by manufacturing an rt_mutex that appears to
	 * be held by task t.  We leave a pointer to that rt_mutex where
	 * task t can find it, and task t will release the mutex when it
	 * exits its outermost RCU read-side critical section.  Then
	 * simply acquiring this artificial rt_mutex will boost task
	 * t's priority.  (Thanks to tglx for suggesting this approach!)
	 *
	 * Note that task t must acquire rnp->lock to remove itself from
	 * the ->blkd_tasks list, which it will do from exit() if from
	 * nowhere else.  We therefore are guaranteed that task t will
	 * stay around at least until we drop rnp->lock.  Note that
	 * rnp->lock also resolves races between our priority boosting
	 * and task t's exiting its outermost RCU read-side critical
	 * section.
	 */
	t = container_of(tb, struct task_struct, rcu_node_entry);
	rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	/* Lock only for side effect: boosts task t's priority. */
	rt_mutex_lock(&rnp->boost_mtx);
	rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */
	rnp->n_boosts++;

	return READ_ONCE(rnp->exp_tasks) != NULL ||
	       READ_ONCE(rnp->boost_tasks) != NULL;
}
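
/*
 * Illustrative sketch of the proxy-lock pattern used by rcu_boost()
 * above (no additional functionality):
 *
 *	rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); // t "holds" it
 *	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 *	rt_mutex_lock(&rnp->boost_mtx);			// boosts t
 *	rt_mutex_unlock(&rnp->boost_mtx);		// lockdep bookkeeping
 *
 * Task t drops the proxy mutex from the rcu_read_unlock() slow path
 * (see rcu_preempt_deferred_qs_irqrestore() above), which deboosts it.
 */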
1099 */ 1100 static int rcu_boost_kthread(void *arg) 1101 { 1102 struct rcu_node *rnp = (struct rcu_node *)arg; 1103 int spincnt = 0; 1104 int more2boost; 1105 1106 trace_rcu_utilization(TPS("Start boost kthread@init")); 1107 for (;;) { 1108 WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); 1109 trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); 1110 rcu_wait(READ_ONCE(rnp->boost_tasks) || 1111 READ_ONCE(rnp->exp_tasks)); 1112 trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); 1113 WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); 1114 more2boost = rcu_boost(rnp); 1115 if (more2boost) 1116 spincnt++; 1117 else 1118 spincnt = 0; 1119 if (spincnt > 10) { 1120 WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING); 1121 trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); 1122 schedule_timeout_idle(2); 1123 trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); 1124 spincnt = 0; 1125 } 1126 } 1127 /* NOTREACHED */ 1128 trace_rcu_utilization(TPS("End boost kthread@notreached")); 1129 return 0; 1130 } 1131 1132 /* 1133 * Check to see if it is time to start boosting RCU readers that are 1134 * blocking the current grace period, and, if so, tell the per-rcu_node 1135 * kthread to start boosting them. If there is an expedited grace 1136 * period in progress, it is always time to boost. 1137 * 1138 * The caller must hold rnp->lock, which this function releases. 1139 * The ->boost_kthread_task is immortal, so we don't need to worry 1140 * about it going away. 1141 */ 1142 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1143 __releases(rnp->lock) 1144 { 1145 raw_lockdep_assert_held_rcu_node(rnp); 1146 if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { 1147 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1148 return; 1149 } 1150 if (rnp->exp_tasks != NULL || 1151 (rnp->gp_tasks != NULL && 1152 rnp->boost_tasks == NULL && 1153 rnp->qsmask == 0 && 1154 (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) { 1155 if (rnp->exp_tasks == NULL) 1156 WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); 1157 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1158 rcu_wake_cond(rnp->boost_kthread_task, 1159 READ_ONCE(rnp->boost_kthread_status)); 1160 } else { 1161 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1162 } 1163 } 1164 1165 /* 1166 * Is the current CPU running the RCU-callbacks kthread? 1167 * Caller must have preemption disabled. 1168 */ 1169 static bool rcu_is_callbacks_kthread(void) 1170 { 1171 return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current; 1172 } 1173 1174 #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1175 1176 /* 1177 * Do priority-boost accounting for the start of a new grace period. 1178 */ 1179 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1180 { 1181 rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; 1182 } 1183 1184 /* 1185 * Create an RCU-boost kthread for the specified node if one does not 1186 * already exist. We only create this kthread for preemptible RCU. 1187 * Returns zero if all is well, a negated errno otherwise. 
1188 */ 1189 static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) 1190 { 1191 unsigned long flags; 1192 int rnp_index = rnp - rcu_get_root(); 1193 struct sched_param sp; 1194 struct task_struct *t; 1195 1196 if (rnp->boost_kthread_task || !rcu_scheduler_fully_active) 1197 return; 1198 1199 rcu_state.boost = 1; 1200 1201 t = kthread_create(rcu_boost_kthread, (void *)rnp, 1202 "rcub/%d", rnp_index); 1203 if (WARN_ON_ONCE(IS_ERR(t))) 1204 return; 1205 1206 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1207 rnp->boost_kthread_task = t; 1208 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1209 sp.sched_priority = kthread_prio; 1210 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1211 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1212 } 1213 1214 /* 1215 * Set the per-rcu_node kthread's affinity to cover all CPUs that are 1216 * served by the rcu_node in question. The CPU hotplug lock is still 1217 * held, so the value of rnp->qsmaskinit will be stable. 1218 * 1219 * We don't include outgoingcpu in the affinity set, use -1 if there is 1220 * no outgoing CPU. If there are no CPUs left in the affinity set, 1221 * this function allows the kthread to execute on any CPU. 1222 */ 1223 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1224 { 1225 struct task_struct *t = rnp->boost_kthread_task; 1226 unsigned long mask = rcu_rnp_online_cpus(rnp); 1227 cpumask_var_t cm; 1228 int cpu; 1229 1230 if (!t) 1231 return; 1232 if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) 1233 return; 1234 for_each_leaf_node_possible_cpu(rnp, cpu) 1235 if ((mask & leaf_node_cpu_bit(rnp, cpu)) && 1236 cpu != outgoingcpu) 1237 cpumask_set_cpu(cpu, cm); 1238 if (cpumask_weight(cm) == 0) 1239 cpumask_setall(cm); 1240 set_cpus_allowed_ptr(t, cm); 1241 free_cpumask_var(cm); 1242 } 1243 1244 /* 1245 * Spawn boost kthreads -- called as soon as the scheduler is running. 1246 */ 1247 static void __init rcu_spawn_boost_kthreads(void) 1248 { 1249 struct rcu_node *rnp; 1250 1251 rcu_for_each_leaf_node(rnp) 1252 if (rcu_rnp_online_cpus(rnp)) 1253 rcu_spawn_one_boost_kthread(rnp); 1254 } 1255 1256 #else /* #ifdef CONFIG_RCU_BOOST */ 1257 1258 static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1259 __releases(rnp->lock) 1260 { 1261 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1262 } 1263 1264 static bool rcu_is_callbacks_kthread(void) 1265 { 1266 return false; 1267 } 1268 1269 static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1270 { 1271 } 1272 1273 static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) 1274 { 1275 } 1276 1277 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1278 { 1279 } 1280 1281 static void __init rcu_spawn_boost_kthreads(void) 1282 { 1283 } 1284 1285 #endif /* #else #ifdef CONFIG_RCU_BOOST */ 1286 1287 #if !defined(CONFIG_RCU_FAST_NO_HZ) 1288 1289 /* 1290 * Check to see if any future non-offloaded RCU-related work will need 1291 * to be done by the current CPU, even if none need be done immediately, 1292 * returning 1 if so. This function is part of the RCU implementation; 1293 * it is -not- an exported member of the RCU API. 1294 * 1295 * Because we not have RCU_FAST_NO_HZ, just check whether or not this 1296 * CPU has RCU callbacks queued. 
1297 */ 1298 int rcu_needs_cpu(u64 basemono, u64 *nextevt) 1299 { 1300 *nextevt = KTIME_MAX; 1301 return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && 1302 !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); 1303 } 1304 1305 /* 1306 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up 1307 * after it. 1308 */ 1309 static void rcu_cleanup_after_idle(void) 1310 { 1311 } 1312 1313 /* 1314 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, 1315 * is nothing. 1316 */ 1317 static void rcu_prepare_for_idle(void) 1318 { 1319 } 1320 1321 #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1322 1323 /* 1324 * This code is invoked when a CPU goes idle, at which point we want 1325 * to have the CPU do everything required for RCU so that it can enter 1326 * the energy-efficient dyntick-idle mode. 1327 * 1328 * The following preprocessor symbol controls this: 1329 * 1330 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted 1331 * to sleep in dyntick-idle mode with RCU callbacks pending. This 1332 * is sized to be roughly one RCU grace period. Those energy-efficiency 1333 * benchmarkers who might otherwise be tempted to set this to a large 1334 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your 1335 * system. And if you are -that- concerned about energy efficiency, 1336 * just power the system down and be done with it! 1337 * 1338 * The value below works well in practice. If future workloads require 1339 * adjustment, they can be converted into kernel config parameters, though 1340 * making the state machine smarter might be a better option. 1341 */ 1342 #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ 1343 1344 static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; 1345 module_param(rcu_idle_gp_delay, int, 0644); 1346 1347 /* 1348 * Try to advance callbacks on the current CPU, but only if it has been 1349 * awhile since the last time we did so. Afterwards, if there are any 1350 * callbacks ready for immediate invocation, return true. 1351 */ 1352 static bool __maybe_unused rcu_try_advance_all_cbs(void) 1353 { 1354 bool cbs_ready = false; 1355 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 1356 struct rcu_node *rnp; 1357 1358 /* Exit early if we advanced recently. */ 1359 if (jiffies == rdp->last_advance_all) 1360 return false; 1361 rdp->last_advance_all = jiffies; 1362 1363 rnp = rdp->mynode; 1364 1365 /* 1366 * Don't bother checking unless a grace period has 1367 * completed since we last checked and there are 1368 * callbacks not yet ready to invoke. 1369 */ 1370 if ((rcu_seq_completed_gp(rdp->gp_seq, 1371 rcu_seq_current(&rnp->gp_seq)) || 1372 unlikely(READ_ONCE(rdp->gpwrap))) && 1373 rcu_segcblist_pend_cbs(&rdp->cblist)) 1374 note_gp_changes(rdp); 1375 1376 if (rcu_segcblist_ready_cbs(&rdp->cblist)) 1377 cbs_ready = true; 1378 return cbs_ready; 1379 } 1380 1381 /* 1382 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready 1383 * to invoke. If the CPU has callbacks, try to advance them. Tell the 1384 * caller about what to set the timeout. 1385 * 1386 * The caller must have disabled interrupts. 1387 */ 1388 int rcu_needs_cpu(u64 basemono, u64 *nextevt) 1389 { 1390 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); 1391 unsigned long dj; 1392 1393 lockdep_assert_irqs_disabled(); 1394 1395 /* If no non-offloaded callbacks, RCU doesn't need the CPU. 
	if (rcu_segcblist_empty(&rdp->cblist) ||
	    rcu_rdp_is_offloaded(rdp)) {
		*nextevt = KTIME_MAX;
		return 0;
	}

	/* Attempt to advance callbacks. */
	if (rcu_try_advance_all_cbs()) {
		/* Some ready to invoke, so initiate later invocation. */
		invoke_rcu_core();
		return 1;
	}
	rdp->last_accelerate = jiffies;

	/* Request timer and round. */
	dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies;

	*nextevt = basemono + dj * TICK_NSEC;
	return 0;
}

/*
 * Prepare a CPU for idle from an RCU perspective.  The first major task is to
 * sense whether nohz mode has been enabled or disabled via sysfs.  The second
 * major task is to accelerate (that is, assign grace-period numbers to) any
 * recently arrived callbacks.
 *
 * The caller must have disabled interrupts.
 */
static void rcu_prepare_for_idle(void)
{
	bool needwake;
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
	struct rcu_node *rnp;
	int tne;

	lockdep_assert_irqs_disabled();
	if (rcu_rdp_is_offloaded(rdp))
		return;

	/* Handle nohz enablement switches conservatively. */
	tne = READ_ONCE(tick_nohz_active);
	if (tne != rdp->tick_nohz_enabled_snap) {
		if (!rcu_segcblist_empty(&rdp->cblist))
			invoke_rcu_core(); /* force nohz to see update. */
		rdp->tick_nohz_enabled_snap = tne;
		return;
	}
	if (!tne)
		return;

	/*
	 * If we have not yet accelerated this jiffy, accelerate all
	 * callbacks on this CPU.
	 */
	if (rdp->last_accelerate == jiffies)
		return;
	rdp->last_accelerate = jiffies;
	if (rcu_segcblist_pend_cbs(&rdp->cblist)) {
		rnp = rdp->mynode;
		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
		needwake = rcu_accelerate_cbs(rnp, rdp);
		raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
		if (needwake)
			rcu_gp_kthread_wake();
	}
}

/*
 * Clean up for exit from idle.  Attempt to advance callbacks based on
 * any grace periods that elapsed while the CPU was idle, and if any
 * callbacks are now ready to invoke, initiate invocation.
 */
static void rcu_cleanup_after_idle(void)
{
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);

	lockdep_assert_irqs_disabled();
	if (rcu_rdp_is_offloaded(rdp))
		return;
	if (rcu_try_advance_all_cbs())
		invoke_rcu_core();
}

#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
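
/*
 * Worked example for the rcu_needs_cpu() timer rounding above
 * (illustrative): with rcu_idle_gp_delay == 4 and jiffies == 1003,
 * dj = round_up(4 + 1003, 4) - 1003 = 1008 - 1003 = 5, so the idle CPU
 * wakes on a jiffies value that is a multiple of rcu_idle_gp_delay,
 * which batches the wakeups of idle CPUs with pending callbacks.
 */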
1496 * 1497 * This is intended to be used in conjunction with Frederic Weisbecker's 1498 * adaptive-idle work, which would seriously reduce OS jitter on CPUs 1499 * running CPU-bound user-mode computations. 1500 * 1501 * Offloading of callbacks can also be used as an energy-efficiency 1502 * measure because CPUs with no RCU callbacks queued are more aggressive 1503 * about entering dyntick-idle mode. 1504 */ 1505 1506 1507 /* 1508 * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. 1509 * If the list is invalid, a warning is emitted and all CPUs are offloaded. 1510 */ 1511 static int __init rcu_nocb_setup(char *str) 1512 { 1513 alloc_bootmem_cpumask_var(&rcu_nocb_mask); 1514 if (cpulist_parse(str, rcu_nocb_mask)) { 1515 pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); 1516 cpumask_setall(rcu_nocb_mask); 1517 } 1518 return 1; 1519 } 1520 __setup("rcu_nocbs=", rcu_nocb_setup); 1521 1522 static int __init parse_rcu_nocb_poll(char *arg) 1523 { 1524 rcu_nocb_poll = true; 1525 return 0; 1526 } 1527 early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 1528 1529 /* 1530 * Don't bother bypassing ->cblist if the call_rcu() rate is low. 1531 * After all, the main point of bypassing is to avoid lock contention 1532 * on ->nocb_lock, which only can happen at high call_rcu() rates. 1533 */ 1534 static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ; 1535 module_param(nocb_nobypass_lim_per_jiffy, int, 0); 1536 1537 /* 1538 * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the 1539 * lock isn't immediately available, increment ->nocb_lock_contended to 1540 * flag the contention. 1541 */ 1542 static void rcu_nocb_bypass_lock(struct rcu_data *rdp) 1543 __acquires(&rdp->nocb_bypass_lock) 1544 { 1545 lockdep_assert_irqs_disabled(); 1546 if (raw_spin_trylock(&rdp->nocb_bypass_lock)) 1547 return; 1548 atomic_inc(&rdp->nocb_lock_contended); 1549 WARN_ON_ONCE(smp_processor_id() != rdp->cpu); 1550 smp_mb__after_atomic(); /* atomic_inc() before lock. */ 1551 raw_spin_lock(&rdp->nocb_bypass_lock); 1552 smp_mb__before_atomic(); /* atomic_dec() after lock. */ 1553 atomic_dec(&rdp->nocb_lock_contended); 1554 } 1555 1556 /* 1557 * Spinwait until the specified rcu_data structure's ->nocb_lock is 1558 * not contended. Please note that this is extremely special-purpose, 1559 * relying on the fact that at most two kthreads and one CPU contend for 1560 * this lock, and also that the two kthreads are guaranteed to have frequent 1561 * grace-period-duration time intervals between successive acquisitions 1562 * of the lock. This allows us to use an extremely simple throttling 1563 * mechanism, and further to apply it only to the CPU doing floods of 1564 * call_rcu() invocations. Don't try this at home! 1565 */ 1566 static void rcu_nocb_wait_contended(struct rcu_data *rdp) 1567 { 1568 WARN_ON_ONCE(smp_processor_id() != rdp->cpu); 1569 while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended))) 1570 cpu_relax(); 1571 } 1572 1573 /* 1574 * Conditionally acquire the specified rcu_data structure's 1575 * ->nocb_bypass_lock. 1576 */ 1577 static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp) 1578 { 1579 lockdep_assert_irqs_disabled(); 1580 return raw_spin_trylock(&rdp->nocb_bypass_lock); 1581 } 1582 1583 /* 1584 * Release the specified rcu_data structure's ->nocb_bypass_lock. 
1585 */ 1586 static void rcu_nocb_bypass_unlock(struct rcu_data *rdp) 1587 __releases(&rdp->nocb_bypass_lock) 1588 { 1589 lockdep_assert_irqs_disabled(); 1590 raw_spin_unlock(&rdp->nocb_bypass_lock); 1591 } 1592 1593 /* 1594 * Acquire the specified rcu_data structure's ->nocb_lock, but only 1595 * if it corresponds to a no-CBs CPU. 1596 */ 1597 static void rcu_nocb_lock(struct rcu_data *rdp) 1598 { 1599 lockdep_assert_irqs_disabled(); 1600 if (!rcu_rdp_is_offloaded(rdp)) 1601 return; 1602 raw_spin_lock(&rdp->nocb_lock); 1603 } 1604 1605 /* 1606 * Release the specified rcu_data structure's ->nocb_lock, but only 1607 * if it corresponds to a no-CBs CPU. 1608 */ 1609 static void rcu_nocb_unlock(struct rcu_data *rdp) 1610 { 1611 if (rcu_rdp_is_offloaded(rdp)) { 1612 lockdep_assert_irqs_disabled(); 1613 raw_spin_unlock(&rdp->nocb_lock); 1614 } 1615 } 1616 1617 /* 1618 * Release the specified rcu_data structure's ->nocb_lock and restore 1619 * interrupts, but only if it corresponds to a no-CBs CPU. 1620 */ 1621 static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, 1622 unsigned long flags) 1623 { 1624 if (rcu_rdp_is_offloaded(rdp)) { 1625 lockdep_assert_irqs_disabled(); 1626 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1627 } else { 1628 local_irq_restore(flags); 1629 } 1630 } 1631 1632 /* Lockdep check that ->cblist may be safely accessed. */ 1633 static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) 1634 { 1635 lockdep_assert_irqs_disabled(); 1636 if (rcu_rdp_is_offloaded(rdp)) 1637 lockdep_assert_held(&rdp->nocb_lock); 1638 } 1639 1640 /* 1641 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended 1642 * grace period. 1643 */ 1644 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) 1645 { 1646 swake_up_all(sq); 1647 } 1648 1649 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) 1650 { 1651 return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1]; 1652 } 1653 1654 static void rcu_init_one_nocb(struct rcu_node *rnp) 1655 { 1656 init_swait_queue_head(&rnp->nocb_gp_wq[0]); 1657 init_swait_queue_head(&rnp->nocb_gp_wq[1]); 1658 } 1659 1660 /* Is the specified CPU a no-CBs CPU? */ 1661 bool rcu_is_nocb_cpu(int cpu) 1662 { 1663 if (cpumask_available(rcu_nocb_mask)) 1664 return cpumask_test_cpu(cpu, rcu_nocb_mask); 1665 return false; 1666 } 1667 1668 static bool __wake_nocb_gp(struct rcu_data *rdp_gp, 1669 struct rcu_data *rdp, 1670 bool force, unsigned long flags) 1671 __releases(rdp_gp->nocb_gp_lock) 1672 { 1673 bool needwake = false; 1674 1675 if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) { 1676 raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); 1677 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 1678 TPS("AlreadyAwake")); 1679 return false; 1680 } 1681 1682 if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) { 1683 WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); 1684 del_timer(&rdp_gp->nocb_timer); 1685 } 1686 1687 if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { 1688 WRITE_ONCE(rdp_gp->nocb_gp_sleep, false); 1689 needwake = true; 1690 } 1691 raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); 1692 if (needwake) { 1693 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake")); 1694 wake_up_process(rdp_gp->nocb_gp_kthread); 1695 } 1696 1697 return needwake; 1698 } 1699 1700 /* 1701 * Kick the GP kthread for this NOCB group. 
1702 */ 1703 static bool wake_nocb_gp(struct rcu_data *rdp, bool force) 1704 { 1705 unsigned long flags; 1706 struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; 1707 1708 raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); 1709 return __wake_nocb_gp(rdp_gp, rdp, force, flags); 1710 } 1711 1712 /* 1713 * Arrange to wake the GP kthread for this NOCB group at some future 1714 * time when it is safe to do so. 1715 */ 1716 static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, 1717 const char *reason) 1718 { 1719 unsigned long flags; 1720 struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; 1721 1722 raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); 1723 1724 /* 1725 * Bypass wakeup overrides previous deferments. In case 1726 * of callback storm, no need to wake up too early. 1727 */ 1728 if (waketype == RCU_NOCB_WAKE_BYPASS) { 1729 mod_timer(&rdp_gp->nocb_timer, jiffies + 2); 1730 WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); 1731 } else { 1732 if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE) 1733 mod_timer(&rdp_gp->nocb_timer, jiffies + 1); 1734 if (rdp_gp->nocb_defer_wakeup < waketype) 1735 WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); 1736 } 1737 1738 raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); 1739 1740 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason); 1741 } 1742 1743 /* 1744 * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. 1745 * However, if there is a callback to be enqueued and if ->nocb_bypass 1746 * proves to be initially empty, just return false because the no-CB GP 1747 * kthread may need to be awakened in this case. 1748 * 1749 * Note that this function always returns true if rhp is NULL. 1750 */ 1751 static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, 1752 unsigned long j) 1753 { 1754 struct rcu_cblist rcl; 1755 1756 WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp)); 1757 rcu_lockdep_assert_cblist_protected(rdp); 1758 lockdep_assert_held(&rdp->nocb_bypass_lock); 1759 if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) { 1760 raw_spin_unlock(&rdp->nocb_bypass_lock); 1761 return false; 1762 } 1763 /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */ 1764 if (rhp) 1765 rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */ 1766 rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp); 1767 rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl); 1768 WRITE_ONCE(rdp->nocb_bypass_first, j); 1769 rcu_nocb_bypass_unlock(rdp); 1770 return true; 1771 } 1772 1773 /* 1774 * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL. 1775 * However, if there is a callback to be enqueued and if ->nocb_bypass 1776 * proves to be initially empty, just return false because the no-CB GP 1777 * kthread may need to be awakened in this case. 1778 * 1779 * Note that this function always returns true if rhp is NULL. 1780 */ 1781 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, 1782 unsigned long j) 1783 { 1784 if (!rcu_rdp_is_offloaded(rdp)) 1785 return true; 1786 rcu_lockdep_assert_cblist_protected(rdp); 1787 rcu_nocb_bypass_lock(rdp); 1788 return rcu_nocb_do_flush_bypass(rdp, rhp, j); 1789 } 1790 1791 /* 1792 * If the ->nocb_bypass_lock is immediately available, flush the 1793 * ->nocb_bypass queue into ->cblist. 
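 *
 * This is best effort: if the trylock fails, the flush is simply
 * skipped and retried on a later pass (or performed by the call_rcu()
 * path itself once its per-jiffy limits trip).  Typical use, as in
 * nocb_gp_wait() below, where j is the caller's jiffies snapshot:
 *
 *	(void)rcu_nocb_try_flush_bypass(rdp, j);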
1794 */ 1795 static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j) 1796 { 1797 rcu_lockdep_assert_cblist_protected(rdp); 1798 if (!rcu_rdp_is_offloaded(rdp) || 1799 !rcu_nocb_bypass_trylock(rdp)) 1800 return; 1801 WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j)); 1802 } 1803 1804 /* 1805 * See whether it is appropriate to use the ->nocb_bypass list in order 1806 * to control contention on ->nocb_lock. A limited number of direct 1807 * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass 1808 * is non-empty, further callbacks must be placed into ->nocb_bypass, 1809 * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch 1810 * back to direct use of ->cblist. However, ->nocb_bypass should not be 1811 * used if ->cblist is empty, because otherwise callbacks can be stranded 1812 * on ->nocb_bypass because we cannot count on the current CPU ever again 1813 * invoking call_rcu(). The general rule is that if ->nocb_bypass is 1814 * non-empty, the corresponding no-CBs grace-period kthread must not be 1815 * in an indefinite sleep state. 1816 * 1817 * Finally, it is not permitted to use the bypass during early boot, 1818 * as doing so would confuse the auto-initialization code. Besides 1819 * which, there is no point in worrying about lock contention while 1820 * there is only one CPU in operation. 1821 */ 1822 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, 1823 bool *was_alldone, unsigned long flags) 1824 { 1825 unsigned long c; 1826 unsigned long cur_gp_seq; 1827 unsigned long j = jiffies; 1828 long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); 1829 1830 lockdep_assert_irqs_disabled(); 1831 1832 // Pure softirq/rcuc based processing: no bypassing, no 1833 // locking. 1834 if (!rcu_rdp_is_offloaded(rdp)) { 1835 *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); 1836 return false; 1837 } 1838 1839 // In the process of (de-)offloading: no bypassing, but 1840 // locking. 1841 if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { 1842 rcu_nocb_lock(rdp); 1843 *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); 1844 return false; /* Not offloaded, no bypassing. */ 1845 } 1846 1847 // Don't use ->nocb_bypass during early boot. 1848 if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) { 1849 rcu_nocb_lock(rdp); 1850 WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); 1851 *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); 1852 return false; 1853 } 1854 1855 // If we have advanced to a new jiffy, reset counts to allow 1856 // moving back from ->nocb_bypass to ->cblist. 1857 if (j == rdp->nocb_nobypass_last) { 1858 c = rdp->nocb_nobypass_count + 1; 1859 } else { 1860 WRITE_ONCE(rdp->nocb_nobypass_last, j); 1861 c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy; 1862 if (ULONG_CMP_LT(rdp->nocb_nobypass_count, 1863 nocb_nobypass_lim_per_jiffy)) 1864 c = 0; 1865 else if (c > nocb_nobypass_lim_per_jiffy) 1866 c = nocb_nobypass_lim_per_jiffy; 1867 } 1868 WRITE_ONCE(rdp->nocb_nobypass_count, c); 1869 1870 // If there hasn't yet been all that many ->cblist enqueues 1871 // this jiffy, tell the caller to enqueue onto ->cblist. But flush 1872 // ->nocb_bypass first. 
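// (Return-value recap for the exits below: false means the caller must
// still enqueue rhp onto ->cblist itself, true means rhp has already
// been queued here, either after a bypass flush or directly onto
// ->nocb_bypass.)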
1873 if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
1874 rcu_nocb_lock(rdp);
1875 *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1876 if (*was_alldone)
1877 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1878 TPS("FirstQ"));
1879 WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
1880 WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
1881 return false; // Caller must enqueue the callback.
1882 }
1883
1884 // If ->nocb_bypass has been used too long or is too full,
1885 // flush ->nocb_bypass to ->cblist.
1886 if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
1887 ncbs >= qhimark) {
1888 rcu_nocb_lock(rdp);
1889 if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
1890 *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
1891 if (*was_alldone)
1892 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1893 TPS("FirstQ"));
1894 WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
1895 return false; // Caller must enqueue the callback.
1896 }
1897 if (j != rdp->nocb_gp_adv_time &&
1898 rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
1899 rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
1900 rcu_advance_cbs_nowake(rdp->mynode, rdp);
1901 rdp->nocb_gp_adv_time = j;
1902 }
1903 rcu_nocb_unlock_irqrestore(rdp, flags);
1904 return true; // Callback already enqueued.
1905 }
1906
1907 // We need to use the bypass.
1908 rcu_nocb_wait_contended(rdp);
1909 rcu_nocb_bypass_lock(rdp);
1910 ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
1911 rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
1912 rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
1913 if (!ncbs) {
1914 WRITE_ONCE(rdp->nocb_bypass_first, j);
1915 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
1916 }
1917 rcu_nocb_bypass_unlock(rdp);
1918 smp_mb(); /* Order enqueue before wake. */
1919 if (ncbs) {
1920 local_irq_restore(flags);
1921 } else {
1922 // No-CBs GP kthread might be indefinitely asleep, if so, wake.
1923 rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
1924 if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
1925 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1926 TPS("FirstBQwake"));
1927 __call_rcu_nocb_wake(rdp, true, flags);
1928 } else {
1929 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1930 TPS("FirstBQnoWake"));
1931 rcu_nocb_unlock_irqrestore(rdp, flags);
1932 }
1933 }
1934 return true; // Callback already enqueued.
1935 }
1936
1937 /*
1938 * Awaken the no-CBs grace-period kthread if needed, either due to it
1939 * legitimately being asleep or due to overload conditions.
1940 *
1941 * If warranted, also wake up the kthread servicing this CPU's queues.
1942 */
1943 static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
1944 unsigned long flags)
1945 __releases(rdp->nocb_lock)
1946 {
1947 unsigned long cur_gp_seq;
1948 unsigned long j;
1949 long len;
1950 struct task_struct *t;
1951
1952 // If we are being polled or there is no kthread, just leave.
1953 t = READ_ONCE(rdp->nocb_gp_kthread);
1954 if (rcu_nocb_poll || !t) {
1955 rcu_nocb_unlock_irqrestore(rdp, flags);
1956 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
1957 TPS("WakeNotPoll"));
1958 return;
1959 }
1960 // Need to actually do a wakeup.
1961 len = rcu_segcblist_n_cbs(&rdp->cblist);
1962 if (was_alldone) {
1963 rdp->qlen_last_fqs_check = len;
1964 if (!irqs_disabled_flags(flags)) {
1965 /* ... if queue was empty ...
*/ 1966 rcu_nocb_unlock_irqrestore(rdp, flags); 1967 wake_nocb_gp(rdp, false); 1968 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 1969 TPS("WakeEmpty")); 1970 } else { 1971 rcu_nocb_unlock_irqrestore(rdp, flags); 1972 wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, 1973 TPS("WakeEmptyIsDeferred")); 1974 } 1975 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 1976 /* ... or if many callbacks queued. */ 1977 rdp->qlen_last_fqs_check = len; 1978 j = jiffies; 1979 if (j != rdp->nocb_gp_adv_time && 1980 rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && 1981 rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) { 1982 rcu_advance_cbs_nowake(rdp->mynode, rdp); 1983 rdp->nocb_gp_adv_time = j; 1984 } 1985 smp_mb(); /* Enqueue before timer_pending(). */ 1986 if ((rdp->nocb_cb_sleep || 1987 !rcu_segcblist_ready_cbs(&rdp->cblist)) && 1988 !timer_pending(&rdp->nocb_timer)) { 1989 rcu_nocb_unlock_irqrestore(rdp, flags); 1990 wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, 1991 TPS("WakeOvfIsDeferred")); 1992 } else { 1993 rcu_nocb_unlock_irqrestore(rdp, flags); 1994 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); 1995 } 1996 } else { 1997 rcu_nocb_unlock_irqrestore(rdp, flags); 1998 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); 1999 } 2000 return; 2001 } 2002 2003 /* 2004 * Check if we ignore this rdp. 2005 * 2006 * We check that without holding the nocb lock but 2007 * we make sure not to miss a freshly offloaded rdp 2008 * with the current ordering: 2009 * 2010 * rdp_offload_toggle() nocb_gp_enabled_cb() 2011 * ------------------------- ---------------------------- 2012 * WRITE flags LOCK nocb_gp_lock 2013 * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep 2014 * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock 2015 * UNLOCK nocb_gp_lock READ flags 2016 */ 2017 static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) 2018 { 2019 u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; 2020 2021 return rcu_segcblist_test_flags(&rdp->cblist, flags); 2022 } 2023 2024 static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp, 2025 bool *needwake_state) 2026 { 2027 struct rcu_segcblist *cblist = &rdp->cblist; 2028 2029 if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { 2030 if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { 2031 rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); 2032 if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) 2033 *needwake_state = true; 2034 } 2035 return false; 2036 } 2037 2038 /* 2039 * De-offloading. Clear our flag and notify the de-offload worker. 2040 * We will ignore this rdp until it ever gets re-offloaded. 2041 */ 2042 WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); 2043 rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); 2044 if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) 2045 *needwake_state = true; 2046 return true; 2047 } 2048 2049 2050 /* 2051 * No-CBs GP kthreads come here to wait for additional callbacks to show up 2052 * or for grace periods to end. 2053 */ 2054 static void nocb_gp_wait(struct rcu_data *my_rdp) 2055 { 2056 bool bypass = false; 2057 long bypass_ncbs; 2058 int __maybe_unused cpu = my_rdp->cpu; 2059 unsigned long cur_gp_seq; 2060 unsigned long flags; 2061 bool gotcbs = false; 2062 unsigned long j = jiffies; 2063 bool needwait_gp = false; // This prevents actual uninitialized use. 
2064 bool needwake; 2065 bool needwake_gp; 2066 struct rcu_data *rdp; 2067 struct rcu_node *rnp; 2068 unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning. 2069 bool wasempty = false; 2070 2071 /* 2072 * Each pass through the following loop checks for CBs and for the 2073 * nearest grace period (if any) to wait for next. The CB kthreads 2074 * and the global grace-period kthread are awakened if needed. 2075 */ 2076 WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); 2077 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { 2078 bool needwake_state = false; 2079 2080 if (!nocb_gp_enabled_cb(rdp)) 2081 continue; 2082 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); 2083 rcu_nocb_lock_irqsave(rdp, flags); 2084 if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) { 2085 rcu_nocb_unlock_irqrestore(rdp, flags); 2086 if (needwake_state) 2087 swake_up_one(&rdp->nocb_state_wq); 2088 continue; 2089 } 2090 bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); 2091 if (bypass_ncbs && 2092 (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || 2093 bypass_ncbs > 2 * qhimark)) { 2094 // Bypass full or old, so flush it. 2095 (void)rcu_nocb_try_flush_bypass(rdp, j); 2096 bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); 2097 } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { 2098 rcu_nocb_unlock_irqrestore(rdp, flags); 2099 if (needwake_state) 2100 swake_up_one(&rdp->nocb_state_wq); 2101 continue; /* No callbacks here, try next. */ 2102 } 2103 if (bypass_ncbs) { 2104 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 2105 TPS("Bypass")); 2106 bypass = true; 2107 } 2108 rnp = rdp->mynode; 2109 2110 // Advance callbacks if helpful and low contention. 2111 needwake_gp = false; 2112 if (!rcu_segcblist_restempty(&rdp->cblist, 2113 RCU_NEXT_READY_TAIL) || 2114 (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && 2115 rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) { 2116 raw_spin_lock_rcu_node(rnp); /* irqs disabled. */ 2117 needwake_gp = rcu_advance_cbs(rnp, rdp); 2118 wasempty = rcu_segcblist_restempty(&rdp->cblist, 2119 RCU_NEXT_READY_TAIL); 2120 raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */ 2121 } 2122 // Need to wait on some grace period? 2123 WARN_ON_ONCE(wasempty && 2124 !rcu_segcblist_restempty(&rdp->cblist, 2125 RCU_NEXT_READY_TAIL)); 2126 if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) { 2127 if (!needwait_gp || 2128 ULONG_CMP_LT(cur_gp_seq, wait_gp_seq)) 2129 wait_gp_seq = cur_gp_seq; 2130 needwait_gp = true; 2131 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, 2132 TPS("NeedWaitGP")); 2133 } 2134 if (rcu_segcblist_ready_cbs(&rdp->cblist)) { 2135 needwake = rdp->nocb_cb_sleep; 2136 WRITE_ONCE(rdp->nocb_cb_sleep, false); 2137 smp_mb(); /* CB invocation -after- GP end. */ 2138 } else { 2139 needwake = false; 2140 } 2141 rcu_nocb_unlock_irqrestore(rdp, flags); 2142 if (needwake) { 2143 swake_up_one(&rdp->nocb_cb_wq); 2144 gotcbs = true; 2145 } 2146 if (needwake_gp) 2147 rcu_gp_kthread_wake(); 2148 if (needwake_state) 2149 swake_up_one(&rdp->nocb_state_wq); 2150 } 2151 2152 my_rdp->nocb_gp_bypass = bypass; 2153 my_rdp->nocb_gp_gp = needwait_gp; 2154 my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0; 2155 2156 if (bypass && !rcu_nocb_poll) { 2157 // At least one child with non-empty ->nocb_bypass, so set 2158 // timer in order to avoid stranding its callbacks. 2159 wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS, 2160 TPS("WakeBypassIsDeferred")); 2161 } 2162 if (rcu_nocb_poll) { 2163 /* Polling, so trace if first poll in the series. 
*/
2164 if (gotcbs)
2165 trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
2166 schedule_timeout_idle(1);
2167 } else if (!needwait_gp) {
2168 /* Wait for callbacks to appear. */
2169 trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
2170 swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
2171 !READ_ONCE(my_rdp->nocb_gp_sleep));
2172 trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
2173 } else {
2174 rnp = my_rdp->mynode;
2175 trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
2176 swait_event_interruptible_exclusive(
2177 rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
2178 rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
2179 !READ_ONCE(my_rdp->nocb_gp_sleep));
2180 trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
2181 }
2182 if (!rcu_nocb_poll) {
2183 raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
2184 if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
2185 WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
2186 del_timer(&my_rdp->nocb_timer);
2187 }
2188 WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
2189 raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
2190 }
2191 my_rdp->nocb_gp_seq = -1;
2192 WARN_ON(signal_pending(current));
2193 }
2194
2195 /*
2196 * No-CBs grace-period-wait kthread. There is one of these per group
2197 * of CPUs, but only once at least one CPU in that group has come online
2198 * at least once since boot. This kthread checks for newly posted
2199 * callbacks from any of the CPUs it is responsible for, waits for a
2200 * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
2201 * that then have callback-invocation work to do.
2202 */
2203 static int rcu_nocb_gp_kthread(void *arg)
2204 {
2205 struct rcu_data *rdp = arg;
2206
2207 for (;;) {
2208 WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
2209 nocb_gp_wait(rdp);
2210 cond_resched_tasks_rcu_qs();
2211 }
2212 return 0;
2213 }
2214
2215 static inline bool nocb_cb_can_run(struct rcu_data *rdp)
2216 {
2217 u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
2218 return rcu_segcblist_test_flags(&rdp->cblist, flags);
2219 }
2220
2221 static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
2222 {
2223 return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
2224 }
2225
2226 /*
2227 * Invoke any ready callbacks from the corresponding no-CBs CPU,
2228 * then, if there are no more, wait for more to appear.
2229 */
2230 static void nocb_cb_wait(struct rcu_data *rdp)
2231 {
2232 struct rcu_segcblist *cblist = &rdp->cblist;
2233 unsigned long cur_gp_seq;
2234 unsigned long flags;
2235 bool needwake_state = false;
2236 bool needwake_gp = false;
2237 bool can_sleep = true;
2238 struct rcu_node *rnp = rdp->mynode;
2239
2240 local_irq_save(flags);
2241 rcu_momentary_dyntick_idle();
2242 local_irq_restore(flags);
2243 /*
2244 * Disable BH to provide the expected environment. Also, when
2245 * transitioning to/from NOCB mode, a self-requeuing callback might
2246 * be invoked from softirq. A short grace period could cause both
2247 * instances of this callback to execute concurrently.
2248 */
2249 local_bh_disable();
2250 rcu_do_batch(rdp);
2251 local_bh_enable();
2252 lockdep_assert_irqs_enabled();
2253 rcu_nocb_lock_irqsave(rdp, flags);
2254 if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
2255 rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
2256 raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
2257 needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
2258 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled.
*/ 2259 } 2260 2261 if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { 2262 if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { 2263 rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); 2264 if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) 2265 needwake_state = true; 2266 } 2267 if (rcu_segcblist_ready_cbs(cblist)) 2268 can_sleep = false; 2269 } else { 2270 /* 2271 * De-offloading. Clear our flag and notify the de-offload worker. 2272 * We won't touch the callbacks and keep sleeping until we ever 2273 * get re-offloaded. 2274 */ 2275 WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); 2276 rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); 2277 if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) 2278 needwake_state = true; 2279 } 2280 2281 WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep); 2282 2283 if (rdp->nocb_cb_sleep) 2284 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); 2285 2286 rcu_nocb_unlock_irqrestore(rdp, flags); 2287 if (needwake_gp) 2288 rcu_gp_kthread_wake(); 2289 2290 if (needwake_state) 2291 swake_up_one(&rdp->nocb_state_wq); 2292 2293 do { 2294 swait_event_interruptible_exclusive(rdp->nocb_cb_wq, 2295 nocb_cb_wait_cond(rdp)); 2296 2297 // VVV Ensure CB invocation follows _sleep test. 2298 if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ 2299 WARN_ON(signal_pending(current)); 2300 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); 2301 } 2302 } while (!nocb_cb_can_run(rdp)); 2303 } 2304 2305 /* 2306 * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke 2307 * nocb_cb_wait() to do the dirty work. 2308 */ 2309 static int rcu_nocb_cb_kthread(void *arg) 2310 { 2311 struct rcu_data *rdp = arg; 2312 2313 // Each pass through this loop does one callback batch, and, 2314 // if there are no more ready callbacks, waits for them. 2315 for (;;) { 2316 nocb_cb_wait(rdp); 2317 cond_resched_tasks_rcu_qs(); 2318 } 2319 return 0; 2320 } 2321 2322 /* Is a deferred wakeup of rcu_nocb_kthread() required? */ 2323 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) 2324 { 2325 return READ_ONCE(rdp->nocb_defer_wakeup) >= level; 2326 } 2327 2328 /* Do a deferred wakeup of rcu_nocb_kthread(). */ 2329 static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp, 2330 struct rcu_data *rdp, int level, 2331 unsigned long flags) 2332 __releases(rdp_gp->nocb_gp_lock) 2333 { 2334 int ndw; 2335 int ret; 2336 2337 if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) { 2338 raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); 2339 return false; 2340 } 2341 2342 ndw = rdp_gp->nocb_defer_wakeup; 2343 ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); 2344 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); 2345 2346 return ret; 2347 } 2348 2349 /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ 2350 static void do_nocb_deferred_wakeup_timer(struct timer_list *t) 2351 { 2352 unsigned long flags; 2353 struct rcu_data *rdp = from_timer(rdp, t, nocb_timer); 2354 2355 WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp); 2356 trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer")); 2357 2358 raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags); 2359 smp_mb__after_spinlock(); /* Timer expire before wakeup. */ 2360 do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags); 2361 } 2362 2363 /* 2364 * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. 2365 * This means we do an inexact common-case check. 
Note that if 2366 * we miss, ->nocb_timer will eventually clean things up. 2367 */ 2368 static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) 2369 { 2370 unsigned long flags; 2371 struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; 2372 2373 if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE)) 2374 return false; 2375 2376 raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); 2377 return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags); 2378 } 2379 2380 void rcu_nocb_flush_deferred_wakeup(void) 2381 { 2382 do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data)); 2383 } 2384 EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); 2385 2386 static int rdp_offload_toggle(struct rcu_data *rdp, 2387 bool offload, unsigned long flags) 2388 __releases(rdp->nocb_lock) 2389 { 2390 struct rcu_segcblist *cblist = &rdp->cblist; 2391 struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; 2392 bool wake_gp = false; 2393 2394 rcu_segcblist_offload(cblist, offload); 2395 2396 if (rdp->nocb_cb_sleep) 2397 rdp->nocb_cb_sleep = false; 2398 rcu_nocb_unlock_irqrestore(rdp, flags); 2399 2400 /* 2401 * Ignore former value of nocb_cb_sleep and force wake up as it could 2402 * have been spuriously set to false already. 2403 */ 2404 swake_up_one(&rdp->nocb_cb_wq); 2405 2406 raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); 2407 if (rdp_gp->nocb_gp_sleep) { 2408 rdp_gp->nocb_gp_sleep = false; 2409 wake_gp = true; 2410 } 2411 raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); 2412 2413 if (wake_gp) 2414 wake_up_process(rdp_gp->nocb_gp_kthread); 2415 2416 return 0; 2417 } 2418 2419 static long rcu_nocb_rdp_deoffload(void *arg) 2420 { 2421 struct rcu_data *rdp = arg; 2422 struct rcu_segcblist *cblist = &rdp->cblist; 2423 unsigned long flags; 2424 int ret; 2425 2426 WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); 2427 2428 pr_info("De-offloading %d\n", rdp->cpu); 2429 2430 rcu_nocb_lock_irqsave(rdp, flags); 2431 /* 2432 * Flush once and for all now. This suffices because we are 2433 * running on the target CPU holding ->nocb_lock (thus having 2434 * interrupts disabled), and because rdp_offload_toggle() 2435 * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED. 2436 * Thus future calls to rcu_segcblist_completely_offloaded() will 2437 * return false, which means that future calls to rcu_nocb_try_bypass() 2438 * will refuse to put anything into the bypass. 2439 */ 2440 WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); 2441 ret = rdp_offload_toggle(rdp, false, flags); 2442 swait_event_exclusive(rdp->nocb_state_wq, 2443 !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | 2444 SEGCBLIST_KTHREAD_GP)); 2445 /* 2446 * Lock one last time to acquire latest callback updates from kthreads 2447 * so we can later handle callbacks locally without locking. 2448 */ 2449 rcu_nocb_lock_irqsave(rdp, flags); 2450 /* 2451 * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb 2452 * lock is released but how about being paranoid for once? 2453 */ 2454 rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); 2455 /* 2456 * With SEGCBLIST_SOFTIRQ_ONLY, we can't use 2457 * rcu_nocb_unlock_irqrestore() anymore. 
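 * Its offloaded-state check would now treat this CPU as no longer
 * offloaded and skip the actual raw unlock, leaving ->nocb_lock held,
 * hence the direct raw_spin_unlock_irqrestore() below.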
2458 */ 2459 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 2460 2461 /* Sanity check */ 2462 WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass)); 2463 2464 2465 return ret; 2466 } 2467 2468 int rcu_nocb_cpu_deoffload(int cpu) 2469 { 2470 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 2471 int ret = 0; 2472 2473 mutex_lock(&rcu_state.barrier_mutex); 2474 cpus_read_lock(); 2475 if (rcu_rdp_is_offloaded(rdp)) { 2476 if (cpu_online(cpu)) { 2477 ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); 2478 if (!ret) 2479 cpumask_clear_cpu(cpu, rcu_nocb_mask); 2480 } else { 2481 pr_info("NOCB: Can't CB-deoffload an offline CPU\n"); 2482 ret = -EINVAL; 2483 } 2484 } 2485 cpus_read_unlock(); 2486 mutex_unlock(&rcu_state.barrier_mutex); 2487 2488 return ret; 2489 } 2490 EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); 2491 2492 static long rcu_nocb_rdp_offload(void *arg) 2493 { 2494 struct rcu_data *rdp = arg; 2495 struct rcu_segcblist *cblist = &rdp->cblist; 2496 unsigned long flags; 2497 int ret; 2498 2499 WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); 2500 /* 2501 * For now we only support re-offload, ie: the rdp must have been 2502 * offloaded on boot first. 2503 */ 2504 if (!rdp->nocb_gp_rdp) 2505 return -EINVAL; 2506 2507 pr_info("Offloading %d\n", rdp->cpu); 2508 /* 2509 * Can't use rcu_nocb_lock_irqsave() while we are in 2510 * SEGCBLIST_SOFTIRQ_ONLY mode. 2511 */ 2512 raw_spin_lock_irqsave(&rdp->nocb_lock, flags); 2513 2514 /* 2515 * We didn't take the nocb lock while working on the 2516 * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. 2517 * Every modifications that have been done previously on 2518 * rdp->cblist must be visible remotely by the nocb kthreads 2519 * upon wake up after reading the cblist flags. 2520 * 2521 * The layout against nocb_lock enforces that ordering: 2522 * 2523 * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() 2524 * ------------------------- ---------------------------- 2525 * WRITE callbacks rcu_nocb_lock() 2526 * rcu_nocb_lock() READ flags 2527 * WRITE flags READ callbacks 2528 * rcu_nocb_unlock() rcu_nocb_unlock() 2529 */ 2530 ret = rdp_offload_toggle(rdp, true, flags); 2531 swait_event_exclusive(rdp->nocb_state_wq, 2532 rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && 2533 rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); 2534 2535 return ret; 2536 } 2537 2538 int rcu_nocb_cpu_offload(int cpu) 2539 { 2540 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 2541 int ret = 0; 2542 2543 mutex_lock(&rcu_state.barrier_mutex); 2544 cpus_read_lock(); 2545 if (!rcu_rdp_is_offloaded(rdp)) { 2546 if (cpu_online(cpu)) { 2547 ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); 2548 if (!ret) 2549 cpumask_set_cpu(cpu, rcu_nocb_mask); 2550 } else { 2551 pr_info("NOCB: Can't CB-offload an offline CPU\n"); 2552 ret = -EINVAL; 2553 } 2554 } 2555 cpus_read_unlock(); 2556 mutex_unlock(&rcu_state.barrier_mutex); 2557 2558 return ret; 2559 } 2560 EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); 2561 2562 void __init rcu_init_nohz(void) 2563 { 2564 int cpu; 2565 bool need_rcu_nocb_mask = false; 2566 struct rcu_data *rdp; 2567 2568 #if defined(CONFIG_NO_HZ_FULL) 2569 if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask)) 2570 need_rcu_nocb_mask = true; 2571 #endif /* #if defined(CONFIG_NO_HZ_FULL) */ 2572 2573 if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { 2574 if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { 2575 pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); 2576 return; 2577 } 2578 } 2579 if 
(!cpumask_available(rcu_nocb_mask)) 2580 return; 2581 2582 #if defined(CONFIG_NO_HZ_FULL) 2583 if (tick_nohz_full_running) 2584 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); 2585 #endif /* #if defined(CONFIG_NO_HZ_FULL) */ 2586 2587 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { 2588 pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n"); 2589 cpumask_and(rcu_nocb_mask, cpu_possible_mask, 2590 rcu_nocb_mask); 2591 } 2592 if (cpumask_empty(rcu_nocb_mask)) 2593 pr_info("\tOffload RCU callbacks from CPUs: (none).\n"); 2594 else 2595 pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n", 2596 cpumask_pr_args(rcu_nocb_mask)); 2597 if (rcu_nocb_poll) 2598 pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); 2599 2600 for_each_cpu(cpu, rcu_nocb_mask) { 2601 rdp = per_cpu_ptr(&rcu_data, cpu); 2602 if (rcu_segcblist_empty(&rdp->cblist)) 2603 rcu_segcblist_init(&rdp->cblist); 2604 rcu_segcblist_offload(&rdp->cblist, true); 2605 rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); 2606 rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); 2607 } 2608 rcu_organize_nocb_kthreads(); 2609 } 2610 2611 /* Initialize per-rcu_data variables for no-CBs CPUs. */ 2612 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2613 { 2614 init_swait_queue_head(&rdp->nocb_cb_wq); 2615 init_swait_queue_head(&rdp->nocb_gp_wq); 2616 init_swait_queue_head(&rdp->nocb_state_wq); 2617 raw_spin_lock_init(&rdp->nocb_lock); 2618 raw_spin_lock_init(&rdp->nocb_bypass_lock); 2619 raw_spin_lock_init(&rdp->nocb_gp_lock); 2620 timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0); 2621 rcu_cblist_init(&rdp->nocb_bypass); 2622 } 2623 2624 /* 2625 * If the specified CPU is a no-CBs CPU that does not already have its 2626 * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread 2627 * for this CPU's group has not yet been created, spawn it as well. 2628 */ 2629 static void rcu_spawn_one_nocb_kthread(int cpu) 2630 { 2631 struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); 2632 struct rcu_data *rdp_gp; 2633 struct task_struct *t; 2634 2635 /* 2636 * If this isn't a no-CBs CPU or if it already has an rcuo kthread, 2637 * then nothing to do. 2638 */ 2639 if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread) 2640 return; 2641 2642 /* If we didn't spawn the GP kthread first, reorganize! */ 2643 rdp_gp = rdp->nocb_gp_rdp; 2644 if (!rdp_gp->nocb_gp_kthread) { 2645 t = kthread_run(rcu_nocb_gp_kthread, rdp_gp, 2646 "rcuog/%d", rdp_gp->cpu); 2647 if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) 2648 return; 2649 WRITE_ONCE(rdp_gp->nocb_gp_kthread, t); 2650 } 2651 2652 /* Spawn the kthread for this CPU. */ 2653 t = kthread_run(rcu_nocb_cb_kthread, rdp, 2654 "rcuo%c/%d", rcu_state.abbr, cpu); 2655 if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__)) 2656 return; 2657 WRITE_ONCE(rdp->nocb_cb_kthread, t); 2658 WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); 2659 } 2660 2661 /* 2662 * If the specified CPU is a no-CBs CPU that does not already have its 2663 * rcuo kthread, spawn it. 2664 */ 2665 static void rcu_spawn_cpu_nocb_kthread(int cpu) 2666 { 2667 if (rcu_scheduler_fully_active) 2668 rcu_spawn_one_nocb_kthread(cpu); 2669 } 2670 2671 /* 2672 * Once the scheduler is running, spawn rcuo kthreads for all online 2673 * no-CBs CPUs. 
This assumes that the early_initcall()s happen before 2674 * non-boot CPUs come online -- if this changes, we will need to add 2675 * some mutual exclusion. 2676 */ 2677 static void __init rcu_spawn_nocb_kthreads(void) 2678 { 2679 int cpu; 2680 2681 for_each_online_cpu(cpu) 2682 rcu_spawn_cpu_nocb_kthread(cpu); 2683 } 2684 2685 /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ 2686 static int rcu_nocb_gp_stride = -1; 2687 module_param(rcu_nocb_gp_stride, int, 0444); 2688 2689 /* 2690 * Initialize GP-CB relationships for all no-CBs CPU. 2691 */ 2692 static void __init rcu_organize_nocb_kthreads(void) 2693 { 2694 int cpu; 2695 bool firsttime = true; 2696 bool gotnocbs = false; 2697 bool gotnocbscbs = true; 2698 int ls = rcu_nocb_gp_stride; 2699 int nl = 0; /* Next GP kthread. */ 2700 struct rcu_data *rdp; 2701 struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */ 2702 struct rcu_data *rdp_prev = NULL; 2703 2704 if (!cpumask_available(rcu_nocb_mask)) 2705 return; 2706 if (ls == -1) { 2707 ls = nr_cpu_ids / int_sqrt(nr_cpu_ids); 2708 rcu_nocb_gp_stride = ls; 2709 } 2710 2711 /* 2712 * Each pass through this loop sets up one rcu_data structure. 2713 * Should the corresponding CPU come online in the future, then 2714 * we will spawn the needed set of rcu_nocb_kthread() kthreads. 2715 */ 2716 for_each_cpu(cpu, rcu_nocb_mask) { 2717 rdp = per_cpu_ptr(&rcu_data, cpu); 2718 if (rdp->cpu >= nl) { 2719 /* New GP kthread, set up for CBs & next GP. */ 2720 gotnocbs = true; 2721 nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; 2722 rdp->nocb_gp_rdp = rdp; 2723 rdp_gp = rdp; 2724 if (dump_tree) { 2725 if (!firsttime) 2726 pr_cont("%s\n", gotnocbscbs 2727 ? "" : " (self only)"); 2728 gotnocbscbs = false; 2729 firsttime = false; 2730 pr_alert("%s: No-CB GP kthread CPU %d:", 2731 __func__, cpu); 2732 } 2733 } else { 2734 /* Another CB kthread, link to previous GP kthread. */ 2735 gotnocbscbs = true; 2736 rdp->nocb_gp_rdp = rdp_gp; 2737 rdp_prev->nocb_next_cb_rdp = rdp; 2738 if (dump_tree) 2739 pr_cont(" %d", cpu); 2740 } 2741 rdp_prev = rdp; 2742 } 2743 if (gotnocbs && dump_tree) 2744 pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); 2745 } 2746 2747 /* 2748 * Bind the current task to the offloaded CPUs. If there are no offloaded 2749 * CPUs, leave the task unbound. Splat if the bind attempt fails. 2750 */ 2751 void rcu_bind_current_to_nocb(void) 2752 { 2753 if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask)) 2754 WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask)); 2755 } 2756 EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); 2757 2758 // The ->on_cpu field is available only in CONFIG_SMP=y, so... 2759 #ifdef CONFIG_SMP 2760 static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) 2761 { 2762 return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : ""; 2763 } 2764 #else // #ifdef CONFIG_SMP 2765 static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) 2766 { 2767 return ""; 2768 } 2769 #endif // #else #ifdef CONFIG_SMP 2770 2771 /* 2772 * Dump out nocb grace-period kthread state for the specified rcu_data 2773 * structure. 
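 *
 * Reading the dump: each "xX"[cond] argument below prints the capital
 * letter when the condition holds and the lowercase letter (or '.')
 * otherwise, e.g. "sS"[!!rdp->nocb_gp_sleep] prints 'S' while the GP
 * kthread is supposed to be sleeping.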
2774 */
2775 static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
2776 {
2777 struct rcu_node *rnp = rdp->mynode;
2778
2779 pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
2780 rdp->cpu,
2781 "kK"[!!rdp->nocb_gp_kthread],
2782 "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
2783 "dD"[!!rdp->nocb_defer_wakeup],
2784 "tT"[timer_pending(&rdp->nocb_timer)],
2785 "sS"[!!rdp->nocb_gp_sleep],
2786 ".W"[swait_active(&rdp->nocb_gp_wq)],
2787 ".W"[swait_active(&rnp->nocb_gp_wq[0])],
2788 ".W"[swait_active(&rnp->nocb_gp_wq[1])],
2789 ".B"[!!rdp->nocb_gp_bypass],
2790 ".G"[!!rdp->nocb_gp_gp],
2791 (long)rdp->nocb_gp_seq,
2792 rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
2793 rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
2794 rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
2795 show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread));
2796 }
2797
2798 /* Dump out nocb kthread state for the specified rcu_data structure. */
2799 static void show_rcu_nocb_state(struct rcu_data *rdp)
2800 {
2801 char bufw[20];
2802 char bufr[20];
2803 struct rcu_segcblist *rsclp = &rdp->cblist;
2804 bool waslocked;
2805 bool wassleep;
2806
2807 if (rdp->nocb_gp_rdp == rdp)
2808 show_rcu_nocb_gp_state(rdp);
2809
2810 sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
2811 sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
2812 pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
2813 rdp->cpu, rdp->nocb_gp_rdp->cpu,
2814 rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
2815 "kK"[!!rdp->nocb_cb_kthread],
2816 "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
2817 "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
2818 "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
2819 "sS"[!!rdp->nocb_cb_sleep],
2820 ".W"[swait_active(&rdp->nocb_cb_wq)],
2821 jiffies - rdp->nocb_bypass_first,
2822 jiffies - rdp->nocb_nobypass_last,
2823 rdp->nocb_nobypass_count,
2824 ".D"[rcu_segcblist_ready_cbs(rsclp)],
2825 ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
2826 rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
2827 ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
2828 rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
2829 ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
2830 ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
2831 rcu_segcblist_n_cbs(&rdp->cblist),
2832 rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
2833 rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
2834 show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
2835
2836 /* It is OK for GP kthreads to have GP state. */
2837 if (rdp->nocb_gp_rdp == rdp)
2838 return;
2839
2840 waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
2841 wassleep = swait_active(&rdp->nocb_gp_wq);
2842 if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
2843 return; /* Nothing untoward. */
2844
2845 pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
2846 "lL"[waslocked],
2847 "dD"[!!rdp->nocb_defer_wakeup],
2848 "sS"[!!rdp->nocb_gp_sleep],
2849 ".W"[wassleep]);
2850 }
2851
2852 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
2853
2854 /* No ->nocb_lock to acquire. */
2855 static void rcu_nocb_lock(struct rcu_data *rdp)
2856 {
2857 }
2858
2859 /* No ->nocb_lock to release. */
2860 static void rcu_nocb_unlock(struct rcu_data *rdp)
2861 {
2862 }
2863
2864 /* No ->nocb_lock to release.
*/ 2865 static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, 2866 unsigned long flags) 2867 { 2868 local_irq_restore(flags); 2869 } 2870 2871 /* Lockdep check that ->cblist may be safely accessed. */ 2872 static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp) 2873 { 2874 lockdep_assert_irqs_disabled(); 2875 } 2876 2877 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) 2878 { 2879 } 2880 2881 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) 2882 { 2883 return NULL; 2884 } 2885 2886 static void rcu_init_one_nocb(struct rcu_node *rnp) 2887 { 2888 } 2889 2890 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, 2891 unsigned long j) 2892 { 2893 return true; 2894 } 2895 2896 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, 2897 bool *was_alldone, unsigned long flags) 2898 { 2899 return false; 2900 } 2901 2902 static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, 2903 unsigned long flags) 2904 { 2905 WARN_ON_ONCE(1); /* Should be dead code! */ 2906 } 2907 2908 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2909 { 2910 } 2911 2912 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level) 2913 { 2914 return false; 2915 } 2916 2917 static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) 2918 { 2919 return false; 2920 } 2921 2922 static void rcu_spawn_cpu_nocb_kthread(int cpu) 2923 { 2924 } 2925 2926 static void __init rcu_spawn_nocb_kthreads(void) 2927 { 2928 } 2929 2930 static void show_rcu_nocb_state(struct rcu_data *rdp) 2931 { 2932 } 2933 2934 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 2935 2936 /* 2937 * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the 2938 * grace-period kthread will do force_quiescent_state() processing? 2939 * The idea is to avoid waking up RCU core processing on such a 2940 * CPU unless the grace period has extended for too long. 2941 * 2942 * This code relies on the fact that all NO_HZ_FULL CPUs are also 2943 * CONFIG_RCU_NOCB_CPU CPUs. 2944 */ 2945 static bool rcu_nohz_full_cpu(void) 2946 { 2947 #ifdef CONFIG_NO_HZ_FULL 2948 if (tick_nohz_full_cpu(smp_processor_id()) && 2949 (!rcu_gp_in_progress() || 2950 time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) 2951 return true; 2952 #endif /* #ifdef CONFIG_NO_HZ_FULL */ 2953 return false; 2954 } 2955 2956 /* 2957 * Bind the RCU grace-period kthreads to the housekeeping CPU. 2958 */ 2959 static void rcu_bind_gp_kthread(void) 2960 { 2961 if (!tick_nohz_full_enabled()) 2962 return; 2963 housekeeping_affine(current, HK_FLAG_RCU); 2964 } 2965 2966 /* Record the current task on dyntick-idle entry. */ 2967 static void noinstr rcu_dynticks_task_enter(void) 2968 { 2969 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) 2970 WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); 2971 #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ 2972 } 2973 2974 /* Record no current task on dyntick-idle exit. */ 2975 static void noinstr rcu_dynticks_task_exit(void) 2976 { 2977 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) 2978 WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); 2979 #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ 2980 } 2981 2982 /* Turn on heavyweight RCU tasks trace readers on idle/user entry. 
*/ 2983 static void rcu_dynticks_task_trace_enter(void) 2984 { 2985 #ifdef CONFIG_TASKS_RCU_TRACE 2986 if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) 2987 current->trc_reader_special.b.need_mb = true; 2988 #endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ 2989 } 2990 2991 /* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ 2992 static void rcu_dynticks_task_trace_exit(void) 2993 { 2994 #ifdef CONFIG_TASKS_RCU_TRACE 2995 if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) 2996 current->trc_reader_special.b.need_mb = false; 2997 #endif /* #ifdef CONFIG_TASKS_RCU_TRACE */ 2998 } 2999
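/*
 * Illustrative sketch (not built): one way a caller might use the
 * runtime (de-)offloading interface exported earlier in this file.
 * The function name and the pr_err() wording are hypothetical; only
 * rcu_nocb_cpu_offload() and rcu_nocb_cpu_deoffload() come from this
 * file.  Note that, per rcu_nocb_rdp_offload(), offloading currently
 * succeeds only for CPUs that were already offloaded at boot, e.g.
 * via the rcu_nocbs= kernel parameter.
 */
#if 0
static int example_toggle_rcu_nocb(int cpu, bool offload)
{
	int ret;

	ret = offload ? rcu_nocb_cpu_offload(cpu)
		      : rcu_nocb_cpu_deoffload(cpu);
	if (ret)
		pr_err("rcu_nocb: toggling CPU %d (offload=%d) failed: %d\n",
		       cpu, offload, ret);
	return ret;
}
#endif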