// SPDX-License-Identifier: GPL-2.0+
/*
 * Sleepable Read-Copy Update mechanism for mutual exclusion.
 *
 * Copyright (C) IBM Corporation, 2006
 * Copyright (C) Fujitsu, 2012
 *
 * Authors: Paul McKenney <paulmck@linux.ibm.com>
 *	    Lai Jiangshan <laijs@cn.fujitsu.com>
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *		Documentation/RCU/ *.txt
 *
 */

#define pr_fmt(fmt) "rcu: " fmt

#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/srcu.h>

#include "rcu.h"
#include "rcu_segcblist.h"

/* Holdoff in nanoseconds for auto-expediting. */
#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000)
static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF;
module_param(exp_holdoff, ulong, 0444);

/* Overflow-check frequency.  N bits roughly says every 2**N grace periods. */
static ulong counter_wrap_check = (ULONG_MAX >> 2);
module_param(counter_wrap_check, ulong, 0444);

/*
 * Control conversion to SRCU_SIZE_BIG:
 *    0: Don't convert at all.
 *    1: Convert at init_srcu_struct() time.
 *    2: Convert when rcutorture invokes srcu_torture_stats_print().
 *    3: Decide at boot time based on system shape (default).
 * 0x1x: Convert when excessive contention encountered.
 */
#define SRCU_SIZING_NONE	0
#define SRCU_SIZING_INIT	1
#define SRCU_SIZING_TORTURE	2
#define SRCU_SIZING_AUTO	3
#define SRCU_SIZING_CONTEND	0x10
#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x)
#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE))
#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT))
#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE))
#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND)
static int convert_to_big = SRCU_SIZING_AUTO;
module_param(convert_to_big, int, 0444);

/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */
static int big_cpu_lim __read_mostly = 128;
module_param(big_cpu_lim, int, 0444);

/* Contention events per jiffy to initiate transition to big. */
static int small_contention_lim __read_mostly = 100;
module_param(small_contention_lim, int, 0444);

/* Early-boot callback-management, so early that no lock is required! */
static LIST_HEAD(srcu_boot_list);
static bool __read_mostly srcu_init_done;

static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay);
static void process_srcu(struct work_struct *work);
static void srcu_delay_timer(struct timer_list *t);

/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
#define spin_lock_rcu_node(p)						\
do {									\
	spin_lock(&ACCESS_PRIVATE(p, lock));				\
	smp_mb__after_unlock_lock();					\
} while (0)

#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))

#define spin_lock_irq_rcu_node(p)					\
do {									\
	spin_lock_irq(&ACCESS_PRIVATE(p, lock));			\
	smp_mb__after_unlock_lock();					\
} while (0)

#define spin_unlock_irq_rcu_node(p)					\
	spin_unlock_irq(&ACCESS_PRIVATE(p, lock))

#define spin_lock_irqsave_rcu_node(p, flags)				\
do {									\
	spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags);		\
	smp_mb__after_unlock_lock();					\
} while (0)

#define spin_trylock_irqsave_rcu_node(p, flags)				\
({									\
	bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
									\
	if (___locked)							\
		smp_mb__after_unlock_lock();				\
	___locked;							\
})

#define spin_unlock_irqrestore_rcu_node(p, flags)			\
	spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)		\

/*
 * Initialize SRCU per-CPU data.  Note that statically allocated
 * srcu_struct structures might already have srcu_read_lock() and
 * srcu_read_unlock() running against them.  So if the is_static parameter
 * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
 */
static void init_srcu_struct_data(struct srcu_struct *ssp)
{
	int cpu;
	struct srcu_data *sdp;

	/*
	 * Initialize the per-CPU srcu_data array, which feeds into the
	 * leaves of the srcu_node tree.
	 */
	WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) !=
		     ARRAY_SIZE(sdp->srcu_unlock_count));
	for_each_possible_cpu(cpu) {
		sdp = per_cpu_ptr(ssp->sda, cpu);
		spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
		rcu_segcblist_init(&sdp->srcu_cblist);
		sdp->srcu_cblist_invoking = false;
		sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
		sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
		sdp->mynode = NULL;
		sdp->cpu = cpu;
		INIT_WORK(&sdp->work, srcu_invoke_callbacks);
		timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
		sdp->ssp = ssp;
	}
}

/* Invalid seq state, used during snp node initialization */
#define SRCU_SNP_INIT_SEQ		0x2

/*
 * Check whether sequence number corresponding to snp node,
 * is invalid.
 */
static inline bool srcu_invl_snp_seq(unsigned long s)
{
	return rcu_seq_state(s) == SRCU_SNP_INIT_SEQ;
}

/*
 * Allocate and initialize SRCU combining tree.  Returns @true if
 * allocation succeeded and @false otherwise.
 */
static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
{
	int cpu;
	int i;
	int level = 0;
	int levelspread[RCU_NUM_LVLS];
	struct srcu_data *sdp;
	struct srcu_node *snp;
	struct srcu_node *snp_first;

	/* Initialize geometry if it has not already been initialized. */
	rcu_init_geometry();
	ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags);
	if (!ssp->node)
		return false;

	/* Work out the overall tree geometry. */
	ssp->level[0] = &ssp->node[0];
	for (i = 1; i < rcu_num_lvls; i++)
		ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1];
	rcu_init_levelspread(levelspread, num_rcu_lvl);

	/* Each pass through this loop initializes one srcu_node structure. */
	srcu_for_each_node_breadth_first(ssp, snp) {
		spin_lock_init(&ACCESS_PRIVATE(snp, lock));
		WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) !=
			     ARRAY_SIZE(snp->srcu_data_have_cbs));
		for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
			snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ;
			snp->srcu_data_have_cbs[i] = 0;
		}
		snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
		snp->grplo = -1;
		snp->grphi = -1;
		if (snp == &ssp->node[0]) {
			/* Root node, special case. */
			snp->srcu_parent = NULL;
			continue;
		}

		/* Non-root node. */
		if (snp == ssp->level[level + 1])
			level++;
		snp->srcu_parent = ssp->level[level - 1] +
				   (snp - ssp->level[level]) /
				   levelspread[level - 1];
	}

	/*
	 * Initialize the per-CPU srcu_data array, which feeds into the
	 * leaves of the srcu_node tree.
	 */
	level = rcu_num_lvls - 1;
	snp_first = ssp->level[level];
	for_each_possible_cpu(cpu) {
		sdp = per_cpu_ptr(ssp->sda, cpu);
		sdp->mynode = &snp_first[cpu / levelspread[level]];
		for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
			if (snp->grplo < 0)
				snp->grplo = cpu;
			snp->grphi = cpu;
		}
		sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
	}
	smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
	return true;
}

/*
 * Initialize non-compile-time initialized fields, including the
 * associated srcu_node and srcu_data structures.  The is_static parameter
 * tells us that ->sda has already been wired up to srcu_data.
 */
static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
{
	ssp->srcu_size_state = SRCU_SIZE_SMALL;
	ssp->node = NULL;
	mutex_init(&ssp->srcu_cb_mutex);
	mutex_init(&ssp->srcu_gp_mutex);
	ssp->srcu_idx = 0;
	ssp->srcu_gp_seq = 0;
	ssp->srcu_barrier_seq = 0;
	mutex_init(&ssp->srcu_barrier_mutex);
	atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
	INIT_DELAYED_WORK(&ssp->work, process_srcu);
	ssp->sda_is_static = is_static;
	if (!is_static)
		ssp->sda = alloc_percpu(struct srcu_data);
	if (!ssp->sda)
		return -ENOMEM;
	init_srcu_struct_data(ssp);
	ssp->srcu_gp_seq_needed_exp = 0;
	ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
	if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
			if (!ssp->sda_is_static) {
				free_percpu(ssp->sda);
				ssp->sda = NULL;
				return -ENOMEM;
			}
		} else {
			WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG);
		}
	}
	smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
	return 0;
}

#ifdef CONFIG_DEBUG_LOCK_ALLOC

int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
		       struct lock_class_key *key)
{
	/* Don't re-initialize a lock while it is held. */
	debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
	lockdep_init_map(&ssp->dep_map, name, key, 0);
	spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
	return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/**
 * init_srcu_struct - initialize a sleep-RCU structure
 * @ssp: structure to initialize.
 *
 * Must invoke this on a given srcu_struct before passing that srcu_struct
 * to any other function.  Each srcu_struct represents a separate domain
 * of SRCU protection.
 */
int init_srcu_struct(struct srcu_struct *ssp)
{
	spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
	return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
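
/*
 * Illustrative usage sketch (not compiled here): how a caller typically
 * creates an SRCU domain.  Statically defined domains use DEFINE_SRCU()
 * or DEFINE_STATIC_SRCU() and rely on the first-use initialization done
 * by check_init_srcu_struct() below; dynamically allocated domains pair
 * init_srcu_struct() with cleanup_srcu_struct().  The names my_srcu,
 * my_obj_srcu, my_obj_setup and my_obj_teardown are made up for this
 * example.
 *
 *	DEFINE_STATIC_SRCU(my_srcu);		// static domain, no explicit init
 *
 *	struct srcu_struct my_obj_srcu;		// dynamically initialized domain
 *
 *	static int my_obj_setup(void)
 *	{
 *		return init_srcu_struct(&my_obj_srcu);
 *	}
 *
 *	static void my_obj_teardown(void)
 *	{
 *		// Only after srcu_barrier() if call_srcu() was used.
 *		cleanup_srcu_struct(&my_obj_srcu);
 *	}
 */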

/*
 * Initiate a transition to SRCU_SIZE_BIG with lock held.
 */
static void __srcu_transition_to_big(struct srcu_struct *ssp)
{
	lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
	smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC);
}

/*
 * Initiate an idempotent transition to SRCU_SIZE_BIG.
 */
static void srcu_transition_to_big(struct srcu_struct *ssp)
{
	unsigned long flags;

	/* Double-checked locking on ->srcu_size_state. */
	if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL)
		return;
	spin_lock_irqsave_rcu_node(ssp, flags);
	if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) {
		spin_unlock_irqrestore_rcu_node(ssp, flags);
		return;
	}
	__srcu_transition_to_big(ssp);
	spin_unlock_irqrestore_rcu_node(ssp, flags);
}

/*
 * Check to see if the just-encountered contention event justifies
 * a transition to SRCU_SIZE_BIG.
 */
static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
{
	unsigned long j;

	if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state)
		return;
	j = jiffies;
	if (ssp->srcu_size_jiffies != j) {
		ssp->srcu_size_jiffies = j;
		ssp->srcu_n_lock_retries = 0;
	}
	if (++ssp->srcu_n_lock_retries <= small_contention_lim)
		return;
	__srcu_transition_to_big(ssp);
}

/*
 * Acquire the specified srcu_data structure's ->lock, but check for
 * excessive contention, which results in initiation of a transition
 * to SRCU_SIZE_BIG.  But only if the srcutree.convert_to_big module
 * parameter permits this.
 */
static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
{
	struct srcu_struct *ssp = sdp->ssp;

	if (spin_trylock_irqsave_rcu_node(sdp, *flags))
		return;
	spin_lock_irqsave_rcu_node(ssp, *flags);
	spin_lock_irqsave_check_contention(ssp);
	spin_unlock_irqrestore_rcu_node(ssp, *flags);
	spin_lock_irqsave_rcu_node(sdp, *flags);
}

/*
 * Acquire the specified srcu_struct structure's ->lock, but check for
 * excessive contention, which results in initiation of a transition
 * to SRCU_SIZE_BIG.  But only if the srcutree.convert_to_big module
 * parameter permits this.
 */
static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
{
	if (spin_trylock_irqsave_rcu_node(ssp, *flags))
		return;
	spin_lock_irqsave_rcu_node(ssp, *flags);
	spin_lock_irqsave_check_contention(ssp);
}

/*
 * First-use initialization of statically allocated srcu_struct
 * structure.  Wiring up the combining tree is more than can be
 * done with compile-time initialization, so this check is added
 * to each update-side SRCU primitive.  Use ssp->lock, which -is-
 * compile-time initialized, to resolve races involving multiple
 * CPUs trying to garner first-use privileges.
 */
static void check_init_srcu_struct(struct srcu_struct *ssp)
{
	unsigned long flags;

	/* The smp_load_acquire() pairs with the smp_store_release(). */
	if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/
		return; /* Already initialized. */
	spin_lock_irqsave_rcu_node(ssp, flags);
	if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) {
		spin_unlock_irqrestore_rcu_node(ssp, flags);
		return;
	}
	init_srcu_struct_fields(ssp, true);
	spin_unlock_irqrestore_rcu_node(ssp, flags);
}

/*
 * Returns approximate total of the readers' ->srcu_lock_count[] values
 * for the rank of per-CPU counters specified by idx.
 */
static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx)
{
	int cpu;
	unsigned long sum = 0;

	for_each_possible_cpu(cpu) {
		struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);

		sum += atomic_long_read(&cpuc->srcu_lock_count[idx]);
	}
	return sum;
}

/*
 * Returns approximate total of the readers' ->srcu_unlock_count[] values
 * for the rank of per-CPU counters specified by idx.
 */
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx)
{
	int cpu;
	unsigned long mask = 0;
	unsigned long sum = 0;

	for_each_possible_cpu(cpu) {
		struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);

		sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]);
		if (IS_ENABLED(CONFIG_PROVE_RCU))
			mask = mask | READ_ONCE(cpuc->srcu_nmi_safety);
	}
	WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)),
		  "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp);
	return sum;
}

/*
 * Return true if the number of pre-existing readers is determined to
 * be zero.
 */
static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
{
	unsigned long unlocks;

	unlocks = srcu_readers_unlock_idx(ssp, idx);

	/*
	 * Make sure that a lock is always counted if the corresponding
	 * unlock is counted. Needs to be a smp_mb() as the read side may
	 * contain a read from a variable that is written to before the
	 * synchronize_srcu() in the write side. In this case smp_mb()s
	 * A and B act like the store buffering pattern.
	 *
	 * This smp_mb() also pairs with smp_mb() C to prevent accesses
	 * after the synchronize_srcu() from being executed before the
	 * grace period ends.
	 */
	smp_mb(); /* A */

	/*
	 * If the locks are the same as the unlocks, then there must have
	 * been no readers on this index at some time in between. This does
	 * not mean that there are no more readers, as one could have read
	 * the current index but not have incremented the lock counter yet.
	 *
	 * So suppose that the updater is preempted here for so long
	 * that more than ULONG_MAX non-nested readers come and go in
	 * the meantime.  It turns out that this cannot result in overflow
	 * because if a reader modifies its unlock count after we read it
	 * above, then that reader's next load of ->srcu_idx is guaranteed
	 * to get the new value, which will cause it to operate on the
	 * other bank of counters, where it cannot contribute to the
	 * overflow of these counters.  This means that there is a maximum
	 * of 2*NR_CPUS increments, which cannot overflow given current
	 * systems, especially not on 64-bit systems.
	 *
	 * OK, how about nesting?  This does impose a limit on nesting
	 * of floor(ULONG_MAX/NR_CPUS/2), which should be sufficient,
	 * especially on 64-bit systems.
	 */
	return srcu_readers_lock_idx(ssp, idx) == unlocks;
}

/**
 * srcu_readers_active - returns true if there are readers, and false
 *                       otherwise
 * @ssp: which srcu_struct to count active readers (holding srcu_read_lock).
 *
 * Note that this is not an atomic primitive, and can therefore suffer
 * severe errors when invoked on an active srcu_struct.  That said, it
 * can be useful as an error check at cleanup time.
 */
static bool srcu_readers_active(struct srcu_struct *ssp)
{
	int cpu;
	unsigned long sum = 0;

	for_each_possible_cpu(cpu) {
		struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu);

		sum += atomic_long_read(&cpuc->srcu_lock_count[0]);
		sum += atomic_long_read(&cpuc->srcu_lock_count[1]);
		sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]);
		sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]);
	}
	return sum;
}

/*
 * We use an adaptive strategy for synchronize_srcu() and especially for
 * synchronize_srcu_expedited().  We spin for a fixed time period
 * (defined below, boot time configurable) to allow SRCU readers to exit
 * their read-side critical sections.  If there are still some readers
 * after one jiffy, we repeatedly block for one jiffy time periods.
 * The blocking time is increased as the grace-period age increases,
 * with max blocking time capped at 10 jiffies.
 */
#define SRCU_DEFAULT_RETRY_CHECK_DELAY		5

static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
module_param(srcu_retry_check_delay, ulong, 0444);

#define SRCU_INTERVAL		1		// Base delay if no expedited GPs pending.
#define SRCU_MAX_INTERVAL	10		// Maximum incremental delay from slow readers.

#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO	3UL	// Lowmark on default per-GP-phase
							// no-delay instances.
#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI	1000UL	// Highmark on default per-GP-phase
							// no-delay instances.

#define SRCU_UL_CLAMP_LO(val, low)	((val) > (low) ? (val) : (low))
#define SRCU_UL_CLAMP_HI(val, high)	((val) < (high) ? (val) : (high))
#define SRCU_UL_CLAMP(val, low, high)	SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
// per-GP-phase no-delay instances adjusted to allow non-sleeping poll up to
// one jiffy's duration.  Multiply by 2 to factor in the srcu_get_delay()
// call from process_srcu().
#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED	\
	(2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)

// Maximum per-GP-phase consecutive no-delay instances.
#define SRCU_DEFAULT_MAX_NODELAY_PHASE	\
	SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED,	\
		      SRCU_DEFAULT_MAX_NODELAY_PHASE_LO,	\
		      SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)

static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
module_param(srcu_max_nodelay_phase, ulong, 0444);

// Maximum consecutive no-delay instances.
#define SRCU_DEFAULT_MAX_NODELAY	(SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ?	\
					 SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)

static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
module_param(srcu_max_nodelay, ulong, 0444);

/*
 * Return grace-period delay, zero if there are expedited grace
 * periods pending, SRCU_INTERVAL otherwise.
 */
static unsigned long srcu_get_delay(struct srcu_struct *ssp)
{
	unsigned long gpstart;
	unsigned long j;
	unsigned long jbase = SRCU_INTERVAL;

	if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
		jbase = 0;
	if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) {
		j = jiffies - 1;
		gpstart = READ_ONCE(ssp->srcu_gp_start);
		if (time_after(j, gpstart))
			jbase += j - gpstart;
		if (!jbase) {
			WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
			if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
				jbase = 1;
		}
	}
	return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
}

/**
 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
 * @ssp: structure to clean up.
 *
 * Must invoke this after you are finished using a given srcu_struct that
 * was initialized via init_srcu_struct(), else you leak memory.
 */
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
	int cpu;

	if (WARN_ON(!srcu_get_delay(ssp)))
		return; /* Just leak it! */
	if (WARN_ON(srcu_readers_active(ssp)))
		return; /* Just leak it! */
	flush_delayed_work(&ssp->work);
	for_each_possible_cpu(cpu) {
		struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);

		del_timer_sync(&sdp->delay_work);
		flush_work(&sdp->work);
		if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
			return; /* Forgot srcu_barrier(), so just leak it! */
	}
	if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
	    WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) ||
	    WARN_ON(srcu_readers_active(ssp))) {
		pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
			__func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)),
			rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed);
		return; /* Caller forgot to stop doing call_srcu()? */
	}
	if (!ssp->sda_is_static) {
		free_percpu(ssp->sda);
		ssp->sda = NULL;
	}
	kfree(ssp->node);
	ssp->node = NULL;
	ssp->srcu_size_state = SRCU_SIZE_SMALL;
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);

#ifdef CONFIG_PROVE_RCU
/*
 * Check for consistent NMI safety.
 */
void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe)
{
	int nmi_safe_mask = 1 << nmi_safe;
	int old_nmi_safe_mask;
	struct srcu_data *sdp;

	/* NMI-unsafe use in NMI is a bad sign */
	WARN_ON_ONCE(!nmi_safe && in_nmi());
	sdp = raw_cpu_ptr(ssp->sda);
	old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety);
	if (!old_nmi_safe_mask) {
		WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask);
		return;
	}
	WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask);
}
EXPORT_SYMBOL_GPL(srcu_check_nmi_safety);
#endif /* CONFIG_PROVE_RCU */

/*
 * Counts the new reader in the appropriate per-CPU element of the
 * srcu_struct.
 * Returns an index that must be passed to the matching srcu_read_unlock().
 */
int __srcu_read_lock(struct srcu_struct *ssp)
{
	int idx;

	idx = READ_ONCE(ssp->srcu_idx) & 0x1;
	this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
	smp_mb(); /* B */  /* Avoid leaking the critical section. */
	return idx;
}
EXPORT_SYMBOL_GPL(__srcu_read_lock);
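
/*
 * Illustrative reader-side sketch (not compiled here): srcu_read_lock()
 * and srcu_read_unlock() wrap the __srcu_read_lock()/__srcu_read_unlock()
 * pair above, and srcu_dereference() fetches pointers published by
 * updaters.  The names my_srcu, my_gp, struct my_data and
 * do_something_sleepable() are made up for this example.
 *
 *	int idx;
 *	struct my_data *p;
 *
 *	idx = srcu_read_lock(&my_srcu);
 *	p = srcu_dereference(my_gp, &my_srcu);
 *	if (p)
 *		do_something_sleepable(p);	// sleeping is legal under SRCU
 *	srcu_read_unlock(&my_srcu, idx);	// must pass back the same idx
 */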

/*
 * Removes the count for the old reader from the appropriate per-CPU
 * element of the srcu_struct.  Note that this may well be a different
 * CPU than that which was incremented by the corresponding srcu_read_lock().
 */
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
	smp_mb(); /* C */  /* Avoid leaking the critical section. */
	this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);

#ifdef CONFIG_NEED_SRCU_NMI_SAFE

/*
 * Counts the new reader in the appropriate per-CPU element of the
 * srcu_struct, but in an NMI-safe manner using RMW atomics.
 * Returns an index that must be passed to the matching srcu_read_unlock().
 */
int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
{
	int idx;
	struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);

	idx = READ_ONCE(ssp->srcu_idx) & 0x1;
	atomic_long_inc(&sdp->srcu_lock_count[idx]);
	smp_mb__after_atomic(); /* B */  /* Avoid leaking the critical section. */
	return idx;
}
EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);

/*
 * Removes the count for the old reader from the appropriate per-CPU
 * element of the srcu_struct.  Note that this may well be a different
 * CPU than that which was incremented by the corresponding srcu_read_lock().
 */
void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
{
	struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);

	smp_mb__before_atomic(); /* C */  /* Avoid leaking the critical section. */
	atomic_long_inc(&sdp->srcu_unlock_count[idx]);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);

#endif // CONFIG_NEED_SRCU_NMI_SAFE

/*
 * Start an SRCU grace period.
 */
static void srcu_gp_start(struct srcu_struct *ssp)
{
	struct srcu_data *sdp;
	int state;

	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
		sdp = per_cpu_ptr(ssp->sda, 0);
	else
		sdp = this_cpu_ptr(ssp->sda);
	lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
	WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
	spin_lock_rcu_node(sdp); /* Interrupts already disabled. */
	rcu_segcblist_advance(&sdp->srcu_cblist,
			      rcu_seq_current(&ssp->srcu_gp_seq));
	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
				       rcu_seq_snap(&ssp->srcu_gp_seq));
	spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */
	WRITE_ONCE(ssp->srcu_gp_start, jiffies);
	WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0);
	smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
	rcu_seq_start(&ssp->srcu_gp_seq);
	state = rcu_seq_state(ssp->srcu_gp_seq);
	WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
}


static void srcu_delay_timer(struct timer_list *t)
{
	struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work);

	queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
}

static void srcu_queue_delayed_work_on(struct srcu_data *sdp,
				       unsigned long delay)
{
	if (!delay) {
		queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
		return;
	}

	timer_reduce(&sdp->delay_work, jiffies + delay);
}

/*
 * Schedule callback invocation for the specified srcu_data structure,
 * if possible, on the corresponding CPU.
 */
static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
{
	srcu_queue_delayed_work_on(sdp, delay);
}

/*
 * Schedule callback invocation for all srcu_data structures associated
 * with the specified srcu_node structure that have callbacks for the
 * just-completed grace period, the one corresponding to idx.  If possible,
 * schedule this invocation on the corresponding CPUs.
 */
static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp,
				  unsigned long mask, unsigned long delay)
{
	int cpu;

	for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
		if (!(mask & (1 << (cpu - snp->grplo))))
			continue;
		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
	}
}

/*
 * Note the end of an SRCU grace period.  Initiates callback invocation
 * and starts a new grace period if needed.
 *
 * The ->srcu_cb_mutex acquisition does not protect any data, but
 * instead prevents more than one grace period from starting while we
 * are initiating callback invocation.  This allows the ->srcu_have_cbs[]
 * array to have a finite number of elements.
 */
static void srcu_gp_end(struct srcu_struct *ssp)
{
	unsigned long cbdelay = 1;
	bool cbs;
	bool last_lvl;
	int cpu;
	unsigned long flags;
	unsigned long gpseq;
	int idx;
	unsigned long mask;
	struct srcu_data *sdp;
	unsigned long sgsne;
	struct srcu_node *snp;
	int ss_state;

	/* Prevent more than one additional grace period. */
	mutex_lock(&ssp->srcu_cb_mutex);

	/* End the current grace period. */
	spin_lock_irq_rcu_node(ssp);
	idx = rcu_seq_state(ssp->srcu_gp_seq);
	WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
	if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
		cbdelay = 0;

	WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
	rcu_seq_end(&ssp->srcu_gp_seq);
	gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
		WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq);
	spin_unlock_irq_rcu_node(ssp);
	mutex_unlock(&ssp->srcu_gp_mutex);
	/* A new grace period can start at this point.  But only one. */

	/* Initiate callback invocation as needed. */
	ss_state = smp_load_acquire(&ssp->srcu_size_state);
	if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, 0), cbdelay);
	} else {
		idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
		srcu_for_each_node_breadth_first(ssp, snp) {
			spin_lock_irq_rcu_node(snp);
			cbs = false;
			last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
			if (last_lvl)
				cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq;
			snp->srcu_have_cbs[idx] = gpseq;
			rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
			sgsne = snp->srcu_gp_seq_needed_exp;
			if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq))
				WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
			if (ss_state < SRCU_SIZE_BIG)
				mask = ~0;
			else
				mask = snp->srcu_data_have_cbs[idx];
			snp->srcu_data_have_cbs[idx] = 0;
			spin_unlock_irq_rcu_node(snp);
			if (cbs)
				srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
		}
	}

	/* Occasionally prevent srcu_data counter wrap. */
	if (!(gpseq & counter_wrap_check))
		for_each_possible_cpu(cpu) {
			sdp = per_cpu_ptr(ssp->sda, cpu);
			spin_lock_irqsave_rcu_node(sdp, flags);
			if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
				sdp->srcu_gp_seq_needed = gpseq;
			if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
				sdp->srcu_gp_seq_needed_exp = gpseq;
			spin_unlock_irqrestore_rcu_node(sdp, flags);
		}

	/* Callback initiation done, allow grace periods after next. */
	mutex_unlock(&ssp->srcu_cb_mutex);

	/* Start a new grace period if needed. */
	spin_lock_irq_rcu_node(ssp);
	gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
	if (!rcu_seq_state(gpseq) &&
	    ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) {
		srcu_gp_start(ssp);
		spin_unlock_irq_rcu_node(ssp);
		srcu_reschedule(ssp, 0);
	} else {
		spin_unlock_irq_rcu_node(ssp);
	}

	/* Transition to big if needed. */
	if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) {
		if (ss_state == SRCU_SIZE_ALLOC)
			init_srcu_struct_nodes(ssp, GFP_KERNEL);
		else
			smp_store_release(&ssp->srcu_size_state, ss_state + 1);
	}
}

/*
 * Funnel-locking scheme to scalably mediate many concurrent expedited
 * grace-period requests.  This function is invoked for the first known
 * expedited request for a grace period that has already been requested,
 * but without expediting.  To start a completely new grace period,
 * whether expedited or not, use srcu_funnel_gp_start() instead.
 */
static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp,
				  unsigned long s)
{
	unsigned long flags;
	unsigned long sgsne;

	if (snp)
		for (; snp != NULL; snp = snp->srcu_parent) {
			sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
			if (rcu_seq_done(&ssp->srcu_gp_seq, s) ||
			    (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
				return;
			spin_lock_irqsave_rcu_node(snp, flags);
			sgsne = snp->srcu_gp_seq_needed_exp;
			if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) {
				spin_unlock_irqrestore_rcu_node(snp, flags);
				return;
			}
			WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
			spin_unlock_irqrestore_rcu_node(snp, flags);
		}
	spin_lock_irqsave_ssp_contention(ssp, &flags);
	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
		WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
	spin_unlock_irqrestore_rcu_node(ssp, flags);
}

/*
 * Funnel-locking scheme to scalably mediate many concurrent grace-period
 * requests.  The winner has to do the work of actually starting grace
 * period s.  Losers must either ensure that their desired grace-period
 * number is recorded on at least their leaf srcu_node structure, or they
 * must take steps to invoke their own callbacks.
 *
 * Note that this function also does the work of srcu_funnel_exp_start(),
 * in some cases by directly invoking it.
 */
static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
				 unsigned long s, bool do_norm)
{
	unsigned long flags;
	int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
	unsigned long sgsne;
	struct srcu_node *snp;
	struct srcu_node *snp_leaf;
	unsigned long snp_seq;

	/* Ensure that snp node tree is fully initialized before traversing it */
	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
		snp_leaf = NULL;
	else
		snp_leaf = sdp->mynode;

	if (snp_leaf)
		/* Each pass through the loop does one level of the srcu_node tree. */
		for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
			if (rcu_seq_done(&ssp->srcu_gp_seq, s) && snp != snp_leaf)
				return; /* GP already done and CBs recorded. */
			spin_lock_irqsave_rcu_node(snp, flags);
			snp_seq = snp->srcu_have_cbs[idx];
			if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) {
				if (snp == snp_leaf && snp_seq == s)
					snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
				spin_unlock_irqrestore_rcu_node(snp, flags);
				if (snp == snp_leaf && snp_seq != s) {
					srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0);
					return;
				}
				if (!do_norm)
					srcu_funnel_exp_start(ssp, snp, s);
				return;
			}
			snp->srcu_have_cbs[idx] = s;
			if (snp == snp_leaf)
				snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
			sgsne = snp->srcu_gp_seq_needed_exp;
			if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s)))
				WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
			spin_unlock_irqrestore_rcu_node(snp, flags);
		}

	/* Top of tree, must ensure the grace period will be started. */
	spin_lock_irqsave_ssp_contention(ssp, &flags);
	if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) {
		/*
		 * Record need for grace period s.  Pair with load
		 * acquire setting up for initialization.
		 */
		smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/
	}
	if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
		WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);

	/* If grace period not already done and none in progress, start it. */
	if (!rcu_seq_done(&ssp->srcu_gp_seq, s) &&
	    rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
		WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
		srcu_gp_start(ssp);

		// And how can that list_add() in the "else" clause
		// possibly be safe for concurrent execution?  Well,
		// it isn't.  And it does not have to be.  After all, it
		// can only be executed during early boot when there is only
		// the one boot CPU running with interrupts still disabled.
		if (likely(srcu_init_done))
			queue_delayed_work(rcu_gp_wq, &ssp->work,
					   !!srcu_get_delay(ssp));
		else if (list_empty(&ssp->work.work.entry))
			list_add(&ssp->work.work.entry, &srcu_boot_list);
	}
	spin_unlock_irqrestore_rcu_node(ssp, flags);
}

/*
 * Wait until all readers counted by array index idx complete, but
 * loop an additional time if there is an expedited grace period pending.
 * The caller must ensure that ->srcu_idx is not changed while checking.
 */
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
	unsigned long curdelay;

	curdelay = !srcu_get_delay(ssp);

	for (;;) {
		if (srcu_readers_active_idx_check(ssp, idx))
			return true;
		if ((--trycount + curdelay) <= 0)
			return false;
		udelay(srcu_retry_check_delay);
	}
}

/*
 * Increment the ->srcu_idx counter so that future SRCU readers will
 * use the other rank of the ->srcu_(un)lock_count[] arrays.  This allows
 * us to wait for pre-existing readers in a starvation-free manner.
 */
static void srcu_flip(struct srcu_struct *ssp)
{
	/*
	 * Ensure that if this updater saw a given reader's increment
	 * from __srcu_read_lock(), that reader was using an old value
	 * of ->srcu_idx.  Also ensure that if a given reader sees the
	 * new value of ->srcu_idx, this updater's earlier scans cannot
	 * have seen that reader's increments (which is OK, because this
	 * grace period need not wait on that reader).
	 */
	smp_mb(); /* E */  /* Pairs with B and C. */

	WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);

	/*
	 * Ensure that if the updater misses an __srcu_read_unlock()
	 * increment, that task's next __srcu_read_lock() will see the
	 * above counter update.  Note that both this memory barrier
	 * and the one in srcu_readers_active_idx_check() provide the
	 * guarantee for __srcu_read_lock().
	 */
	smp_mb(); /* D */  /* Pairs with C. */
}

/*
 * If SRCU is likely idle, return true, otherwise return false.
 *
 * Note that it is OK for several current from-idle requests for a new
 * grace period to specify expediting because they will all end
 * up requesting the same grace period anyhow.  So no loss.
 *
 * Note also that if any CPU (including the current one) is still invoking
 * callbacks, this function will nevertheless say "idle".  This is not
 * ideal, but the overhead of checking all CPUs' callback lists is even
 * less ideal, especially on large systems.  Furthermore, the wakeup
 * can happen before the callback is fully removed, so we have no choice
 * but to accept this type of error.
 *
 * This function is also subject to counter-wrap errors, but let's face
 * it, if this function was preempted for enough time for the counters
 * to wrap, it really doesn't matter whether or not we expedite the grace
 * period.  The extra overhead of a needlessly expedited grace period is
 * negligible when amortized over that time period, and the extra latency
 * of a needlessly non-expedited grace period is similarly negligible.
 */
static bool srcu_might_be_idle(struct srcu_struct *ssp)
{
	unsigned long curseq;
	unsigned long flags;
	struct srcu_data *sdp;
	unsigned long t;
	unsigned long tlast;

	check_init_srcu_struct(ssp);
	/* If the local srcu_data structure has callbacks, not idle. */
	sdp = raw_cpu_ptr(ssp->sda);
	spin_lock_irqsave_rcu_node(sdp, flags);
	if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
		spin_unlock_irqrestore_rcu_node(sdp, flags);
		return false; /* Callbacks already present, so not idle. */
	}
	spin_unlock_irqrestore_rcu_node(sdp, flags);

	/*
	 * No local callbacks, so probabilistically probe global state.
	 * Exact information would require acquiring locks, which would
	 * kill scalability, hence the probabilistic nature of the probe.
	 */

	/* First, see if enough time has passed since the last GP. */
	t = ktime_get_mono_fast_ns();
	tlast = READ_ONCE(ssp->srcu_last_gp_end);
	if (exp_holdoff == 0 ||
	    time_in_range_open(t, tlast, tlast + exp_holdoff))
		return false; /* Too soon after last GP. */

	/* Next, check for probable idleness. */
	curseq = rcu_seq_current(&ssp->srcu_gp_seq);
	smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
	if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed)))
		return false; /* Grace period in progress, so not idle. */
	smp_mb(); /* Order ->srcu_gp_seq with prior access. */
	if (curseq != rcu_seq_current(&ssp->srcu_gp_seq))
		return false; /* GP # changed, so not idle. */
	return true; /* With reasonable probability, idle! */
}

/*
 * SRCU callback function to leak a callback.
 */
static void srcu_leak_callback(struct rcu_head *rhp)
{
}

/*
 * Start an SRCU grace period, and also queue the callback if non-NULL.
 */
static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
					     struct rcu_head *rhp, bool do_norm)
{
	unsigned long flags;
	int idx;
	bool needexp = false;
	bool needgp = false;
	unsigned long s;
	struct srcu_data *sdp;
	struct srcu_node *sdp_mynode;
	int ss_state;

	check_init_srcu_struct(ssp);
	/*
	 * While starting a new grace period, make sure we are in an
	 * SRCU read-side critical section so that the grace-period
	 * sequence number cannot wrap around in the meantime.
	 */
	idx = __srcu_read_lock_nmisafe(ssp);
	ss_state = smp_load_acquire(&ssp->srcu_size_state);
	if (ss_state < SRCU_SIZE_WAIT_CALL)
		sdp = per_cpu_ptr(ssp->sda, 0);
	else
		sdp = raw_cpu_ptr(ssp->sda);
	spin_lock_irqsave_sdp_contention(sdp, &flags);
	if (rhp)
		rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
	rcu_segcblist_advance(&sdp->srcu_cblist,
			      rcu_seq_current(&ssp->srcu_gp_seq));
	s = rcu_seq_snap(&ssp->srcu_gp_seq);
	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
	if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
		sdp->srcu_gp_seq_needed = s;
		needgp = true;
	}
	if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
		sdp->srcu_gp_seq_needed_exp = s;
		needexp = true;
	}
	spin_unlock_irqrestore_rcu_node(sdp, flags);

	/* Ensure that snp node tree is fully initialized before traversing it */
	if (ss_state < SRCU_SIZE_WAIT_BARRIER)
		sdp_mynode = NULL;
	else
		sdp_mynode = sdp->mynode;

	if (needgp)
		srcu_funnel_gp_start(ssp, sdp, s, do_norm);
	else if (needexp)
		srcu_funnel_exp_start(ssp, sdp_mynode, s);
	__srcu_read_unlock_nmisafe(ssp, idx);
	return s;
}

/*
 * Enqueue an SRCU callback on the srcu_data structure associated with
 * the current CPU and the specified srcu_struct structure, initiating
 * grace-period processing if it is not already running.
 *
 * Note that all CPUs must agree that the grace period extended beyond
 * all pre-existing SRCU read-side critical sections.  On systems with
 * more than one CPU, this means that when "func()" is invoked, each CPU
 * is guaranteed to have executed a full memory barrier since the end of
 * its last corresponding SRCU read-side critical section whose beginning
 * preceded the call to call_srcu().  It also means that each CPU executing
 * an SRCU read-side critical section that continues beyond the start of
 * "func()" must have executed a memory barrier after the call_srcu()
 * but before the beginning of that SRCU read-side critical section.
 * Note that these guarantees include CPUs that are offline, idle, or
 * executing in user mode, as well as CPUs that are executing in the kernel.
 *
 * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the
 * resulting SRCU callback function "func()", then both CPU A and CPU
 * B are guaranteed to execute a full memory barrier during the time
 * interval between the call to call_srcu() and the invocation of "func()".
 * This guarantee applies even if CPU A and CPU B are the same CPU (but
 * again only if the system has more than one CPU).
 *
 * Of course, these guarantees apply only for invocations of call_srcu(),
 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
 * srcu_struct structure.
 */
static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
			rcu_callback_t func, bool do_norm)
{
	if (debug_rcu_head_queue(rhp)) {
		/* Probable double call_srcu(), so leak the callback. */
		WRITE_ONCE(rhp->func, srcu_leak_callback);
		WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n");
		return;
	}
	rhp->func = func;
	(void)srcu_gp_start_if_needed(ssp, rhp, do_norm);
}

/**
 * call_srcu() - Queue a callback for invocation after an SRCU grace period
 * @ssp: srcu_struct on which to queue the callback
 * @rhp: structure to be used for queueing the SRCU callback.
 * @func: function to be invoked after the SRCU grace period
 *
 * The callback function will be invoked some time after a full SRCU
 * grace period elapses, in other words after all pre-existing SRCU
 * read-side critical sections have completed.  However, the callback
 * function might well execute concurrently with other SRCU read-side
 * critical sections that started after call_srcu() was invoked.  SRCU
 * read-side critical sections are delimited by srcu_read_lock() and
 * srcu_read_unlock(), and may be nested.
 *
 * The callback will be invoked from process context, but must nevertheless
 * be fast and must not block.
 */
void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
	       rcu_callback_t func)
{
	__call_srcu(ssp, rhp, func, true);
}
EXPORT_SYMBOL_GPL(call_srcu);
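
/*
 * Illustrative call_srcu() sketch (not compiled here): the rcu_head is
 * embedded in the object being freed, and the callback recovers the
 * object with container_of().  The names struct my_data, my_gp, my_srcu
 * and my_free_cb are made up for this example.
 *
 *	struct my_data {
 *		struct rcu_head rh;
 *		...
 *	};
 *
 *	static void my_free_cb(struct rcu_head *rhp)
 *	{
 *		kfree(container_of(rhp, struct my_data, rh));
 *	}
 *
 *	// Updater side, with old object p no longer reachable by new readers:
 *	rcu_assign_pointer(my_gp, NULL);
 *	call_srcu(&my_srcu, &p->rh, my_free_cb);
 */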

/*
 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
 */
static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
{
	struct rcu_synchronize rcu;

	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
			 lock_is_held(&rcu_bh_lock_map) ||
			 lock_is_held(&rcu_lock_map) ||
			 lock_is_held(&rcu_sched_lock_map),
			 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");

	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
		return;
	might_sleep();
	check_init_srcu_struct(ssp);
	init_completion(&rcu.completion);
	init_rcu_head_on_stack(&rcu.head);
	__call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm);
	wait_for_completion(&rcu.completion);
	destroy_rcu_head_on_stack(&rcu.head);

	/*
	 * Make sure that later code is ordered after the SRCU grace
	 * period.  This pairs with the spin_lock_irq_rcu_node()
	 * in srcu_invoke_callbacks().  Unlike Tree RCU, this is needed
	 * because the current CPU might have been totally uninvolved with
	 * (and thus unordered against) that grace period.
	 */
	smp_mb();
}

/**
 * synchronize_srcu_expedited - Brute-force SRCU grace period
 * @ssp: srcu_struct with which to synchronize.
 *
 * Wait for an SRCU grace period to elapse, but be more aggressive about
 * spinning rather than blocking when waiting.
 *
 * Note that synchronize_srcu_expedited() has the same deadlock and
 * memory-ordering properties as does synchronize_srcu().
 */
void synchronize_srcu_expedited(struct srcu_struct *ssp)
{
	__synchronize_srcu(ssp, rcu_gp_is_normal());
}
EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);

/**
 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
 * @ssp: srcu_struct with which to synchronize.
 *
 * Wait for the count to drain to zero of both indexes.  To avoid the
 * possible starvation of synchronize_srcu(), it waits for the count of
 * the index=((->srcu_idx & 1) ^ 1) to drain to zero first, and then
 * flips ->srcu_idx and waits for the count of the other index.
 *
 * Can block; must be called from process context.
 *
 * Note that it is illegal to call synchronize_srcu() from the corresponding
 * SRCU read-side critical section; doing so will result in deadlock.
 * However, it is perfectly legal to call synchronize_srcu() on one
 * srcu_struct from some other srcu_struct's read-side critical section,
 * as long as the resulting graph of srcu_structs is acyclic.
 *
 * There are memory-ordering constraints implied by synchronize_srcu().
 * On systems with more than one CPU, when synchronize_srcu() returns,
 * each CPU is guaranteed to have executed a full memory barrier since
 * the end of its last corresponding SRCU read-side critical section
 * whose beginning preceded the call to synchronize_srcu().  In addition,
 * each CPU having an SRCU read-side critical section that extends beyond
 * the return from synchronize_srcu() is guaranteed to have executed a
 * full memory barrier after the beginning of synchronize_srcu() and before
 * the beginning of that SRCU read-side critical section.  Note that these
 * guarantees include CPUs that are offline, idle, or executing in user mode,
 * as well as CPUs that are executing in the kernel.
 *
 * Furthermore, if CPU A invoked synchronize_srcu(), which returned
 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
 * to have executed a full memory barrier during the execution of
 * synchronize_srcu().  This guarantee applies even if CPU A and CPU B
 * are the same CPU, but again only if the system has more than one CPU.
 *
 * Of course, these memory-ordering guarantees apply only when
 * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
 * passed the same srcu_struct structure.
 *
 * Implementation of these memory-ordering guarantees is similar to
 * that of synchronize_rcu().
 *
 * If SRCU is likely idle, expedite the first request.  This semantic
 * was provided by Classic SRCU, and is relied upon by its users, so TREE
 * SRCU must also provide it.  Note that detecting idleness is heuristic
 * and subject to both false positives and negatives.
 */
void synchronize_srcu(struct srcu_struct *ssp)
{
	if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited())
		synchronize_srcu_expedited(ssp);
	else
		__synchronize_srcu(ssp, true);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
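
/*
 * Illustrative synchronous-update sketch (not compiled here): unpublish
 * the old pointer, wait for all pre-existing readers of the domain, then
 * free.  The names my_gp, my_lock, my_srcu, old and new are made up for
 * this example.
 *
 *	spin_lock(&my_lock);
 *	old = rcu_dereference_protected(my_gp, lockdep_is_held(&my_lock));
 *	rcu_assign_pointer(my_gp, new);
 *	spin_unlock(&my_lock);
 *
 *	synchronize_srcu(&my_srcu);	// all readers that could see "old" are done
 *	kfree(old);
 */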

/**
 * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
 * @ssp: srcu_struct to provide cookie for.
 *
 * This function returns a cookie that can be passed to
 * poll_state_synchronize_srcu(), which will return true if a full grace
 * period has elapsed in the meantime.  It is the caller's responsibility
 * to make sure that grace period happens, for example, by invoking
 * call_srcu() after return from get_state_synchronize_srcu().
 */
unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
{
	// Any prior manipulation of SRCU-protected data must happen
	// before the load from ->srcu_gp_seq.
	smp_mb();
	return rcu_seq_snap(&ssp->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);

/**
 * start_poll_synchronize_srcu - Provide cookie and start grace period
 * @ssp: srcu_struct to provide cookie for.
 *
 * This function returns a cookie that can be passed to
 * poll_state_synchronize_srcu(), which will return true if a full grace
 * period has elapsed in the meantime.  Unlike get_state_synchronize_srcu(),
 * this function also ensures that any needed SRCU grace period will be
 * started.  This convenience does come at a cost in terms of CPU overhead.
 */
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
{
	return srcu_gp_start_if_needed(ssp, NULL, true);
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);

/**
 * poll_state_synchronize_srcu - Has cookie's grace period ended?
 * @ssp: srcu_struct to provide cookie for.
 * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu().
 *
 * This function takes the cookie that was returned from either
 * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and
 * returns @true if an SRCU grace period elapsed since the time that the
 * cookie was created.
 *
 * Because cookies are finite in size, wrapping/overflow is possible.
 * This is more pronounced on 32-bit systems where cookies are 32 bits,
 * where in theory wrapping could happen in about 14 hours assuming
 * 25-microsecond expedited SRCU grace periods.  However, a more likely
 * overflow lower bound is on the order of 24 days in the case of
 * one-millisecond SRCU grace periods.  Of course, wrapping in a 64-bit
 * system requires geologic timespans, as in more than seven million years
 * even for expedited SRCU grace periods.
 *
 * Wrapping/overflow is much more of an issue for CONFIG_SMP=n systems
 * that also have CONFIG_PREEMPTION=n, which selects Tiny SRCU.  This uses
 * a 16-bit cookie, which rcutorture routinely wraps in a matter of a
 * few minutes.  If this proves to be a problem, this counter will be
 * expanded to the same size as for Tree SRCU.
 */
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
	if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie))
		return false;
	// Ensure that the end of the SRCU grace period happens before
	// any subsequent code that the caller might execute.
	smp_mb(); // ^^^
	return true;
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
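
/*
 * Illustrative polled-grace-period sketch (not compiled here): grab a
 * cookie while ensuring a grace period is underway, then poll it later
 * without blocking.  The names my_srcu, my_cookie, old and defer_free()
 * are made up for this example; get_state_synchronize_srcu() could be
 * used instead when the caller will start the grace period itself.
 *
 *	unsigned long my_cookie;
 *
 *	my_cookie = start_poll_synchronize_srcu(&my_srcu);
 *	...
 *	if (poll_state_synchronize_srcu(&my_srcu, my_cookie))
 *		kfree(old);		// grace period has elapsed
 *	else
 *		defer_free(old);	// check again later, e.g. from a timer
 */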

/*
 * Callback function for srcu_barrier() use.
 */
static void srcu_barrier_cb(struct rcu_head *rhp)
{
	struct srcu_data *sdp;
	struct srcu_struct *ssp;

	sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
	ssp = sdp->ssp;
	if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
		complete(&ssp->srcu_barrier_completion);
}

/*
 * Enqueue an srcu_barrier() callback on the specified srcu_data
 * structure's ->cblist, but only if that ->cblist already has at least one
 * callback enqueued.  Note that if a CPU already has callbacks enqueued,
 * it must have already registered the need for a future grace period,
 * so all we need do is enqueue a callback that will use the same grace
 * period as the last callback already in the queue.
 */
static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
{
	spin_lock_irq_rcu_node(sdp);
	atomic_inc(&ssp->srcu_barrier_cpu_cnt);
	sdp->srcu_barrier_head.func = srcu_barrier_cb;
	debug_rcu_head_queue(&sdp->srcu_barrier_head);
	if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
				   &sdp->srcu_barrier_head)) {
		debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
		atomic_dec(&ssp->srcu_barrier_cpu_cnt);
	}
	spin_unlock_irq_rcu_node(sdp);
}

/**
 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
 * @ssp: srcu_struct on which to wait for in-flight callbacks.
 */
void srcu_barrier(struct srcu_struct *ssp)
{
	int cpu;
	int idx;
	unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);

	check_init_srcu_struct(ssp);
	mutex_lock(&ssp->srcu_barrier_mutex);
	if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) {
		smp_mb(); /* Force ordering following return. */
		mutex_unlock(&ssp->srcu_barrier_mutex);
		return; /* Someone else did our work for us. */
	}
	rcu_seq_start(&ssp->srcu_barrier_seq);
	init_completion(&ssp->srcu_barrier_completion);

	/* Initial count prevents reaching zero until all CBs are posted. */
	atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);

	idx = __srcu_read_lock_nmisafe(ssp);
	if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, 0));
	else
		for_each_possible_cpu(cpu)
			srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
	__srcu_read_unlock_nmisafe(ssp, idx);

	/* Remove the initial count, at which point reaching zero can happen. */
	if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
		complete(&ssp->srcu_barrier_completion);
	wait_for_completion(&ssp->srcu_barrier_completion);

	rcu_seq_end(&ssp->srcu_barrier_seq);
	mutex_unlock(&ssp->srcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(srcu_barrier);
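
/*
 * Illustrative teardown-ordering sketch (not compiled here): before
 * cleanup_srcu_struct(), stop posting new callbacks, then use
 * srcu_barrier() to wait for all previously posted call_srcu()
 * callbacks to be invoked.  The names my_srcu and
 * my_stop_posting_callbacks() are made up for this example.
 *
 *	my_stop_posting_callbacks();	// no further call_srcu(&my_srcu, ...)
 *	srcu_barrier(&my_srcu);		// wait for in-flight callbacks
 *	cleanup_srcu_struct(&my_srcu);	// now safe to deconstruct
 */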

/**
 * srcu_batches_completed - return batches completed.
 * @ssp: srcu_struct on which to report batch completion.
 *
 * Report the number of batches, correlated with, but not necessarily
 * precisely the same as, the number of grace periods that have elapsed.
 */
unsigned long srcu_batches_completed(struct srcu_struct *ssp)
{
	return READ_ONCE(ssp->srcu_idx);
}
EXPORT_SYMBOL_GPL(srcu_batches_completed);

/*
 * Core SRCU state machine.  Push state bits of ->srcu_gp_seq
 * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when the scan has
 * completed in that state.
 */
static void srcu_advance_state(struct srcu_struct *ssp)
{
	int idx;

	mutex_lock(&ssp->srcu_gp_mutex);

	/*
	 * Because readers might be delayed for an extended period after
	 * fetching ->srcu_idx for their index, at any point in time there
	 * might well be readers using both idx=0 and idx=1.  We therefore
	 * need to wait for readers to clear from both index values before
	 * invoking a callback.
	 *
	 * The load-acquire ensures that we see the accesses performed
	 * by the prior grace period.
	 */
	idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */
	if (idx == SRCU_STATE_IDLE) {
		spin_lock_irq_rcu_node(ssp);
		if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
			WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq));
			spin_unlock_irq_rcu_node(ssp);
			mutex_unlock(&ssp->srcu_gp_mutex);
			return;
		}
		idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
		if (idx == SRCU_STATE_IDLE)
			srcu_gp_start(ssp);
		spin_unlock_irq_rcu_node(ssp);
		if (idx != SRCU_STATE_IDLE) {
			mutex_unlock(&ssp->srcu_gp_mutex);
			return; /* Someone else started the grace period. */
		}
	}

	if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
		idx = 1 ^ (ssp->srcu_idx & 1);
		if (!try_check_zero(ssp, idx, 1)) {
			mutex_unlock(&ssp->srcu_gp_mutex);
			return; /* readers present, retry later. */
		}
		srcu_flip(ssp);
		spin_lock_irq_rcu_node(ssp);
		rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
		ssp->srcu_n_exp_nodelay = 0;
		spin_unlock_irq_rcu_node(ssp);
	}

	if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {

		/*
		 * SRCU read-side critical sections are normally short,
		 * so check at least twice in quick succession after a flip.
		 */
		idx = 1 ^ (ssp->srcu_idx & 1);
		if (!try_check_zero(ssp, idx, 2)) {
			mutex_unlock(&ssp->srcu_gp_mutex);
			return; /* readers present, retry later. */
		}
		ssp->srcu_n_exp_nodelay = 0;
		srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */
	}
}

/*
 * Invoke a limited number of SRCU callbacks that have passed through
 * their grace period.  If there are more to do, SRCU will reschedule
 * the workqueue.  Note that needed memory barriers have been executed
 * in this task's context by srcu_readers_active_idx_check().
 */
static void srcu_invoke_callbacks(struct work_struct *work)
{
	long len;
	bool more;
	struct rcu_cblist ready_cbs;
	struct rcu_head *rhp;
	struct srcu_data *sdp;
	struct srcu_struct *ssp;

	sdp = container_of(work, struct srcu_data, work);

	ssp = sdp->ssp;
	rcu_cblist_init(&ready_cbs);
	spin_lock_irq_rcu_node(sdp);
	rcu_segcblist_advance(&sdp->srcu_cblist,
			      rcu_seq_current(&ssp->srcu_gp_seq));
	if (sdp->srcu_cblist_invoking ||
	    !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
		spin_unlock_irq_rcu_node(sdp);
		return; /* Someone else on the job or nothing to do. */
	}

	/* We are on the job!  Extract and invoke ready callbacks. */
	sdp->srcu_cblist_invoking = true;
	rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
	len = ready_cbs.len;
	spin_unlock_irq_rcu_node(sdp);
	rhp = rcu_cblist_dequeue(&ready_cbs);
	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
		debug_rcu_head_unqueue(rhp);
		local_bh_disable();
		rhp->func(rhp);
		local_bh_enable();
	}
	WARN_ON_ONCE(ready_cbs.len);

	/*
	 * Update counts, accelerate new callbacks, and if needed,
	 * schedule another round of callback invocation.
	 */
	spin_lock_irq_rcu_node(sdp);
	rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
				       rcu_seq_snap(&ssp->srcu_gp_seq));
	sdp->srcu_cblist_invoking = false;
	more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
	spin_unlock_irq_rcu_node(sdp);
	if (more)
		srcu_schedule_cbs_sdp(sdp, 0);
}
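
/*
 * Illustrative sketch, not part of the original file: the sort of
 * callback that srcu_invoke_callbacks() above ends up running.  The
 * struct example_node, example_free_cb(), and example_post_free()
 * names are hypothetical; call_srcu() and kfree() are the real API.
 */
struct example_node {
	struct rcu_head rh;
	int payload;
};

static void __maybe_unused example_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct example_node, rh));
}

static void __maybe_unused example_post_free(struct srcu_struct *ssp, struct example_node *p)
{
	/* Defer the kfree() until a full SRCU grace period has elapsed. */
	call_srcu(ssp, &p->rh, example_free_cb);
}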

/*
 * Finished one round of SRCU grace-period processing.  Start another if
 * there are more SRCU callbacks queued, otherwise put SRCU into
 * not-running state.
 */
static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
{
	bool pushgp = true;

	spin_lock_irq_rcu_node(ssp);
	if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
		if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) {
			/* All requests fulfilled, time to go idle. */
			pushgp = false;
		}
	} else if (!rcu_seq_state(ssp->srcu_gp_seq)) {
		/* Outstanding request and no GP.  Start one. */
		srcu_gp_start(ssp);
	}
	spin_unlock_irq_rcu_node(ssp);

	if (pushgp)
		queue_delayed_work(rcu_gp_wq, &ssp->work, delay);
}

/*
 * This is the work-queue function that handles SRCU grace periods.
 */
static void process_srcu(struct work_struct *work)
{
	unsigned long curdelay;
	unsigned long j;
	struct srcu_struct *ssp;

	ssp = container_of(work, struct srcu_struct, work.work);

	srcu_advance_state(ssp);
	curdelay = srcu_get_delay(ssp);
	if (curdelay) {
		WRITE_ONCE(ssp->reschedule_count, 0);
	} else {
		j = jiffies;
		if (READ_ONCE(ssp->reschedule_jiffies) == j) {
			WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
			if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
				curdelay = 1;
		} else {
			WRITE_ONCE(ssp->reschedule_count, 1);
			WRITE_ONCE(ssp->reschedule_jiffies, j);
		}
	}
	srcu_reschedule(ssp, curdelay);
}

void srcutorture_get_gp_data(enum rcutorture_type test_type,
			     struct srcu_struct *ssp, int *flags,
			     unsigned long *gp_seq)
{
	if (test_type != SRCU_FLAVOR)
		return;
	*flags = 0;
	*gp_seq = rcu_seq_current(&ssp->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
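
/*
 * Illustrative sketch, not part of the original file: the per-CPU
 * ->srcu_lock_count[] and ->srcu_unlock_count[] totals printed by
 * srcu_torture_stats_print() below are driven by read-side critical
 * sections of this form.  The helper name example_reader() is
 * hypothetical.
 */
static void __maybe_unused example_reader(struct srcu_struct *ssp)
{
	int idx;

	idx = srcu_read_lock(ssp);	/* Samples ->srcu_idx, bumps the lock counter. */
	/* Dereference ssp-protected pointers here. */
	srcu_read_unlock(ssp, idx);	/* Bumps the matching unlock counter. */
}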
1764 */ 1765 smp_rmb(); 1766 1767 l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx])); 1768 l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx])); 1769 1770 c0 = l0 - u0; 1771 c1 = l1 - u1; 1772 pr_cont(" %d(%ld,%ld %c)", 1773 cpu, c0, c1, 1774 "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); 1775 s0 += c0; 1776 s1 += c1; 1777 } 1778 pr_cont(" T(%ld,%ld)\n", s0, s1); 1779 } 1780 if (SRCU_SIZING_IS_TORTURE()) 1781 srcu_transition_to_big(ssp); 1782 } 1783 EXPORT_SYMBOL_GPL(srcu_torture_stats_print); 1784 1785 static int __init srcu_bootup_announce(void) 1786 { 1787 pr_info("Hierarchical SRCU implementation.\n"); 1788 if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF) 1789 pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff); 1790 if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY) 1791 pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay); 1792 if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY) 1793 pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay); 1794 pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase); 1795 return 0; 1796 } 1797 early_initcall(srcu_bootup_announce); 1798 1799 void __init srcu_init(void) 1800 { 1801 struct srcu_struct *ssp; 1802 1803 /* Decide on srcu_struct-size strategy. */ 1804 if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) { 1805 if (nr_cpu_ids >= big_cpu_lim) { 1806 convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention. 1807 pr_info("%s: Setting srcu_struct sizes to big.\n", __func__); 1808 } else { 1809 convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND; 1810 pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__); 1811 } 1812 } 1813 1814 /* 1815 * Once that is set, call_srcu() can follow the normal path and 1816 * queue delayed work. This must follow RCU workqueues creation 1817 * and timers initialization. 1818 */ 1819 srcu_init_done = true; 1820 while (!list_empty(&srcu_boot_list)) { 1821 ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, 1822 work.work.entry); 1823 list_del_init(&ssp->work.work.entry); 1824 if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL) 1825 ssp->srcu_size_state = SRCU_SIZE_ALLOC; 1826 queue_work(rcu_gp_wq, &ssp->work.work); 1827 } 1828 } 1829 1830 #ifdef CONFIG_MODULES 1831 1832 /* Initialize any global-scope srcu_struct structures used by this module. */ 1833 static int srcu_module_coming(struct module *mod) 1834 { 1835 int i; 1836 struct srcu_struct **sspp = mod->srcu_struct_ptrs; 1837 int ret; 1838 1839 for (i = 0; i < mod->num_srcu_structs; i++) { 1840 ret = init_srcu_struct(*(sspp++)); 1841 if (WARN_ON_ONCE(ret)) 1842 return ret; 1843 } 1844 return 0; 1845 } 1846 1847 /* Clean up any global-scope srcu_struct structures used by this module. */ 1848 static void srcu_module_going(struct module *mod) 1849 { 1850 int i; 1851 struct srcu_struct **sspp = mod->srcu_struct_ptrs; 1852 1853 for (i = 0; i < mod->num_srcu_structs; i++) 1854 cleanup_srcu_struct(*(sspp++)); 1855 } 1856 1857 /* Handle one module, either coming or going. 

#ifdef CONFIG_MODULES

/* Initialize any global-scope srcu_struct structures used by this module. */
static int srcu_module_coming(struct module *mod)
{
	int i;
	struct srcu_struct **sspp = mod->srcu_struct_ptrs;
	int ret;

	for (i = 0; i < mod->num_srcu_structs; i++) {
		ret = init_srcu_struct(*(sspp++));
		if (WARN_ON_ONCE(ret))
			return ret;
	}
	return 0;
}

/* Clean up any global-scope srcu_struct structures used by this module. */
static void srcu_module_going(struct module *mod)
{
	int i;
	struct srcu_struct **sspp = mod->srcu_struct_ptrs;

	for (i = 0; i < mod->num_srcu_structs; i++)
		cleanup_srcu_struct(*(sspp++));
}

/* Handle one module, either coming or going. */
static int srcu_module_notify(struct notifier_block *self,
			      unsigned long val, void *data)
{
	struct module *mod = data;
	int ret = 0;

	switch (val) {
	case MODULE_STATE_COMING:
		ret = srcu_module_coming(mod);
		break;
	case MODULE_STATE_GOING:
		srcu_module_going(mod);
		break;
	default:
		break;
	}
	return ret;
}

static struct notifier_block srcu_module_nb = {
	.notifier_call = srcu_module_notify,
	.priority = 0,
};

static __init int init_srcu_module_notifier(void)
{
	int ret;

	ret = register_module_notifier(&srcu_module_nb);
	if (ret)
		pr_warn("Failed to register srcu module notifier\n");
	return ret;
}
late_initcall(init_srcu_module_notifier);

#endif /* #ifdef CONFIG_MODULES */
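
/*
 * Illustrative sketch, not part of the original file: the module-scope
 * usage that the notifier above serves.  Within module code, DEFINE_SRCU()
 * and DEFINE_STATIC_SRCU() record a pointer in that module's
 * ->srcu_struct_ptrs[] section, so srcu_module_coming() runs
 * init_srcu_struct() at MODULE_STATE_COMING and srcu_module_going() runs
 * cleanup_srcu_struct() at MODULE_STATE_GOING.  A hypothetical module can
 * therefore get by with just:
 *
 *	DEFINE_STATIC_SRCU(my_srcu);
 *
 *	static void my_reader(void)
 *	{
 *		int idx = srcu_read_lock(&my_srcu);
 *		// Access my_srcu-protected data here.
 *		srcu_read_unlock(&my_srcu, idx);
 *	}
 *
 * with no explicit init_srcu_struct() or cleanup_srcu_struct() calls of
 * its own.
 */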