1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * Sleepable Read-Copy Update mechanism for mutual exclusion. 4 * 5 * Copyright (C) IBM Corporation, 2006 6 * Copyright (C) Fujitsu, 2012 7 * 8 * Authors: Paul McKenney <paulmck@linux.ibm.com> 9 * Lai Jiangshan <laijs@cn.fujitsu.com> 10 * 11 * For detailed explanation of Read-Copy Update mechanism see - 12 * Documentation/RCU/ *.txt 13 * 14 */ 15 16 #define pr_fmt(fmt) "rcu: " fmt 17 18 #include <linux/export.h> 19 #include <linux/mutex.h> 20 #include <linux/percpu.h> 21 #include <linux/preempt.h> 22 #include <linux/rcupdate_wait.h> 23 #include <linux/sched.h> 24 #include <linux/smp.h> 25 #include <linux/delay.h> 26 #include <linux/module.h> 27 #include <linux/slab.h> 28 #include <linux/srcu.h> 29 30 #include "rcu.h" 31 #include "rcu_segcblist.h" 32 33 /* Holdoff in nanoseconds for auto-expediting. */ 34 #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) 35 static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; 36 module_param(exp_holdoff, ulong, 0444); 37 38 /* Overflow-check frequency. N bits roughly says every 2**N grace periods. */ 39 static ulong counter_wrap_check = (ULONG_MAX >> 2); 40 module_param(counter_wrap_check, ulong, 0444); 41 42 /* 43 * Control conversion to SRCU_SIZE_BIG: 44 * 0: Don't convert at all. 45 * 1: Convert at init_srcu_struct() time. 46 * 2: Convert when rcutorture invokes srcu_torture_stats_print(). 47 * 3: Decide at boot time based on system shape (default). 48 * 0x1x: Convert when excessive contention encountered. 49 */ 50 #define SRCU_SIZING_NONE 0 51 #define SRCU_SIZING_INIT 1 52 #define SRCU_SIZING_TORTURE 2 53 #define SRCU_SIZING_AUTO 3 54 #define SRCU_SIZING_CONTEND 0x10 55 #define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x) 56 #define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE)) 57 #define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT)) 58 #define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE)) 59 #define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND) 60 static int convert_to_big = SRCU_SIZING_AUTO; 61 module_param(convert_to_big, int, 0444); 62 63 /* Number of CPUs to trigger init_srcu_struct()-time transition to big. */ 64 static int big_cpu_lim __read_mostly = 128; 65 module_param(big_cpu_lim, int, 0444); 66 67 /* Contention events per jiffy to initiate transition to big. */ 68 static int small_contention_lim __read_mostly = 100; 69 module_param(small_contention_lim, int, 0444); 70 71 /* Early-boot callback-management, so early that no lock is required! */ 72 static LIST_HEAD(srcu_boot_list); 73 static bool __read_mostly srcu_init_done; 74 75 static void srcu_invoke_callbacks(struct work_struct *work); 76 static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); 77 static void process_srcu(struct work_struct *work); 78 static void srcu_delay_timer(struct timer_list *t); 79 80 /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). 
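 * In addition to acquiring ->lock, the lock-acquisition wrappers execute
 * smp_mb__after_unlock_lock() so that the acquisition provides full
 * ordering.  A minimal usage sketch, mirroring how the rest of this file
 * uses them (sdp and flags are just illustrative locals):
 *
 *	spin_lock_irqsave_rcu_node(sdp, flags);
 *	... update sdp fields under the lock ...
 *	spin_unlock_irqrestore_rcu_node(sdp, flags);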
*/ 81 #define spin_lock_rcu_node(p) \ 82 do { \ 83 spin_lock(&ACCESS_PRIVATE(p, lock)); \ 84 smp_mb__after_unlock_lock(); \ 85 } while (0) 86 87 #define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) 88 89 #define spin_lock_irq_rcu_node(p) \ 90 do { \ 91 spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ 92 smp_mb__after_unlock_lock(); \ 93 } while (0) 94 95 #define spin_unlock_irq_rcu_node(p) \ 96 spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) 97 98 #define spin_lock_irqsave_rcu_node(p, flags) \ 99 do { \ 100 spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ 101 smp_mb__after_unlock_lock(); \ 102 } while (0) 103 104 #define spin_trylock_irqsave_rcu_node(p, flags) \ 105 ({ \ 106 bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ 107 \ 108 if (___locked) \ 109 smp_mb__after_unlock_lock(); \ 110 ___locked; \ 111 }) 112 113 #define spin_unlock_irqrestore_rcu_node(p, flags) \ 114 spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ 115 116 /* 117 * Initialize SRCU per-CPU data. Note that statically allocated 118 * srcu_struct structures might already have srcu_read_lock() and 119 * srcu_read_unlock() running against them. So if the is_static parameter 120 * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. 121 */ 122 static void init_srcu_struct_data(struct srcu_struct *ssp) 123 { 124 int cpu; 125 struct srcu_data *sdp; 126 127 /* 128 * Initialize the per-CPU srcu_data array, which feeds into the 129 * leaves of the srcu_node tree. 130 */ 131 WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != 132 ARRAY_SIZE(sdp->srcu_unlock_count)); 133 for_each_possible_cpu(cpu) { 134 sdp = per_cpu_ptr(ssp->sda, cpu); 135 spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); 136 rcu_segcblist_init(&sdp->srcu_cblist); 137 sdp->srcu_cblist_invoking = false; 138 sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq; 139 sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq; 140 sdp->mynode = NULL; 141 sdp->cpu = cpu; 142 INIT_WORK(&sdp->work, srcu_invoke_callbacks); 143 timer_setup(&sdp->delay_work, srcu_delay_timer, 0); 144 sdp->ssp = ssp; 145 } 146 } 147 148 /* Invalid seq state, used during snp node initialization */ 149 #define SRCU_SNP_INIT_SEQ 0x2 150 151 /* 152 * Check whether sequence number corresponding to snp node, 153 * is invalid. 154 */ 155 static inline bool srcu_invl_snp_seq(unsigned long s) 156 { 157 return s == SRCU_SNP_INIT_SEQ; 158 } 159 160 /* 161 * Allocated and initialize SRCU combining tree. Returns @true if 162 * allocation succeeded and @false otherwise. 163 */ 164 static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) 165 { 166 int cpu; 167 int i; 168 int level = 0; 169 int levelspread[RCU_NUM_LVLS]; 170 struct srcu_data *sdp; 171 struct srcu_node *snp; 172 struct srcu_node *snp_first; 173 174 /* Initialize geometry if it has not already been initialized. */ 175 rcu_init_geometry(); 176 ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags); 177 if (!ssp->srcu_sup->node) 178 return false; 179 180 /* Work out the overall tree geometry. */ 181 ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0]; 182 for (i = 1; i < rcu_num_lvls; i++) 183 ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1]; 184 rcu_init_levelspread(levelspread, num_rcu_lvl); 185 186 /* Each pass through this loop initializes one srcu_node structure. 
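 * As a worked example of the parent computation below, under the
 * hypothetical assumption of a two-level tree (rcu_num_lvls == 2) with a
 * single root at ->level[0] and levelspread[0] == 4 leaves:  leaf j, that
 * is ->level[1] + j, gets ->srcu_parent = ->level[0] + j / 4, which is the
 * root node.  The per-CPU loop further below then assigns levelspread[1]
 * consecutive CPUs to each leaf via sdp->mynode.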
*/ 187 srcu_for_each_node_breadth_first(ssp, snp) { 188 spin_lock_init(&ACCESS_PRIVATE(snp, lock)); 189 WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != 190 ARRAY_SIZE(snp->srcu_data_have_cbs)); 191 for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { 192 snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ; 193 snp->srcu_data_have_cbs[i] = 0; 194 } 195 snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ; 196 snp->grplo = -1; 197 snp->grphi = -1; 198 if (snp == &ssp->srcu_sup->node[0]) { 199 /* Root node, special case. */ 200 snp->srcu_parent = NULL; 201 continue; 202 } 203 204 /* Non-root node. */ 205 if (snp == ssp->srcu_sup->level[level + 1]) 206 level++; 207 snp->srcu_parent = ssp->srcu_sup->level[level - 1] + 208 (snp - ssp->srcu_sup->level[level]) / 209 levelspread[level - 1]; 210 } 211 212 /* 213 * Initialize the per-CPU srcu_data array, which feeds into the 214 * leaves of the srcu_node tree. 215 */ 216 level = rcu_num_lvls - 1; 217 snp_first = ssp->srcu_sup->level[level]; 218 for_each_possible_cpu(cpu) { 219 sdp = per_cpu_ptr(ssp->sda, cpu); 220 sdp->mynode = &snp_first[cpu / levelspread[level]]; 221 for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { 222 if (snp->grplo < 0) 223 snp->grplo = cpu; 224 snp->grphi = cpu; 225 } 226 sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); 227 } 228 smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); 229 return true; 230 } 231 232 /* 233 * Initialize non-compile-time initialized fields, including the 234 * associated srcu_node and srcu_data structures. The is_static parameter 235 * tells us that ->sda has already been wired up to srcu_data. 236 */ 237 static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) 238 { 239 if (!is_static) 240 ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL); 241 if (!ssp->srcu_sup) 242 return -ENOMEM; 243 if (!is_static) 244 spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); 245 ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; 246 ssp->srcu_sup->node = NULL; 247 mutex_init(&ssp->srcu_sup->srcu_cb_mutex); 248 mutex_init(&ssp->srcu_sup->srcu_gp_mutex); 249 ssp->srcu_idx = 0; 250 ssp->srcu_gp_seq = 0; 251 ssp->srcu_barrier_seq = 0; 252 mutex_init(&ssp->srcu_barrier_mutex); 253 atomic_set(&ssp->srcu_barrier_cpu_cnt, 0); 254 INIT_DELAYED_WORK(&ssp->work, process_srcu); 255 ssp->sda_is_static = is_static; 256 if (!is_static) 257 ssp->sda = alloc_percpu(struct srcu_data); 258 if (!ssp->sda) { 259 if (!is_static) 260 kfree(ssp->srcu_sup); 261 return -ENOMEM; 262 } 263 init_srcu_struct_data(ssp); 264 ssp->srcu_gp_seq_needed_exp = 0; 265 ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); 266 if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { 267 if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { 268 if (!ssp->sda_is_static) { 269 free_percpu(ssp->sda); 270 ssp->sda = NULL; 271 kfree(ssp->srcu_sup); 272 return -ENOMEM; 273 } 274 } else { 275 WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); 276 } 277 } 278 smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ 279 return 0; 280 } 281 282 #ifdef CONFIG_DEBUG_LOCK_ALLOC 283 284 int __init_srcu_struct(struct srcu_struct *ssp, const char *name, 285 struct lock_class_key *key) 286 { 287 /* Don't re-initialize a lock while it is held. 
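 * For reference, a dynamically initialized srcu_struct pairs this
 * initialization with cleanup_srcu_struct().  A minimal sketch, with
 * my_srcu being a purely illustrative name:
 *
 *	struct srcu_struct my_srcu;
 *
 *	ret = init_srcu_struct(&my_srcu);
 *	...
 *	srcu_barrier(&my_srcu);		// Only needed if call_srcu() was used.
 *	cleanup_srcu_struct(&my_srcu);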
*/ 288 debug_check_no_locks_freed((void *)ssp, sizeof(*ssp)); 289 lockdep_init_map(&ssp->dep_map, name, key, 0); 290 return init_srcu_struct_fields(ssp, false); 291 } 292 EXPORT_SYMBOL_GPL(__init_srcu_struct); 293 294 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 295 296 /** 297 * init_srcu_struct - initialize a sleep-RCU structure 298 * @ssp: structure to initialize. 299 * 300 * Must invoke this on a given srcu_struct before passing that srcu_struct 301 * to any other function. Each srcu_struct represents a separate domain 302 * of SRCU protection. 303 */ 304 int init_srcu_struct(struct srcu_struct *ssp) 305 { 306 return init_srcu_struct_fields(ssp, false); 307 } 308 EXPORT_SYMBOL_GPL(init_srcu_struct); 309 310 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 311 312 /* 313 * Initiate a transition to SRCU_SIZE_BIG with lock held. 314 */ 315 static void __srcu_transition_to_big(struct srcu_struct *ssp) 316 { 317 lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); 318 smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC); 319 } 320 321 /* 322 * Initiate an idempotent transition to SRCU_SIZE_BIG. 323 */ 324 static void srcu_transition_to_big(struct srcu_struct *ssp) 325 { 326 unsigned long flags; 327 328 /* Double-checked locking on ->srcu_size-state. */ 329 if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) 330 return; 331 spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); 332 if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) { 333 spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); 334 return; 335 } 336 __srcu_transition_to_big(ssp); 337 spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); 338 } 339 340 /* 341 * Check to see if the just-encountered contention event justifies 342 * a transition to SRCU_SIZE_BIG. 343 */ 344 static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp) 345 { 346 unsigned long j; 347 348 if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state) 349 return; 350 j = jiffies; 351 if (ssp->srcu_size_jiffies != j) { 352 ssp->srcu_size_jiffies = j; 353 ssp->srcu_n_lock_retries = 0; 354 } 355 if (++ssp->srcu_n_lock_retries <= small_contention_lim) 356 return; 357 __srcu_transition_to_big(ssp); 358 } 359 360 /* 361 * Acquire the specified srcu_data structure's ->lock, but check for 362 * excessive contention, which results in initiation of a transition 363 * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module 364 * parameter permits this. 365 */ 366 static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags) 367 { 368 struct srcu_struct *ssp = sdp->ssp; 369 370 if (spin_trylock_irqsave_rcu_node(sdp, *flags)) 371 return; 372 spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); 373 spin_lock_irqsave_check_contention(ssp); 374 spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags); 375 spin_lock_irqsave_rcu_node(sdp, *flags); 376 } 377 378 /* 379 * Acquire the specified srcu_struct structure's ->lock, but check for 380 * excessive contention, which results in initiation of a transition 381 * to SRCU_SIZE_BIG. But only if the srcutree.convert_to_big module 382 * parameter permits this. 
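 * For example, booting with srcutree.convert_to_big=0x13 (hypothetically
 * combining SRCU_SIZING_AUTO with the SRCU_SIZING_CONTEND bit) permits this
 * contention-driven conversion, while the default value of 3 leaves the
 * 0x10 bit clear and thus disables it.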
383 */ 384 static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags) 385 { 386 if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags)) 387 return; 388 spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags); 389 spin_lock_irqsave_check_contention(ssp); 390 } 391 392 /* 393 * First-use initialization of statically allocated srcu_struct 394 * structure. Wiring up the combining tree is more than can be 395 * done with compile-time initialization, so this check is added 396 * to each update-side SRCU primitive. Use ssp->lock, which -is- 397 * compile-time initialized, to resolve races involving multiple 398 * CPUs trying to garner first-use privileges. 399 */ 400 static void check_init_srcu_struct(struct srcu_struct *ssp) 401 { 402 unsigned long flags; 403 404 /* The smp_load_acquire() pairs with the smp_store_release(). */ 405 if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/ 406 return; /* Already initialized. */ 407 spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); 408 if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) { 409 spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); 410 return; 411 } 412 init_srcu_struct_fields(ssp, true); 413 spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); 414 } 415 416 /* 417 * Returns approximate total of the readers' ->srcu_lock_count[] values 418 * for the rank of per-CPU counters specified by idx. 419 */ 420 static unsigned long srcu_readers_lock_idx(struct srcu_struct *ssp, int idx) 421 { 422 int cpu; 423 unsigned long sum = 0; 424 425 for_each_possible_cpu(cpu) { 426 struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); 427 428 sum += atomic_long_read(&cpuc->srcu_lock_count[idx]); 429 } 430 return sum; 431 } 432 433 /* 434 * Returns approximate total of the readers' ->srcu_unlock_count[] values 435 * for the rank of per-CPU counters specified by idx. 436 */ 437 static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx) 438 { 439 int cpu; 440 unsigned long mask = 0; 441 unsigned long sum = 0; 442 443 for_each_possible_cpu(cpu) { 444 struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); 445 446 sum += atomic_long_read(&cpuc->srcu_unlock_count[idx]); 447 if (IS_ENABLED(CONFIG_PROVE_RCU)) 448 mask = mask | READ_ONCE(cpuc->srcu_nmi_safety); 449 } 450 WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask >> 1)), 451 "Mixed NMI-safe readers for srcu_struct at %ps.\n", ssp); 452 return sum; 453 } 454 455 /* 456 * Return true if the number of pre-existing readers is determined to 457 * be zero. 458 */ 459 static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) 460 { 461 unsigned long unlocks; 462 463 unlocks = srcu_readers_unlock_idx(ssp, idx); 464 465 /* 466 * Make sure that a lock is always counted if the corresponding 467 * unlock is counted. Needs to be a smp_mb() as the read side may 468 * contain a read from a variable that is written to before the 469 * synchronize_srcu() in the write side. In this case smp_mb()s 470 * A and B act like the store buffering pattern. 471 * 472 * This smp_mb() also pairs with smp_mb() C to prevent accesses 473 * after the synchronize_srcu() from being executed before the 474 * grace period ends. 475 */ 476 smp_mb(); /* A */ 477 478 /* 479 * If the locks are the same as the unlocks, then there must have 480 * been no readers on this index at some point in this function. 
481 * But there might be more readers, as a task might have read 482 * the current ->srcu_idx but not yet have incremented its CPU's 483 * ->srcu_lock_count[idx] counter. In fact, it is possible 484 * that most of the tasks have been preempted between fetching 485 * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there 486 * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks 487 * in a system whose address space was fully populated with memory. 488 * Call this quantity Nt. 489 * 490 * So suppose that the updater is preempted at this point in the 491 * code for a long time. That now-preempted updater has already 492 * flipped ->srcu_idx (possibly during the preceding grace period), 493 * done an smp_mb() (again, possibly during the preceding grace 494 * period), and summed up the ->srcu_unlock_count[idx] counters. 495 * How many times can a given one of the aforementioned Nt tasks 496 * increment the old ->srcu_idx value's ->srcu_lock_count[idx] 497 * counter, in the absence of nesting? 498 * 499 * It can clearly do so once, given that it has already fetched 500 * the old value of ->srcu_idx and is just about to use that value 501 * to index its increment of ->srcu_lock_count[idx]. But as soon as 502 * it leaves that SRCU read-side critical section, it will increment 503 * ->srcu_unlock_count[idx], which must follow the updater's above 504 * read from that same value. Thus, as soon the reading task does 505 * an smp_mb() and a later fetch from ->srcu_idx, that task will be 506 * guaranteed to get the new index. Except that the increment of 507 * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the 508 * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock() 509 * is before the smp_mb(). Thus, that task might not see the new 510 * value of ->srcu_idx until the -second- __srcu_read_lock(), 511 * which in turn means that this task might well increment 512 * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice, 513 * not just once. 514 * 515 * However, it is important to note that a given smp_mb() takes 516 * effect not just for the task executing it, but also for any 517 * later task running on that same CPU. 518 * 519 * That is, there can be almost Nt + Nc further increments of 520 * ->srcu_lock_count[idx] for the old index, where Nc is the number 521 * of CPUs. But this is OK because the size of the task_struct 522 * structure limits the value of Nt and current systems limit Nc 523 * to a few thousand. 524 * 525 * OK, but what about nesting? This does impose a limit on 526 * nesting of half of the size of the task_struct structure 527 * (measured in bytes), which should be sufficient. A late 2022 528 * TREE01 rcutorture run reported this size to be no less than 529 * 9408 bytes, allowing up to 4704 levels of nesting, which is 530 * comfortably beyond excessive. Especially on 64-bit systems, 531 * which are unlikely to be configured with an address space fully 532 * populated with memory, at least not anytime soon. 533 */ 534 return srcu_readers_lock_idx(ssp, idx) == unlocks; 535 } 536 537 /** 538 * srcu_readers_active - returns true if there are readers. and false 539 * otherwise 540 * @ssp: which srcu_struct to count active readers (holding srcu_read_lock). 541 * 542 * Note that this is not an atomic primitive, and can therefore suffer 543 * severe errors when invoked on an active srcu_struct. That said, it 544 * can be useful as an error check at cleanup time. 
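 * For example, cleanup_srcu_struct() below uses it in exactly this way,
 * WARN()ing and leaking the srcu_struct if readers are still present.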
545 */ 546 static bool srcu_readers_active(struct srcu_struct *ssp) 547 { 548 int cpu; 549 unsigned long sum = 0; 550 551 for_each_possible_cpu(cpu) { 552 struct srcu_data *cpuc = per_cpu_ptr(ssp->sda, cpu); 553 554 sum += atomic_long_read(&cpuc->srcu_lock_count[0]); 555 sum += atomic_long_read(&cpuc->srcu_lock_count[1]); 556 sum -= atomic_long_read(&cpuc->srcu_unlock_count[0]); 557 sum -= atomic_long_read(&cpuc->srcu_unlock_count[1]); 558 } 559 return sum; 560 } 561 562 /* 563 * We use an adaptive strategy for synchronize_srcu() and especially for 564 * synchronize_srcu_expedited(). We spin for a fixed time period 565 * (defined below, boot time configurable) to allow SRCU readers to exit 566 * their read-side critical sections. If there are still some readers 567 * after one jiffy, we repeatedly block for one jiffy time periods. 568 * The blocking time is increased as the grace-period age increases, 569 * with max blocking time capped at 10 jiffies. 570 */ 571 #define SRCU_DEFAULT_RETRY_CHECK_DELAY 5 572 573 static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY; 574 module_param(srcu_retry_check_delay, ulong, 0444); 575 576 #define SRCU_INTERVAL 1 // Base delay if no expedited GPs pending. 577 #define SRCU_MAX_INTERVAL 10 // Maximum incremental delay from slow readers. 578 579 #define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO 3UL // Lowmark on default per-GP-phase 580 // no-delay instances. 581 #define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI 1000UL // Highmark on default per-GP-phase 582 // no-delay instances. 583 584 #define SRCU_UL_CLAMP_LO(val, low) ((val) > (low) ? (val) : (low)) 585 #define SRCU_UL_CLAMP_HI(val, high) ((val) < (high) ? (val) : (high)) 586 #define SRCU_UL_CLAMP(val, low, high) SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high)) 587 // per-GP-phase no-delay instances adjusted to allow non-sleeping poll upto 588 // one jiffies time duration. Mult by 2 is done to factor in the srcu_get_delay() 589 // called from process_srcu(). 590 #define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED \ 591 (2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY) 592 593 // Maximum per-GP-phase consecutive no-delay instances. 594 #define SRCU_DEFAULT_MAX_NODELAY_PHASE \ 595 SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, \ 596 SRCU_DEFAULT_MAX_NODELAY_PHASE_LO, \ 597 SRCU_DEFAULT_MAX_NODELAY_PHASE_HI) 598 599 static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE; 600 module_param(srcu_max_nodelay_phase, ulong, 0444); 601 602 // Maximum consecutive no-delay instances. 603 #define SRCU_DEFAULT_MAX_NODELAY (SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ? \ 604 SRCU_DEFAULT_MAX_NODELAY_PHASE : 100) 605 606 static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY; 607 module_param(srcu_max_nodelay, ulong, 0444); 608 609 /* 610 * Return grace-period delay, zero if there are expedited grace 611 * periods pending, SRCU_INTERVAL otherwise. 
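 * (For reference, with a hypothetical HZ=1000 the per-GP-phase no-delay
 * default above evaluates to 2 * 1000000 / 1000 / 5 = 400 instances, which
 * the clamp leaves untouched between its limits of 3 and 1000.)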
612 */ 613 static unsigned long srcu_get_delay(struct srcu_struct *ssp) 614 { 615 unsigned long gpstart; 616 unsigned long j; 617 unsigned long jbase = SRCU_INTERVAL; 618 619 if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) 620 jbase = 0; 621 if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) { 622 j = jiffies - 1; 623 gpstart = READ_ONCE(ssp->srcu_gp_start); 624 if (time_after(j, gpstart)) 625 jbase += j - gpstart; 626 if (!jbase) { 627 WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1); 628 if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase) 629 jbase = 1; 630 } 631 } 632 return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase; 633 } 634 635 /** 636 * cleanup_srcu_struct - deconstruct a sleep-RCU structure 637 * @ssp: structure to clean up. 638 * 639 * Must invoke this after you are finished using a given srcu_struct that 640 * was initialized via init_srcu_struct(), else you leak memory. 641 */ 642 void cleanup_srcu_struct(struct srcu_struct *ssp) 643 { 644 int cpu; 645 646 if (WARN_ON(!srcu_get_delay(ssp))) 647 return; /* Just leak it! */ 648 if (WARN_ON(srcu_readers_active(ssp))) 649 return; /* Just leak it! */ 650 flush_delayed_work(&ssp->work); 651 for_each_possible_cpu(cpu) { 652 struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); 653 654 del_timer_sync(&sdp->delay_work); 655 flush_work(&sdp->work); 656 if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) 657 return; /* Forgot srcu_barrier(), so just leak it! */ 658 } 659 if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || 660 WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) || 661 WARN_ON(srcu_readers_active(ssp))) { 662 pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n", 663 __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)), 664 rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed); 665 return; /* Caller forgot to stop doing call_srcu()? */ 666 } 667 kfree(ssp->srcu_sup->node); 668 ssp->srcu_sup->node = NULL; 669 ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL; 670 if (!ssp->sda_is_static) { 671 free_percpu(ssp->sda); 672 ssp->sda = NULL; 673 kfree(ssp->srcu_sup); 674 ssp->srcu_sup = NULL; 675 } 676 } 677 EXPORT_SYMBOL_GPL(cleanup_srcu_struct); 678 679 #ifdef CONFIG_PROVE_RCU 680 /* 681 * Check for consistent NMI safety. 682 */ 683 void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe) 684 { 685 int nmi_safe_mask = 1 << nmi_safe; 686 int old_nmi_safe_mask; 687 struct srcu_data *sdp; 688 689 /* NMI-unsafe use in NMI is a bad sign */ 690 WARN_ON_ONCE(!nmi_safe && in_nmi()); 691 sdp = raw_cpu_ptr(ssp->sda); 692 old_nmi_safe_mask = READ_ONCE(sdp->srcu_nmi_safety); 693 if (!old_nmi_safe_mask) { 694 WRITE_ONCE(sdp->srcu_nmi_safety, nmi_safe_mask); 695 return; 696 } 697 WARN_ONCE(old_nmi_safe_mask != nmi_safe_mask, "CPU %d old state %d new state %d\n", sdp->cpu, old_nmi_safe_mask, nmi_safe_mask); 698 } 699 EXPORT_SYMBOL_GPL(srcu_check_nmi_safety); 700 #endif /* CONFIG_PROVE_RCU */ 701 702 /* 703 * Counts the new reader in the appropriate per-CPU element of the 704 * srcu_struct. 705 * Returns an index that must be passed to the matching srcu_read_unlock(). 706 */ 707 int __srcu_read_lock(struct srcu_struct *ssp) 708 { 709 int idx; 710 711 idx = READ_ONCE(ssp->srcu_idx) & 0x1; 712 this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); 713 smp_mb(); /* B */ /* Avoid leaking the critical section. 
*/ 714 return idx; 715 } 716 EXPORT_SYMBOL_GPL(__srcu_read_lock); 717 718 /* 719 * Removes the count for the old reader from the appropriate per-CPU 720 * element of the srcu_struct. Note that this may well be a different 721 * CPU than that which was incremented by the corresponding srcu_read_lock(). 722 */ 723 void __srcu_read_unlock(struct srcu_struct *ssp, int idx) 724 { 725 smp_mb(); /* C */ /* Avoid leaking the critical section. */ 726 this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); 727 } 728 EXPORT_SYMBOL_GPL(__srcu_read_unlock); 729 730 #ifdef CONFIG_NEED_SRCU_NMI_SAFE 731 732 /* 733 * Counts the new reader in the appropriate per-CPU element of the 734 * srcu_struct, but in an NMI-safe manner using RMW atomics. 735 * Returns an index that must be passed to the matching srcu_read_unlock(). 736 */ 737 int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) 738 { 739 int idx; 740 struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); 741 742 idx = READ_ONCE(ssp->srcu_idx) & 0x1; 743 atomic_long_inc(&sdp->srcu_lock_count[idx]); 744 smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */ 745 return idx; 746 } 747 EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe); 748 749 /* 750 * Removes the count for the old reader from the appropriate per-CPU 751 * element of the srcu_struct. Note that this may well be a different 752 * CPU than that which was incremented by the corresponding srcu_read_lock(). 753 */ 754 void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) 755 { 756 struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); 757 758 smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */ 759 atomic_long_inc(&sdp->srcu_unlock_count[idx]); 760 } 761 EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe); 762 763 #endif // CONFIG_NEED_SRCU_NMI_SAFE 764 765 /* 766 * Start an SRCU grace period. 767 */ 768 static void srcu_gp_start(struct srcu_struct *ssp) 769 { 770 struct srcu_data *sdp; 771 int state; 772 773 if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) 774 sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); 775 else 776 sdp = this_cpu_ptr(ssp->sda); 777 lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock)); 778 WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); 779 spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ 780 rcu_segcblist_advance(&sdp->srcu_cblist, 781 rcu_seq_current(&ssp->srcu_gp_seq)); 782 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, 783 rcu_seq_snap(&ssp->srcu_gp_seq)); 784 spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ 785 WRITE_ONCE(ssp->srcu_gp_start, jiffies); 786 WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0); 787 smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ 788 rcu_seq_start(&ssp->srcu_gp_seq); 789 state = rcu_seq_state(ssp->srcu_gp_seq); 790 WARN_ON_ONCE(state != SRCU_STATE_SCAN1); 791 } 792 793 794 static void srcu_delay_timer(struct timer_list *t) 795 { 796 struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work); 797 798 queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work); 799 } 800 801 static void srcu_queue_delayed_work_on(struct srcu_data *sdp, 802 unsigned long delay) 803 { 804 if (!delay) { 805 queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work); 806 return; 807 } 808 809 timer_reduce(&sdp->delay_work, jiffies + delay); 810 } 811 812 /* 813 * Schedule callback invocation for the specified srcu_data structure, 814 * if possible, on the corresponding CPU. 
815 */ 816 static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) 817 { 818 srcu_queue_delayed_work_on(sdp, delay); 819 } 820 821 /* 822 * Schedule callback invocation for all srcu_data structures associated 823 * with the specified srcu_node structure that have callbacks for the 824 * just-completed grace period, the one corresponding to idx. If possible, 825 * schedule this invocation on the corresponding CPUs. 826 */ 827 static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp, 828 unsigned long mask, unsigned long delay) 829 { 830 int cpu; 831 832 for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { 833 if (!(mask & (1 << (cpu - snp->grplo)))) 834 continue; 835 srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); 836 } 837 } 838 839 /* 840 * Note the end of an SRCU grace period. Initiates callback invocation 841 * and starts a new grace period if needed. 842 * 843 * The ->srcu_cb_mutex acquisition does not protect any data, but 844 * instead prevents more than one grace period from starting while we 845 * are initiating callback invocation. This allows the ->srcu_have_cbs[] 846 * array to have a finite number of elements. 847 */ 848 static void srcu_gp_end(struct srcu_struct *ssp) 849 { 850 unsigned long cbdelay = 1; 851 bool cbs; 852 bool last_lvl; 853 int cpu; 854 unsigned long flags; 855 unsigned long gpseq; 856 int idx; 857 unsigned long mask; 858 struct srcu_data *sdp; 859 unsigned long sgsne; 860 struct srcu_node *snp; 861 int ss_state; 862 863 /* Prevent more than one additional grace period. */ 864 mutex_lock(&ssp->srcu_sup->srcu_cb_mutex); 865 866 /* End the current grace period. */ 867 spin_lock_irq_rcu_node(ssp->srcu_sup); 868 idx = rcu_seq_state(ssp->srcu_gp_seq); 869 WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); 870 if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp))) 871 cbdelay = 0; 872 873 WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns()); 874 rcu_seq_end(&ssp->srcu_gp_seq); 875 gpseq = rcu_seq_current(&ssp->srcu_gp_seq); 876 if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq)) 877 WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq); 878 spin_unlock_irq_rcu_node(ssp->srcu_sup); 879 mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); 880 /* A new grace period can start at this point. But only one. */ 881 882 /* Initiate callback invocation as needed. */ 883 ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state); 884 if (ss_state < SRCU_SIZE_WAIT_BARRIER) { 885 srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()), 886 cbdelay); 887 } else { 888 idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); 889 srcu_for_each_node_breadth_first(ssp, snp) { 890 spin_lock_irq_rcu_node(snp); 891 cbs = false; 892 last_lvl = snp >= ssp->srcu_sup->level[rcu_num_lvls - 1]; 893 if (last_lvl) 894 cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq; 895 snp->srcu_have_cbs[idx] = gpseq; 896 rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); 897 sgsne = snp->srcu_gp_seq_needed_exp; 898 if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq)) 899 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq); 900 if (ss_state < SRCU_SIZE_BIG) 901 mask = ~0; 902 else 903 mask = snp->srcu_data_have_cbs[idx]; 904 snp->srcu_data_have_cbs[idx] = 0; 905 spin_unlock_irq_rcu_node(snp); 906 if (cbs) 907 srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay); 908 } 909 } 910 911 /* Occasionally prevent srcu_data counter wrap. 
*/ 912 if (!(gpseq & counter_wrap_check)) 913 for_each_possible_cpu(cpu) { 914 sdp = per_cpu_ptr(ssp->sda, cpu); 915 spin_lock_irqsave_rcu_node(sdp, flags); 916 if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) 917 sdp->srcu_gp_seq_needed = gpseq; 918 if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100)) 919 sdp->srcu_gp_seq_needed_exp = gpseq; 920 spin_unlock_irqrestore_rcu_node(sdp, flags); 921 } 922 923 /* Callback initiation done, allow grace periods after next. */ 924 mutex_unlock(&ssp->srcu_sup->srcu_cb_mutex); 925 926 /* Start a new grace period if needed. */ 927 spin_lock_irq_rcu_node(ssp->srcu_sup); 928 gpseq = rcu_seq_current(&ssp->srcu_gp_seq); 929 if (!rcu_seq_state(gpseq) && 930 ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) { 931 srcu_gp_start(ssp); 932 spin_unlock_irq_rcu_node(ssp->srcu_sup); 933 srcu_reschedule(ssp, 0); 934 } else { 935 spin_unlock_irq_rcu_node(ssp->srcu_sup); 936 } 937 938 /* Transition to big if needed. */ 939 if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) { 940 if (ss_state == SRCU_SIZE_ALLOC) 941 init_srcu_struct_nodes(ssp, GFP_KERNEL); 942 else 943 smp_store_release(&ssp->srcu_sup->srcu_size_state, ss_state + 1); 944 } 945 } 946 947 /* 948 * Funnel-locking scheme to scalably mediate many concurrent expedited 949 * grace-period requests. This function is invoked for the first known 950 * expedited request for a grace period that has already been requested, 951 * but without expediting. To start a completely new grace period, 952 * whether expedited or not, use srcu_funnel_gp_start() instead. 953 */ 954 static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp, 955 unsigned long s) 956 { 957 unsigned long flags; 958 unsigned long sgsne; 959 960 if (snp) 961 for (; snp != NULL; snp = snp->srcu_parent) { 962 sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp); 963 if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) || 964 (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s))) 965 return; 966 spin_lock_irqsave_rcu_node(snp, flags); 967 sgsne = snp->srcu_gp_seq_needed_exp; 968 if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) { 969 spin_unlock_irqrestore_rcu_node(snp, flags); 970 return; 971 } 972 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); 973 spin_unlock_irqrestore_rcu_node(snp, flags); 974 } 975 spin_lock_irqsave_ssp_contention(ssp, &flags); 976 if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) 977 WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); 978 spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); 979 } 980 981 /* 982 * Funnel-locking scheme to scalably mediate many concurrent grace-period 983 * requests. The winner has to do the work of actually starting grace 984 * period s. Losers must either ensure that their desired grace-period 985 * number is recorded on at least their leaf srcu_node structure, or they 986 * must take steps to invoke their own callbacks. 987 * 988 * Note that this function also does the work of srcu_funnel_exp_start(), 989 * in some cases by directly invoking it. 990 * 991 * The srcu read lock should be held around this function, and s is a seq snap 992 * taken after holding that lock.
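 * (For example, srcu_gp_start_if_needed() below obtains that snapshot via
 * s = rcu_seq_snap(&ssp->srcu_gp_seq) while holding __srcu_read_lock_nmisafe().)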
993 */ 994 static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp, 995 unsigned long s, bool do_norm) 996 { 997 unsigned long flags; 998 int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs); 999 unsigned long sgsne; 1000 struct srcu_node *snp; 1001 struct srcu_node *snp_leaf; 1002 unsigned long snp_seq; 1003 1004 /* Ensure that snp node tree is fully initialized before traversing it */ 1005 if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) 1006 snp_leaf = NULL; 1007 else 1008 snp_leaf = sdp->mynode; 1009 1010 if (snp_leaf) 1011 /* Each pass through the loop does one level of the srcu_node tree. */ 1012 for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) { 1013 if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && snp != snp_leaf) 1014 return; /* GP already done and CBs recorded. */ 1015 spin_lock_irqsave_rcu_node(snp, flags); 1016 snp_seq = snp->srcu_have_cbs[idx]; 1017 if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) { 1018 if (snp == snp_leaf && snp_seq == s) 1019 snp->srcu_data_have_cbs[idx] |= sdp->grpmask; 1020 spin_unlock_irqrestore_rcu_node(snp, flags); 1021 if (snp == snp_leaf && snp_seq != s) { 1022 srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0); 1023 return; 1024 } 1025 if (!do_norm) 1026 srcu_funnel_exp_start(ssp, snp, s); 1027 return; 1028 } 1029 snp->srcu_have_cbs[idx] = s; 1030 if (snp == snp_leaf) 1031 snp->srcu_data_have_cbs[idx] |= sdp->grpmask; 1032 sgsne = snp->srcu_gp_seq_needed_exp; 1033 if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s))) 1034 WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); 1035 spin_unlock_irqrestore_rcu_node(snp, flags); 1036 } 1037 1038 /* Top of tree, must ensure the grace period will be started. */ 1039 spin_lock_irqsave_ssp_contention(ssp, &flags); 1040 if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) { 1041 /* 1042 * Record need for grace period s. Pair with load 1043 * acquire setting up for initialization. 1044 */ 1045 smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/ 1046 } 1047 if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s)) 1048 WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s); 1049 1050 /* If grace period not already in progress, start it. */ 1051 if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && 1052 rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) { 1053 WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)); 1054 srcu_gp_start(ssp); 1055 1056 // And how can that list_add() in the "else" clause 1057 // possibly be safe for concurrent execution? Well, 1058 // it isn't. And it does not have to be. After all, it 1059 // can only be executed during early boot when there is only 1060 // the one boot CPU running with interrupts still disabled. 1061 if (likely(srcu_init_done)) 1062 queue_delayed_work(rcu_gp_wq, &ssp->work, 1063 !!srcu_get_delay(ssp)); 1064 else if (list_empty(&ssp->work.work.entry)) 1065 list_add(&ssp->work.work.entry, &srcu_boot_list); 1066 } 1067 spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); 1068 } 1069 1070 /* 1071 * Wait until all readers counted by array index idx complete, but 1072 * loop an additional time if there is an expedited grace period pending. 1073 * The caller must ensure that ->srcu_idx is not changed while checking. 
1074 */ 1075 static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount) 1076 { 1077 unsigned long curdelay; 1078 1079 curdelay = !srcu_get_delay(ssp); 1080 1081 for (;;) { 1082 if (srcu_readers_active_idx_check(ssp, idx)) 1083 return true; 1084 if ((--trycount + curdelay) <= 0) 1085 return false; 1086 udelay(srcu_retry_check_delay); 1087 } 1088 } 1089 1090 /* 1091 * Increment the ->srcu_idx counter so that future SRCU readers will 1092 * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows 1093 * us to wait for pre-existing readers in a starvation-free manner. 1094 */ 1095 static void srcu_flip(struct srcu_struct *ssp) 1096 { 1097 /* 1098 * Ensure that if this updater saw a given reader's increment 1099 * from __srcu_read_lock(), that reader was using an old value 1100 * of ->srcu_idx. Also ensure that if a given reader sees the 1101 * new value of ->srcu_idx, this updater's earlier scans cannot 1102 * have seen that reader's increments (which is OK, because this 1103 * grace period need not wait on that reader). 1104 */ 1105 smp_mb(); /* E */ /* Pairs with B and C. */ 1106 1107 WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); 1108 1109 /* 1110 * Ensure that if the updater misses an __srcu_read_unlock() 1111 * increment, that task's __srcu_read_lock() following its next 1112 * __srcu_read_lock() or __srcu_read_unlock() will see the above 1113 * counter update. Note that both this memory barrier and the 1114 * one in srcu_readers_active_idx_check() provide the guarantee 1115 * for __srcu_read_lock(). 1116 */ 1117 smp_mb(); /* D */ /* Pairs with C. */ 1118 } 1119 1120 /* 1121 * If SRCU is likely idle, return true, otherwise return false. 1122 * 1123 * Note that it is OK for several current from-idle requests for a new 1124 * grace period from idle to specify expediting because they will all end 1125 * up requesting the same grace period anyhow. So no loss. 1126 * 1127 * Note also that if any CPU (including the current one) is still invoking 1128 * callbacks, this function will nevertheless say "idle". This is not 1129 * ideal, but the overhead of checking all CPUs' callback lists is even 1130 * less ideal, especially on large systems. Furthermore, the wakeup 1131 * can happen before the callback is fully removed, so we have no choice 1132 * but to accept this type of error. 1133 * 1134 * This function is also subject to counter-wrap errors, but let's face 1135 * it, if this function was preempted for enough time for the counters 1136 * to wrap, it really doesn't matter whether or not we expedite the grace 1137 * period. The extra overhead of a needlessly expedited grace period is 1138 * negligible when amortized over that time period, and the extra latency 1139 * of a needlessly non-expedited grace period is similarly negligible. 1140 */ 1141 static bool srcu_might_be_idle(struct srcu_struct *ssp) 1142 { 1143 unsigned long curseq; 1144 unsigned long flags; 1145 struct srcu_data *sdp; 1146 unsigned long t; 1147 unsigned long tlast; 1148 1149 check_init_srcu_struct(ssp); 1150 /* If the local srcu_data structure has callbacks, not idle. */ 1151 sdp = raw_cpu_ptr(ssp->sda); 1152 spin_lock_irqsave_rcu_node(sdp, flags); 1153 if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { 1154 spin_unlock_irqrestore_rcu_node(sdp, flags); 1155 return false; /* Callbacks already present, so not idle. */ 1156 } 1157 spin_unlock_irqrestore_rcu_node(sdp, flags); 1158 1159 /* 1160 * No local callbacks, so probabilistically probe global state. 
1161 * Exact information would require acquiring locks, which would 1162 * kill scalability, hence the probabilistic nature of the probe. 1163 */ 1164 1165 /* First, see if enough time has passed since the last GP. */ 1166 t = ktime_get_mono_fast_ns(); 1167 tlast = READ_ONCE(ssp->srcu_last_gp_end); 1168 if (exp_holdoff == 0 || 1169 time_in_range_open(t, tlast, tlast + exp_holdoff)) 1170 return false; /* Too soon after last GP. */ 1171 1172 /* Next, check for probable idleness. */ 1173 curseq = rcu_seq_current(&ssp->srcu_gp_seq); 1174 smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ 1175 if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed))) 1176 return false; /* Grace period in progress, so not idle. */ 1177 smp_mb(); /* Order ->srcu_gp_seq with prior access. */ 1178 if (curseq != rcu_seq_current(&ssp->srcu_gp_seq)) 1179 return false; /* GP # changed, so not idle. */ 1180 return true; /* With reasonable probability, idle! */ 1181 } 1182 1183 /* 1184 * SRCU callback function to leak a callback. 1185 */ 1186 static void srcu_leak_callback(struct rcu_head *rhp) 1187 { 1188 } 1189 1190 /* 1191 * Start an SRCU grace period, and also queue the callback if non-NULL. 1192 */ 1193 static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, 1194 struct rcu_head *rhp, bool do_norm) 1195 { 1196 unsigned long flags; 1197 int idx; 1198 bool needexp = false; 1199 bool needgp = false; 1200 unsigned long s; 1201 struct srcu_data *sdp; 1202 struct srcu_node *sdp_mynode; 1203 int ss_state; 1204 1205 check_init_srcu_struct(ssp); 1206 /* 1207 * While starting a new grace period, make sure we are in an 1208 * SRCU read-side critical section so that the grace-period 1209 * sequence number cannot wrap around in the meantime. 1210 */ 1211 idx = __srcu_read_lock_nmisafe(ssp); 1212 ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state); 1213 if (ss_state < SRCU_SIZE_WAIT_CALL) 1214 sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id()); 1215 else 1216 sdp = raw_cpu_ptr(ssp->sda); 1217 spin_lock_irqsave_sdp_contention(sdp, &flags); 1218 if (rhp) 1219 rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); 1220 rcu_segcblist_advance(&sdp->srcu_cblist, 1221 rcu_seq_current(&ssp->srcu_gp_seq)); 1222 s = rcu_seq_snap(&ssp->srcu_gp_seq); 1223 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); 1224 if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { 1225 sdp->srcu_gp_seq_needed = s; 1226 needgp = true; 1227 } 1228 if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { 1229 sdp->srcu_gp_seq_needed_exp = s; 1230 needexp = true; 1231 } 1232 spin_unlock_irqrestore_rcu_node(sdp, flags); 1233 1234 /* Ensure that snp node tree is fully initialized before traversing it */ 1235 if (ss_state < SRCU_SIZE_WAIT_BARRIER) 1236 sdp_mynode = NULL; 1237 else 1238 sdp_mynode = sdp->mynode; 1239 1240 if (needgp) 1241 srcu_funnel_gp_start(ssp, sdp, s, do_norm); 1242 else if (needexp) 1243 srcu_funnel_exp_start(ssp, sdp_mynode, s); 1244 __srcu_read_unlock_nmisafe(ssp, idx); 1245 return s; 1246 } 1247 1248 /* 1249 * Enqueue an SRCU callback on the srcu_data structure associated with 1250 * the current CPU and the specified srcu_struct structure, initiating 1251 * grace-period processing if it is not already running. 1252 * 1253 * Note that all CPUs must agree that the grace period extended beyond 1254 * all pre-existing SRCU read-side critical section. 
On systems with 1255 * more than one CPU, this means that when "func()" is invoked, each CPU 1256 * is guaranteed to have executed a full memory barrier since the end of 1257 * its last corresponding SRCU read-side critical section whose beginning 1258 * preceded the call to call_srcu(). It also means that each CPU executing 1259 * an SRCU read-side critical section that continues beyond the start of 1260 * "func()" must have executed a memory barrier after the call_srcu() 1261 * but before the beginning of that SRCU read-side critical section. 1262 * Note that these guarantees include CPUs that are offline, idle, or 1263 * executing in user mode, as well as CPUs that are executing in the kernel. 1264 * 1265 * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the 1266 * resulting SRCU callback function "func()", then both CPU A and CPU 1267 * B are guaranteed to execute a full memory barrier during the time 1268 * interval between the call to call_srcu() and the invocation of "func()". 1269 * This guarantee applies even if CPU A and CPU B are the same CPU (but 1270 * again only if the system has more than one CPU). 1271 * 1272 * Of course, these guarantees apply only for invocations of call_srcu(), 1273 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same 1274 * srcu_struct structure. 1275 */ 1276 static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, 1277 rcu_callback_t func, bool do_norm) 1278 { 1279 if (debug_rcu_head_queue(rhp)) { 1280 /* Probable double call_srcu(), so leak the callback. */ 1281 WRITE_ONCE(rhp->func, srcu_leak_callback); 1282 WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n"); 1283 return; 1284 } 1285 rhp->func = func; 1286 (void)srcu_gp_start_if_needed(ssp, rhp, do_norm); 1287 } 1288 1289 /** 1290 * call_srcu() - Queue a callback for invocation after an SRCU grace period 1291 * @ssp: srcu_struct on which to queue the callback 1292 * @rhp: structure to be used for queueing the SRCU callback. 1293 * @func: function to be invoked after the SRCU grace period 1294 * 1295 * The callback function will be invoked some time after a full SRCU 1296 * grace period elapses, in other words after all pre-existing SRCU 1297 * read-side critical sections have completed. However, the callback 1298 * function might well execute concurrently with other SRCU read-side 1299 * critical sections that started after call_srcu() was invoked. SRCU 1300 * read-side critical sections are delimited by srcu_read_lock() and 1301 * srcu_read_unlock(), and may be nested. 1302 * 1303 * The callback will be invoked from process context, but must nevertheless 1304 * be fast and must not block. 1305 */ 1306 void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, 1307 rcu_callback_t func) 1308 { 1309 __call_srcu(ssp, rhp, func, true); 1310 } 1311 EXPORT_SYMBOL_GPL(call_srcu); 1312 1313 /* 1314 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
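 * A minimal reader/updater sketch for the public entry points that funnel
 * through here (my_srcu, my_data, old, and new are purely illustrative):
 *
 *	// Reader:
 *	int idx = srcu_read_lock(&my_srcu);
 *	p = srcu_dereference(my_data, &my_srcu);
 *	... use p; sleeping is permitted inside SRCU readers ...
 *	srcu_read_unlock(&my_srcu, idx);
 *
 *	// Updater (assuming suitable mutual exclusion among updaters):
 *	old = my_data;
 *	rcu_assign_pointer(my_data, new);
 *	synchronize_srcu(&my_srcu);	// Waits only for pre-existing readers.
 *	kfree(old);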
1315 */ 1316 static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) 1317 { 1318 struct rcu_synchronize rcu; 1319 1320 RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || 1321 lock_is_held(&rcu_bh_lock_map) || 1322 lock_is_held(&rcu_lock_map) || 1323 lock_is_held(&rcu_sched_lock_map), 1324 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); 1325 1326 if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) 1327 return; 1328 might_sleep(); 1329 check_init_srcu_struct(ssp); 1330 init_completion(&rcu.completion); 1331 init_rcu_head_on_stack(&rcu.head); 1332 __call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm); 1333 wait_for_completion(&rcu.completion); 1334 destroy_rcu_head_on_stack(&rcu.head); 1335 1336 /* 1337 * Make sure that later code is ordered after the SRCU grace 1338 * period. This pairs with the spin_lock_irq_rcu_node() 1339 * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed 1340 * because the current CPU might have been totally uninvolved with 1341 * (and thus unordered against) that grace period. 1342 */ 1343 smp_mb(); 1344 } 1345 1346 /** 1347 * synchronize_srcu_expedited - Brute-force SRCU grace period 1348 * @ssp: srcu_struct with which to synchronize. 1349 * 1350 * Wait for an SRCU grace period to elapse, but be more aggressive about 1351 * spinning rather than blocking when waiting. 1352 * 1353 * Note that synchronize_srcu_expedited() has the same deadlock and 1354 * memory-ordering properties as does synchronize_srcu(). 1355 */ 1356 void synchronize_srcu_expedited(struct srcu_struct *ssp) 1357 { 1358 __synchronize_srcu(ssp, rcu_gp_is_normal()); 1359 } 1360 EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); 1361 1362 /** 1363 * synchronize_srcu - wait for prior SRCU read-side critical-section completion 1364 * @ssp: srcu_struct with which to synchronize. 1365 * 1366 * Wait for the count to drain to zero of both indexes. To avoid the 1367 * possible starvation of synchronize_srcu(), it waits for the count of 1368 * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first, 1369 * and then flip the srcu_idx and wait for the count of the other index. 1370 * 1371 * Can block; must be called from process context. 1372 * 1373 * Note that it is illegal to call synchronize_srcu() from the corresponding 1374 * SRCU read-side critical section; doing so will result in deadlock. 1375 * However, it is perfectly legal to call synchronize_srcu() on one 1376 * srcu_struct from some other srcu_struct's read-side critical section, 1377 * as long as the resulting graph of srcu_structs is acyclic. 1378 * 1379 * There are memory-ordering constraints implied by synchronize_srcu(). 1380 * On systems with more than one CPU, when synchronize_srcu() returns, 1381 * each CPU is guaranteed to have executed a full memory barrier since 1382 * the end of its last corresponding SRCU read-side critical section 1383 * whose beginning preceded the call to synchronize_srcu(). In addition, 1384 * each CPU having an SRCU read-side critical section that extends beyond 1385 * the return from synchronize_srcu() is guaranteed to have executed a 1386 * full memory barrier after the beginning of synchronize_srcu() and before 1387 * the beginning of that SRCU read-side critical section. Note that these 1388 * guarantees include CPUs that are offline, idle, or executing in user mode, 1389 * as well as CPUs that are executing in the kernel. 
1390 * 1391 * Furthermore, if CPU A invoked synchronize_srcu(), which returned 1392 * to its caller on CPU B, then both CPU A and CPU B are guaranteed 1393 * to have executed a full memory barrier during the execution of 1394 * synchronize_srcu(). This guarantee applies even if CPU A and CPU B 1395 * are the same CPU, but again only if the system has more than one CPU. 1396 * 1397 * Of course, these memory-ordering guarantees apply only when 1398 * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are 1399 * passed the same srcu_struct structure. 1400 * 1401 * Implementation of these memory-ordering guarantees is similar to 1402 * that of synchronize_rcu(). 1403 * 1404 * If SRCU is likely idle, expedite the first request. This semantic 1405 * was provided by Classic SRCU, and is relied upon by its users, so TREE 1406 * SRCU must also provide it. Note that detecting idleness is heuristic 1407 * and subject to both false positives and negatives. 1408 */ 1409 void synchronize_srcu(struct srcu_struct *ssp) 1410 { 1411 if (srcu_might_be_idle(ssp) || rcu_gp_is_expedited()) 1412 synchronize_srcu_expedited(ssp); 1413 else 1414 __synchronize_srcu(ssp, true); 1415 } 1416 EXPORT_SYMBOL_GPL(synchronize_srcu); 1417 1418 /** 1419 * get_state_synchronize_srcu - Provide an end-of-grace-period cookie 1420 * @ssp: srcu_struct to provide cookie for. 1421 * 1422 * This function returns a cookie that can be passed to 1423 * poll_state_synchronize_srcu(), which will return true if a full grace 1424 * period has elapsed in the meantime. It is the caller's responsibility 1425 * to make sure that grace period happens, for example, by invoking 1426 * call_srcu() after return from get_state_synchronize_srcu(). 1427 */ 1428 unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) 1429 { 1430 // Any prior manipulation of SRCU-protected data must happen 1431 // before the load from ->srcu_gp_seq. 1432 smp_mb(); 1433 return rcu_seq_snap(&ssp->srcu_gp_seq); 1434 } 1435 EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); 1436 1437 /** 1438 * start_poll_synchronize_srcu - Provide cookie and start grace period 1439 * @ssp: srcu_struct to provide cookie for. 1440 * 1441 * This function returns a cookie that can be passed to 1442 * poll_state_synchronize_srcu(), which will return true if a full grace 1443 * period has elapsed in the meantime. Unlike get_state_synchronize_srcu(), 1444 * this function also ensures that any needed SRCU grace period will be 1445 * started. This convenience does come at a cost in terms of CPU overhead. 1446 */ 1447 unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) 1448 { 1449 return srcu_gp_start_if_needed(ssp, NULL, true); 1450 } 1451 EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); 1452 1453 /** 1454 * poll_state_synchronize_srcu - Has cookie's grace period ended? 1455 * @ssp: srcu_struct to provide cookie for. 1456 * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu(). 1457 * 1458 * This function takes the cookie that was returned from either 1459 * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and 1460 * returns @true if an SRCU grace period elapsed since the time that the 1461 * cookie was created. 1462 * 1463 * Because cookies are finite in size, wrapping/overflow is possible. 1464 * This is more pronounced on 32-bit systems where cookies are 32 bits, 1465 * where in theory wrapping could happen in about 14 hours assuming 1466 * 25-microsecond expedited SRCU grace periods. 
However, a more likely 1467 * overflow lower bound is on the order of 24 days in the case of 1468 * one-millisecond SRCU grace periods. Of course, wrapping in a 64-bit 1469 * system requires geologic timespans, as in more than seven million years 1470 * even for expedited SRCU grace periods. 1471 * 1472 * Wrapping/overflow is much more of an issue for CONFIG_SMP=n systems 1473 * that also have CONFIG_PREEMPTION=n, which selects Tiny SRCU. This uses 1474 * a 16-bit cookie, which rcutorture routinely wraps in a matter of a 1475 * few minutes. If this proves to be a problem, this counter will be 1476 * expanded to the same size as for Tree SRCU. 1477 */ 1478 bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) 1479 { 1480 if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie)) 1481 return false; 1482 // Ensure that the end of the SRCU grace period happens before 1483 // any subsequent code that the caller might execute. 1484 smp_mb(); // ^^^ 1485 return true; 1486 } 1487 EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); 1488 1489 /* 1490 * Callback function for srcu_barrier() use. 1491 */ 1492 static void srcu_barrier_cb(struct rcu_head *rhp) 1493 { 1494 struct srcu_data *sdp; 1495 struct srcu_struct *ssp; 1496 1497 sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); 1498 ssp = sdp->ssp; 1499 if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) 1500 complete(&ssp->srcu_barrier_completion); 1501 } 1502 1503 /* 1504 * Enqueue an srcu_barrier() callback on the specified srcu_data 1505 * structure's ->cblist, but only if that ->cblist already has at least one 1506 * callback enqueued. Note that if a CPU already has callbacks enqueued, 1507 * it must have already registered the need for a future grace period, 1508 * so all we need do is enqueue a callback that will use the same grace 1509 * period as the last callback already in the queue. 1510 */ 1511 static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp) 1512 { 1513 spin_lock_irq_rcu_node(sdp); 1514 atomic_inc(&ssp->srcu_barrier_cpu_cnt); 1515 sdp->srcu_barrier_head.func = srcu_barrier_cb; 1516 debug_rcu_head_queue(&sdp->srcu_barrier_head); 1517 if (!rcu_segcblist_entrain(&sdp->srcu_cblist, 1518 &sdp->srcu_barrier_head)) { 1519 debug_rcu_head_unqueue(&sdp->srcu_barrier_head); 1520 atomic_dec(&ssp->srcu_barrier_cpu_cnt); 1521 } 1522 spin_unlock_irq_rcu_node(sdp); 1523 } 1524 1525 /** 1526 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. 1527 * @ssp: srcu_struct on which to wait for in-flight callbacks. 1528 */ 1529 void srcu_barrier(struct srcu_struct *ssp) 1530 { 1531 int cpu; 1532 int idx; 1533 unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq); 1534 1535 check_init_srcu_struct(ssp); 1536 mutex_lock(&ssp->srcu_barrier_mutex); 1537 if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) { 1538 smp_mb(); /* Force ordering following return. */ 1539 mutex_unlock(&ssp->srcu_barrier_mutex); 1540 return; /* Someone else did our work for us. */ 1541 } 1542 rcu_seq_start(&ssp->srcu_barrier_seq); 1543 init_completion(&ssp->srcu_barrier_completion); 1544 1545 /* Initial count prevents reaching zero until all CBs are posted.
*/ 1546 atomic_set(&ssp->srcu_barrier_cpu_cnt, 1); 1547 1548 idx = __srcu_read_lock_nmisafe(ssp); 1549 if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER) 1550 srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id())); 1551 else 1552 for_each_possible_cpu(cpu) 1553 srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu)); 1554 __srcu_read_unlock_nmisafe(ssp, idx); 1555 1556 /* Remove the initial count, at which point reaching zero can happen. */ 1557 if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt)) 1558 complete(&ssp->srcu_barrier_completion); 1559 wait_for_completion(&ssp->srcu_barrier_completion); 1560 1561 rcu_seq_end(&ssp->srcu_barrier_seq); 1562 mutex_unlock(&ssp->srcu_barrier_mutex); 1563 } 1564 EXPORT_SYMBOL_GPL(srcu_barrier); 1565 1566 /** 1567 * srcu_batches_completed - return batches completed. 1568 * @ssp: srcu_struct on which to report batch completion. 1569 * 1570 * Report the number of batches, correlated with, but not necessarily 1571 * precisely the same as, the number of grace periods that have elapsed. 1572 */ 1573 unsigned long srcu_batches_completed(struct srcu_struct *ssp) 1574 { 1575 return READ_ONCE(ssp->srcu_idx); 1576 } 1577 EXPORT_SYMBOL_GPL(srcu_batches_completed); 1578 1579 /* 1580 * Core SRCU state machine. Push state bits of ->srcu_gp_seq 1581 * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has 1582 * completed in that state. 1583 */ 1584 static void srcu_advance_state(struct srcu_struct *ssp) 1585 { 1586 int idx; 1587 1588 mutex_lock(&ssp->srcu_sup->srcu_gp_mutex); 1589 1590 /* 1591 * Because readers might be delayed for an extended period after 1592 * fetching ->srcu_idx for their index, at any point in time there 1593 * might well be readers using both idx=0 and idx=1. We therefore 1594 * need to wait for readers to clear from both index values before 1595 * invoking a callback. 1596 * 1597 * The load-acquire ensures that we see the accesses performed 1598 * by the prior grace period. 1599 */ 1600 idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */ 1601 if (idx == SRCU_STATE_IDLE) { 1602 spin_lock_irq_rcu_node(ssp->srcu_sup); 1603 if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { 1604 WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq)); 1605 spin_unlock_irq_rcu_node(ssp->srcu_sup); 1606 mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); 1607 return; 1608 } 1609 idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)); 1610 if (idx == SRCU_STATE_IDLE) 1611 srcu_gp_start(ssp); 1612 spin_unlock_irq_rcu_node(ssp->srcu_sup); 1613 if (idx != SRCU_STATE_IDLE) { 1614 mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); 1615 return; /* Someone else started the grace period. */ 1616 } 1617 } 1618 1619 if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) { 1620 idx = 1 ^ (ssp->srcu_idx & 1); 1621 if (!try_check_zero(ssp, idx, 1)) { 1622 mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); 1623 return; /* readers present, retry later. */ 1624 } 1625 srcu_flip(ssp); 1626 spin_lock_irq_rcu_node(ssp->srcu_sup); 1627 rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2); 1628 ssp->srcu_n_exp_nodelay = 0; 1629 spin_unlock_irq_rcu_node(ssp->srcu_sup); 1630 } 1631 1632 if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) { 1633 1634 /* 1635 * SRCU read-side critical sections are normally short, 1636 * so check at least twice in quick succession after a flip. 
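 * By this point srcu_flip() has already run, so the index computed
 * below is the one that pre-flip readers are still using.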
1637 */ 1638 idx = 1 ^ (ssp->srcu_idx & 1); 1639 if (!try_check_zero(ssp, idx, 2)) { 1640 mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex); 1641 return; /* readers present, retry later. */ 1642 } 1643 ssp->srcu_n_exp_nodelay = 0; 1644 srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */ 1645 } 1646 } 1647 1648 /* 1649 * Invoke a limited number of SRCU callbacks that have passed through 1650 * their grace period. If there are more to do, SRCU will reschedule 1651 * the workqueue. Note that needed memory barriers have been executed 1652 * in this task's context by srcu_readers_active_idx_check(). 1653 */ 1654 static void srcu_invoke_callbacks(struct work_struct *work) 1655 { 1656 long len; 1657 bool more; 1658 struct rcu_cblist ready_cbs; 1659 struct rcu_head *rhp; 1660 struct srcu_data *sdp; 1661 struct srcu_struct *ssp; 1662 1663 sdp = container_of(work, struct srcu_data, work); 1664 1665 ssp = sdp->ssp; 1666 rcu_cblist_init(&ready_cbs); 1667 spin_lock_irq_rcu_node(sdp); 1668 rcu_segcblist_advance(&sdp->srcu_cblist, 1669 rcu_seq_current(&ssp->srcu_gp_seq)); 1670 if (sdp->srcu_cblist_invoking || 1671 !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { 1672 spin_unlock_irq_rcu_node(sdp); 1673 return; /* Someone else on the job or nothing to do. */ 1674 } 1675 1676 /* We are on the job! Extract and invoke ready callbacks. */ 1677 sdp->srcu_cblist_invoking = true; 1678 rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); 1679 len = ready_cbs.len; 1680 spin_unlock_irq_rcu_node(sdp); 1681 rhp = rcu_cblist_dequeue(&ready_cbs); 1682 for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { 1683 debug_rcu_head_unqueue(rhp); 1684 local_bh_disable(); 1685 rhp->func(rhp); 1686 local_bh_enable(); 1687 } 1688 WARN_ON_ONCE(ready_cbs.len); 1689 1690 /* 1691 * Update counts, accelerate new callbacks, and if needed, 1692 * schedule another round of callback invocation. 1693 */ 1694 spin_lock_irq_rcu_node(sdp); 1695 rcu_segcblist_add_len(&sdp->srcu_cblist, -len); 1696 (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, 1697 rcu_seq_snap(&ssp->srcu_gp_seq)); 1698 sdp->srcu_cblist_invoking = false; 1699 more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); 1700 spin_unlock_irq_rcu_node(sdp); 1701 if (more) 1702 srcu_schedule_cbs_sdp(sdp, 0); 1703 } 1704 1705 /* 1706 * Finished one round of SRCU grace period. Start another if there are 1707 * more SRCU callbacks queued, otherwise put SRCU into not-running state. 1708 */ 1709 static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay) 1710 { 1711 bool pushgp = true; 1712 1713 spin_lock_irq_rcu_node(ssp->srcu_sup); 1714 if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) { 1715 if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) { 1716 /* All requests fulfilled, time to go idle. */ 1717 pushgp = false; 1718 } 1719 } else if (!rcu_seq_state(ssp->srcu_gp_seq)) { 1720 /* Outstanding request and no GP. Start one. */ 1721 srcu_gp_start(ssp); 1722 } 1723 spin_unlock_irq_rcu_node(ssp->srcu_sup); 1724 1725 if (pushgp) 1726 queue_delayed_work(rcu_gp_wq, &ssp->work, delay); 1727 } 1728 1729 /* 1730 * This is the work-queue function that handles SRCU grace periods. 
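 * If it finds itself being re-invoked without delay more than
 * srcu_max_nodelay times within a single jiffy, it backs off to a
 * one-jiffy delay so that grace-period processing does not monopolize
 * the CPU.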
1731 */ 1732 static void process_srcu(struct work_struct *work) 1733 { 1734 unsigned long curdelay; 1735 unsigned long j; 1736 struct srcu_struct *ssp; 1737 1738 ssp = container_of(work, struct srcu_struct, work.work); 1739 1740 srcu_advance_state(ssp); 1741 curdelay = srcu_get_delay(ssp); 1742 if (curdelay) { 1743 WRITE_ONCE(ssp->reschedule_count, 0); 1744 } else { 1745 j = jiffies; 1746 if (READ_ONCE(ssp->reschedule_jiffies) == j) { 1747 WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1); 1748 if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay) 1749 curdelay = 1; 1750 } else { 1751 WRITE_ONCE(ssp->reschedule_count, 1); 1752 WRITE_ONCE(ssp->reschedule_jiffies, j); 1753 } 1754 } 1755 srcu_reschedule(ssp, curdelay); 1756 } 1757 1758 void srcutorture_get_gp_data(enum rcutorture_type test_type, 1759 struct srcu_struct *ssp, int *flags, 1760 unsigned long *gp_seq) 1761 { 1762 if (test_type != SRCU_FLAVOR) 1763 return; 1764 *flags = 0; 1765 *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq); 1766 } 1767 EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); 1768 1769 static const char * const srcu_size_state_name[] = { 1770 "SRCU_SIZE_SMALL", 1771 "SRCU_SIZE_ALLOC", 1772 "SRCU_SIZE_WAIT_BARRIER", 1773 "SRCU_SIZE_WAIT_CALL", 1774 "SRCU_SIZE_WAIT_CBS1", 1775 "SRCU_SIZE_WAIT_CBS2", 1776 "SRCU_SIZE_WAIT_CBS3", 1777 "SRCU_SIZE_WAIT_CBS4", 1778 "SRCU_SIZE_BIG", 1779 "SRCU_SIZE_???", 1780 }; 1781 1782 void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) 1783 { 1784 int cpu; 1785 int idx; 1786 unsigned long s0 = 0, s1 = 0; 1787 int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state); 1788 int ss_state_idx = ss_state; 1789 1790 idx = ssp->srcu_idx & 0x1; 1791 if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name)) 1792 ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1; 1793 pr_alert("%s%s Tree SRCU g%ld state %d (%s)", 1794 tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state, 1795 srcu_size_state_name[ss_state_idx]); 1796 if (!ssp->sda) { 1797 // Called after cleanup_srcu_struct(), perhaps. 1798 pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n"); 1799 } else { 1800 pr_cont(" per-CPU(idx=%d):", idx); 1801 for_each_possible_cpu(cpu) { 1802 unsigned long l0, l1; 1803 unsigned long u0, u1; 1804 long c0, c1; 1805 struct srcu_data *sdp; 1806 1807 sdp = per_cpu_ptr(ssp->sda, cpu); 1808 u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx])); 1809 u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx])); 1810 1811 /* 1812 * Make sure that a lock is always counted if the corresponding 1813 * unlock is counted. 
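 * Reading the unlock counts first and the lock counts second (with the
 * smp_rmb() in between) keeps the summed lock-minus-unlock totals printed
 * below from going negative due solely to the order in which the
 * counters were sampled.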
1814 */ 1815 smp_rmb(); 1816 1817 l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx])); 1818 l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx])); 1819 1820 c0 = l0 - u0; 1821 c1 = l1 - u1; 1822 pr_cont(" %d(%ld,%ld %c)", 1823 cpu, c0, c1, 1824 "C."[rcu_segcblist_empty(&sdp->srcu_cblist)]); 1825 s0 += c0; 1826 s1 += c1; 1827 } 1828 pr_cont(" T(%ld,%ld)\n", s0, s1); 1829 } 1830 if (SRCU_SIZING_IS_TORTURE()) 1831 srcu_transition_to_big(ssp); 1832 } 1833 EXPORT_SYMBOL_GPL(srcu_torture_stats_print); 1834 1835 static int __init srcu_bootup_announce(void) 1836 { 1837 pr_info("Hierarchical SRCU implementation.\n"); 1838 if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF) 1839 pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff); 1840 if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY) 1841 pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay); 1842 if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY) 1843 pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay); 1844 pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase); 1845 return 0; 1846 } 1847 early_initcall(srcu_bootup_announce); 1848 1849 void __init srcu_init(void) 1850 { 1851 struct srcu_struct *ssp; 1852 1853 /* Decide on srcu_struct-size strategy. */ 1854 if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) { 1855 if (nr_cpu_ids >= big_cpu_lim) { 1856 convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention. 1857 pr_info("%s: Setting srcu_struct sizes to big.\n", __func__); 1858 } else { 1859 convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND; 1860 pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__); 1861 } 1862 } 1863 1864 /* 1865 * Once that is set, call_srcu() can follow the normal path and 1866 * queue delayed work. This must follow RCU workqueues creation 1867 * and timers initialization. 1868 */ 1869 srcu_init_done = true; 1870 while (!list_empty(&srcu_boot_list)) { 1871 ssp = list_first_entry(&srcu_boot_list, struct srcu_struct, 1872 work.work.entry); 1873 list_del_init(&ssp->work.work.entry); 1874 if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && 1875 ssp->srcu_sup->srcu_size_state == SRCU_SIZE_SMALL) 1876 ssp->srcu_sup->srcu_size_state = SRCU_SIZE_ALLOC; 1877 queue_work(rcu_gp_wq, &ssp->work.work); 1878 } 1879 } 1880 1881 #ifdef CONFIG_MODULES 1882 1883 /* Initialize any global-scope srcu_struct structures used by this module. */ 1884 static int srcu_module_coming(struct module *mod) 1885 { 1886 int i; 1887 struct srcu_struct *ssp; 1888 struct srcu_struct **sspp = mod->srcu_struct_ptrs; 1889 1890 for (i = 0; i < mod->num_srcu_structs; i++) { 1891 ssp = *(sspp++); 1892 ssp->sda = alloc_percpu(struct srcu_data); 1893 if (WARN_ON_ONCE(!ssp->sda)) 1894 return -ENOMEM; 1895 } 1896 return 0; 1897 } 1898 1899 /* Clean up any global-scope srcu_struct structures used by this module. */ 1900 static void srcu_module_going(struct module *mod) 1901 { 1902 int i; 1903 struct srcu_struct *ssp; 1904 struct srcu_struct **sspp = mod->srcu_struct_ptrs; 1905 1906 for (i = 0; i < mod->num_srcu_structs; i++) { 1907 ssp = *(sspp++); 1908 if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed)) && 1909 !WARN_ON_ONCE(!ssp->sda_is_static)) 1910 cleanup_srcu_struct(ssp); 1911 free_percpu(ssp->sda); 1912 } 1913 } 1914 1915 /* Handle one module, either coming or going. 
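 * MODULE_STATE_COMING allocates the per-CPU srcu_data for each srcu_struct
 * statically defined by the incoming module, and MODULE_STATE_GOING
 * releases that state again.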
*/ 1916 static int srcu_module_notify(struct notifier_block *self, 1917 unsigned long val, void *data) 1918 { 1919 struct module *mod = data; 1920 int ret = 0; 1921 1922 switch (val) { 1923 case MODULE_STATE_COMING: 1924 ret = srcu_module_coming(mod); 1925 break; 1926 case MODULE_STATE_GOING: 1927 srcu_module_going(mod); 1928 break; 1929 default: 1930 break; 1931 } 1932 return ret; 1933 } 1934 1935 static struct notifier_block srcu_module_nb = { 1936 .notifier_call = srcu_module_notify, 1937 .priority = 0, 1938 }; 1939 1940 static __init int init_srcu_module_notifier(void) 1941 { 1942 int ret; 1943 1944 ret = register_module_notifier(&srcu_module_nb); 1945 if (ret) 1946 pr_warn("Failed to register srcu module notifier\n"); 1947 return ret; 1948 } 1949 late_initcall(init_srcu_module_notifier); 1950 1951 #endif /* #ifdef CONFIG_MODULES */ 1952
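/*
 * The following is an illustrative sketch only, not part of the SRCU API
 * or of this file's implementation.  It shows one way a caller might
 * combine the cookie-based polling interfaces documented above
 * (start_poll_synchronize_srcu() and poll_state_synchronize_srcu()) with
 * synchronize_srcu(), and one way to tear down a dynamically initialized
 * srcu_struct on which call_srcu() has been used.  The identifiers
 * my_obj, my_retire(), my_reclaim(), my_srcu, and my_teardown() are
 * hypothetical.
 *
 *	struct my_obj {
 *		unsigned long gp_cookie;
 *		int data;
 *	};
 *
 *	// Called after @p has been unpublished, so that no new SRCU
 *	// reader can obtain a reference to it.
 *	static void my_retire(struct srcu_struct *ssp, struct my_obj *p)
 *	{
 *		// Record the grace-period cookie and make sure that the
 *		// corresponding grace period gets started.
 *		p->gp_cookie = start_poll_synchronize_srcu(ssp);
 *	}
 *
 *	// Free @p, blocking only if readers might still hold a reference.
 *	static void my_reclaim(struct srcu_struct *ssp, struct my_obj *p)
 *	{
 *		if (!poll_state_synchronize_srcu(ssp, p->gp_cookie))
 *			synchronize_srcu(ssp);
 *		kfree(p);
 *	}
 *
 *	// Tear down a dynamically initialized srcu_struct, for example at
 *	// module-exit time, after call_srcu() has been used on it.
 *	static struct srcu_struct my_srcu;	// init_srcu_struct()'d elsewhere.
 *
 *	static void my_teardown(void)
 *	{
 *		srcu_barrier(&my_srcu);		// Wait for in-flight callbacks,
 *		cleanup_srcu_struct(&my_srcu);	// then release the srcu_struct.
 *	}
 */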