// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2016 Facebook
 * Copyright (C) 2013-2014 Jens Axboe
 */

#include <linux/sched.h>
#include <linux/random.h>
#include <linux/sbitmap.h>
#include <linux/seq_file.h>

static int init_alloc_hint(struct sbitmap_queue *sbq, gfp_t flags)
{
	unsigned depth = sbq->sb.depth;

	sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags);
	if (!sbq->alloc_hint)
		return -ENOMEM;

	if (depth && !sbq->sb.round_robin) {
		int i;

		for_each_possible_cpu(i)
			*per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth;
	}

	return 0;
}

static inline unsigned update_alloc_hint_before_get(struct sbitmap_queue *sbq,
						    unsigned int depth)
{
	unsigned hint;

	hint = this_cpu_read(*sbq->alloc_hint);
	if (unlikely(hint >= depth)) {
		hint = depth ? prandom_u32() % depth : 0;
		this_cpu_write(*sbq->alloc_hint, hint);
	}

	return hint;
}

static inline void update_alloc_hint_after_get(struct sbitmap_queue *sbq,
					       unsigned int depth,
					       unsigned int hint,
					       unsigned int nr)
{
	if (nr == -1) {
		/* If the map is full, a hint won't do us much good. */
		this_cpu_write(*sbq->alloc_hint, 0);
	} else if (nr == hint || unlikely(sbq->sb.round_robin)) {
		/* Only update the hint if we used it. */
		hint = nr + 1;
		if (hint >= depth - 1)
			hint = 0;
		this_cpu_write(*sbq->alloc_hint, hint);
	}
}

/*
 * See if we have deferred clears that we can batch move
 */
static inline bool sbitmap_deferred_clear(struct sbitmap_word *map)
{
	unsigned long mask;

	if (!READ_ONCE(map->cleared))
		return false;

	/*
	 * First get a stable cleared mask, setting the old mask to 0.
	 */
	mask = xchg(&map->cleared, 0);

	/*
	 * Now clear the masked bits in our free word
	 */
	atomic_long_andnot(mask, (atomic_long_t *)&map->word);
	BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(map->word));
	return true;
}

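/*
 * Illustrative usage sketch of the plain sbitmap API (not part of this file;
 * the values and the flow below are assumptions for reference only):
 *
 *	struct sbitmap sb;
 *	int nr;
 *
 *	if (sbitmap_init_node(&sb, 128, -1, GFP_KERNEL, NUMA_NO_NODE, false))
 *		return -ENOMEM;
 *	nr = sbitmap_get(&sb, 0);		// a free bit, or -1 if full
 *	if (nr >= 0)
 *		sbitmap_clear_bit(&sb, nr);	// release the bit again
 *	sbitmap_free(&sb);
 */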
int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
		      gfp_t flags, int node, bool round_robin)
{
	unsigned int bits_per_word;
	unsigned int i;

	if (shift < 0) {
		shift = ilog2(BITS_PER_LONG);
		/*
		 * If the bitmap is small, shrink the number of bits per word so
		 * we spread over a few cachelines, at least. If less than 4
		 * bits, just forget about it, it's not going to work optimally
		 * anyway.
		 */
		if (depth >= 4) {
			while ((4U << shift) > depth)
				shift--;
		}
	}
	bits_per_word = 1U << shift;
	if (bits_per_word > BITS_PER_LONG)
		return -EINVAL;

	sb->shift = shift;
	sb->depth = depth;
	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
	sb->round_robin = round_robin;

	if (depth == 0) {
		sb->map = NULL;
		return 0;
	}

	sb->map = kcalloc_node(sb->map_nr, sizeof(*sb->map), flags, node);
	if (!sb->map)
		return -ENOMEM;

	for (i = 0; i < sb->map_nr; i++) {
		sb->map[i].depth = min(depth, bits_per_word);
		depth -= sb->map[i].depth;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(sbitmap_init_node);

void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
{
	unsigned int bits_per_word = 1U << sb->shift;
	unsigned int i;

	for (i = 0; i < sb->map_nr; i++)
		sbitmap_deferred_clear(&sb->map[i]);

	sb->depth = depth;
	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);

	for (i = 0; i < sb->map_nr; i++) {
		sb->map[i].depth = min(depth, bits_per_word);
		depth -= sb->map[i].depth;
	}
}
EXPORT_SYMBOL_GPL(sbitmap_resize);

static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
			      unsigned int hint, bool wrap)
{
	int nr;

	/* don't wrap if starting from 0 */
	wrap = wrap && hint;

	while (1) {
		nr = find_next_zero_bit(word, depth, hint);
		if (unlikely(nr >= depth)) {
			/*
			 * We started with an offset, and we didn't reset the
			 * offset to 0 in a failure case, so start from 0 to
			 * exhaust the map.
			 */
			if (hint && wrap) {
				hint = 0;
				continue;
			}
			return -1;
		}

		if (!test_and_set_bit_lock(nr, word))
			break;

		hint = nr + 1;
		if (hint >= depth - 1)
			hint = 0;
	}

	return nr;
}

static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
				     unsigned int alloc_hint)
{
	struct sbitmap_word *map = &sb->map[index];
	int nr;

	do {
		nr = __sbitmap_get_word(&map->word, map->depth, alloc_hint,
					!sb->round_robin);
		if (nr != -1)
			break;
		if (!sbitmap_deferred_clear(map))
			break;
	} while (1);

	return nr;
}

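/*
 * A hint passed to sbitmap_get() is a bit number in the whole bitmap; it is
 * split into a word index and an offset inside that word using sb->shift.
 * As an illustrative example (values assumed, not from a real configuration):
 * with shift = 6 (64 bits per word), hint 70 maps to word index 70 >> 6 = 1
 * and bit offset 70 & 63 = 6, i.e. SB_NR_TO_INDEX() and SB_NR_TO_BIT().
 */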
int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint)
{
	unsigned int i, index;
	int nr = -1;

	index = SB_NR_TO_INDEX(sb, alloc_hint);

	/*
	 * Unless we're doing round robin tag allocation, just use the
	 * alloc_hint to find the right word index. No point in looping
	 * twice in find_next_zero_bit() for that case.
	 */
	if (sb->round_robin)
		alloc_hint = SB_NR_TO_BIT(sb, alloc_hint);
	else
		alloc_hint = 0;

	for (i = 0; i < sb->map_nr; i++) {
		nr = sbitmap_find_bit_in_index(sb, index, alloc_hint);
		if (nr != -1) {
			nr += index << sb->shift;
			break;
		}

		/* Jump to next index. */
		alloc_hint = 0;
		if (++index >= sb->map_nr)
			index = 0;
	}

	return nr;
}
EXPORT_SYMBOL_GPL(sbitmap_get);

int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
			unsigned long shallow_depth)
{
	unsigned int i, index;
	int nr = -1;

	index = SB_NR_TO_INDEX(sb, alloc_hint);

	for (i = 0; i < sb->map_nr; i++) {
again:
		nr = __sbitmap_get_word(&sb->map[index].word,
					min(sb->map[index].depth, shallow_depth),
					SB_NR_TO_BIT(sb, alloc_hint), true);
		if (nr != -1) {
			nr += index << sb->shift;
			break;
		}

		if (sbitmap_deferred_clear(&sb->map[index]))
			goto again;

		/* Jump to next index. */
		index++;
		alloc_hint = index << sb->shift;

		if (index >= sb->map_nr) {
			index = 0;
			alloc_hint = 0;
		}
	}

	return nr;
}
EXPORT_SYMBOL_GPL(sbitmap_get_shallow);

bool sbitmap_any_bit_set(const struct sbitmap *sb)
{
	unsigned int i;

	for (i = 0; i < sb->map_nr; i++) {
		if (sb->map[i].word & ~sb->map[i].cleared)
			return true;
	}
	return false;
}
EXPORT_SYMBOL_GPL(sbitmap_any_bit_set);

static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
{
	unsigned int i, weight = 0;

	for (i = 0; i < sb->map_nr; i++) {
		const struct sbitmap_word *word = &sb->map[i];

		if (set)
			weight += bitmap_weight(&word->word, word->depth);
		else
			weight += bitmap_weight(&word->cleared, word->depth);
	}
	return weight;
}

static unsigned int sbitmap_weight(const struct sbitmap *sb)
{
	return __sbitmap_weight(sb, true);
}

static unsigned int sbitmap_cleared(const struct sbitmap *sb)
{
	return __sbitmap_weight(sb, false);
}

void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
{
	seq_printf(m, "depth=%u\n", sb->depth);
	seq_printf(m, "busy=%u\n", sbitmap_weight(sb) - sbitmap_cleared(sb));
	seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb));
	seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
	seq_printf(m, "map_nr=%u\n", sb->map_nr);
}
EXPORT_SYMBOL_GPL(sbitmap_show);

static inline void emit_byte(struct seq_file *m, unsigned int offset, u8 byte)
{
	if ((offset & 0xf) == 0) {
		if (offset != 0)
			seq_putc(m, '\n');
		seq_printf(m, "%08x:", offset);
	}
	if ((offset & 0x1) == 0)
		seq_putc(m, ' ');
	seq_printf(m, "%02x", byte);
}

void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m)
{
	u8 byte = 0;
	unsigned int byte_bits = 0;
	unsigned int offset = 0;
	int i;

	for (i = 0; i < sb->map_nr; i++) {
		unsigned long word = READ_ONCE(sb->map[i].word);
		unsigned long cleared = READ_ONCE(sb->map[i].cleared);
		unsigned int word_bits = READ_ONCE(sb->map[i].depth);

		word &= ~cleared;

		while (word_bits > 0) {
			unsigned int bits = min(8 - byte_bits, word_bits);

			byte |= (word & (BIT(bits) - 1)) << byte_bits;
			byte_bits += bits;
			if (byte_bits == 8) {
				emit_byte(m, offset, byte);
				byte = 0;
				byte_bits = 0;
				offset++;
			}
			word >>= bits;
			word_bits -= bits;
		}
	}
	if (byte_bits) {
		emit_byte(m, offset, byte);
		offset++;
	}
	if (offset)
		seq_putc(m, '\n');
}
EXPORT_SYMBOL_GPL(sbitmap_bitmap_show);

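/*
 * Illustrative arithmetic for the calculation below (assumed values, not a
 * real configuration, and assuming the default SBQ_WAIT_QUEUES = 8 and
 * SBQ_WAKE_BATCH = 8): with depth = 128, sb.shift = 6 (64 bits per word) and
 * min_shallow_depth = UINT_MAX, shallow_depth clamps to 64, the usable depth
 * stays (128 >> 6) * 64 + min(128 & 63, 64) = 128, and the batch becomes
 * clamp(128 / 8, 1, 8) = 8.
 */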
static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq,
					unsigned int depth)
{
	unsigned int wake_batch;
	unsigned int shallow_depth;

	/*
	 * For each batch, we wake up one queue. We need to make sure that our
	 * batch size is small enough that the full depth of the bitmap,
	 * potentially limited by a shallow depth, is enough to wake up all of
	 * the queues.
	 *
	 * Each full word of the bitmap has bits_per_word bits, and there might
	 * be a partial word. There are depth / bits_per_word full words and
	 * depth % bits_per_word bits left over. In bitwise arithmetic:
	 *
	 * bits_per_word = 1 << shift
	 * depth / bits_per_word = depth >> shift
	 * depth % bits_per_word = depth & ((1 << shift) - 1)
	 *
	 * Each word can be limited to sbq->min_shallow_depth bits.
	 */
	shallow_depth = min(1U << sbq->sb.shift, sbq->min_shallow_depth);
	depth = ((depth >> sbq->sb.shift) * shallow_depth +
		 min(depth & ((1U << sbq->sb.shift) - 1), shallow_depth));
	wake_batch = clamp_t(unsigned int, depth / SBQ_WAIT_QUEUES, 1,
			     SBQ_WAKE_BATCH);

	return wake_batch;
}

int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
			    int shift, bool round_robin, gfp_t flags, int node)
{
	int ret;
	int i;

	ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node,
				round_robin);
	if (ret)
		return ret;

	if (init_alloc_hint(sbq, flags) != 0) {
		sbitmap_free(&sbq->sb);
		return -ENOMEM;
	}

	sbq->min_shallow_depth = UINT_MAX;
	sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
	atomic_set(&sbq->wake_index, 0);
	atomic_set(&sbq->ws_active, 0);

	sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
	if (!sbq->ws) {
		free_percpu(sbq->alloc_hint);
		sbitmap_free(&sbq->sb);
		return -ENOMEM;
	}

	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
		init_waitqueue_head(&sbq->ws[i].wait);
		atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);

static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
					    unsigned int depth)
{
	unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
	int i;

	if (sbq->wake_batch != wake_batch) {
		WRITE_ONCE(sbq->wake_batch, wake_batch);
		/*
		 * Pairs with the memory barrier in sbitmap_queue_wake_up()
		 * to ensure that the batch size is updated before the wait
		 * counts.
		 */
		smp_mb();
		for (i = 0; i < SBQ_WAIT_QUEUES; i++)
			atomic_set(&sbq->ws[i].wait_cnt, 1);
	}
}

void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
{
	sbitmap_queue_update_wake_batch(sbq, depth);
	sbitmap_resize(&sbq->sb, depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_resize);

int __sbitmap_queue_get(struct sbitmap_queue *sbq)
{
	unsigned int hint, depth;
	int nr;

	depth = READ_ONCE(sbq->sb.depth);
	hint = update_alloc_hint_before_get(sbq, depth);
	nr = sbitmap_get(&sbq->sb, hint);
	update_alloc_hint_after_get(sbq, depth, hint, nr);

	return nr;
}
EXPORT_SYMBOL_GPL(__sbitmap_queue_get);

int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
				unsigned int shallow_depth)
{
	unsigned int hint, depth;
	int nr;

	WARN_ON_ONCE(shallow_depth < sbq->min_shallow_depth);

	depth = READ_ONCE(sbq->sb.depth);
	hint = update_alloc_hint_before_get(sbq, depth);
	nr = sbitmap_get_shallow(&sbq->sb, hint, shallow_depth);
	update_alloc_hint_after_get(sbq, depth, hint, nr);

	return nr;
}
EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow);

void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
				     unsigned int min_shallow_depth)
{
	sbq->min_shallow_depth = min_shallow_depth;
	sbitmap_queue_update_wake_batch(sbq, sbq->sb.depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth);

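/*
 * Wakeup handling: each of the SBQ_WAIT_QUEUES wait queues carries a
 * wait_cnt that starts at wake_batch. Every freed bit decrements the count
 * of the queue selected by wake_index; once it reaches zero, a batch of
 * waiters on that queue is woken and wake_index advances to the next queue.
 * sbq_wake_ptr() below picks the first queue, starting from wake_index,
 * that actually has waiters.
 */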
static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
{
	int i, wake_index;

	if (!atomic_read(&sbq->ws_active))
		return NULL;

	wake_index = atomic_read(&sbq->wake_index);
	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
		struct sbq_wait_state *ws = &sbq->ws[wake_index];

		if (waitqueue_active(&ws->wait)) {
			if (wake_index != atomic_read(&sbq->wake_index))
				atomic_set(&sbq->wake_index, wake_index);
			return ws;
		}

		wake_index = sbq_index_inc(wake_index);
	}

	return NULL;
}

static bool __sbq_wake_up(struct sbitmap_queue *sbq)
{
	struct sbq_wait_state *ws;
	unsigned int wake_batch;
	int wait_cnt;

	ws = sbq_wake_ptr(sbq);
	if (!ws)
		return false;

	wait_cnt = atomic_dec_return(&ws->wait_cnt);
	if (wait_cnt <= 0) {
		int ret;

		wake_batch = READ_ONCE(sbq->wake_batch);

		/*
		 * Pairs with the memory barrier in sbitmap_queue_resize() to
		 * ensure that we see the batch size update before the wait
		 * count is reset.
		 */
		smp_mb__before_atomic();

		/*
		 * For concurrent callers of this, the one that failed the
		 * atomic_cmpxchg() race should call this function again
		 * to wake up a new batch on a different 'ws'.
		 */
		ret = atomic_cmpxchg(&ws->wait_cnt, wait_cnt, wake_batch);
		if (ret == wait_cnt) {
			sbq_index_atomic_inc(&sbq->wake_index);
			wake_up_nr(&ws->wait, wake_batch);
			return false;
		}

		return true;
	}

	return false;
}

void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
{
	while (__sbq_wake_up(sbq))
		;
}
EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);

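/*
 * Typical free path, as an illustrative sketch only (the tag variable and
 * the cpu argument are assumptions, not taken from a real caller):
 *
 *	sbitmap_queue_clear(sbq, tag, smp_processor_id());
 *
 * The freed bit is only recorded in ->cleared here; it becomes allocatable
 * again once a later allocation folds the deferred clears back into ->word
 * via sbitmap_deferred_clear().
 */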
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
			 unsigned int cpu)
{
	/*
	 * Once the clear bit is set, the bit may be allocated out.
	 *
	 * Orders READ/WRITE on the associated instance (such as a request
	 * in blk-mq) against this bit to avoid a race with re-allocation;
	 * its pair is the memory barrier implied in __sbitmap_get_word.
	 *
	 * One invariant is that the clear bit has to be zero when the bit
	 * is in use.
	 */
	smp_mb__before_atomic();
	sbitmap_deferred_clear_bit(&sbq->sb, nr);

	/*
	 * Pairs with the memory barrier in set_current_state() to ensure the
	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
	 * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
	 * waiter. See the comment on waitqueue_active().
	 */
	smp_mb__after_atomic();
	sbitmap_queue_wake_up(sbq);

	if (likely(!sbq->sb.round_robin && nr < sbq->sb.depth))
		*per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
}
EXPORT_SYMBOL_GPL(sbitmap_queue_clear);

void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
{
	int i, wake_index;

	/*
	 * Pairs with the memory barrier in set_current_state() like in
	 * sbitmap_queue_wake_up().
	 */
	smp_mb();
	wake_index = atomic_read(&sbq->wake_index);
	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
		struct sbq_wait_state *ws = &sbq->ws[wake_index];

		if (waitqueue_active(&ws->wait))
			wake_up(&ws->wait);

		wake_index = sbq_index_inc(wake_index);
	}
}
EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all);

void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
{
	bool first;
	int i;

	sbitmap_show(&sbq->sb, m);

	seq_puts(m, "alloc_hint={");
	first = true;
	for_each_possible_cpu(i) {
		if (!first)
			seq_puts(m, ", ");
		first = false;
		seq_printf(m, "%u", *per_cpu_ptr(sbq->alloc_hint, i));
	}
	seq_puts(m, "}\n");

	seq_printf(m, "wake_batch=%u\n", sbq->wake_batch);
	seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index));
	seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active));

	seq_puts(m, "ws={\n");
	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
		struct sbq_wait_state *ws = &sbq->ws[i];

		seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n",
			   atomic_read(&ws->wait_cnt),
			   waitqueue_active(&ws->wait) ? "active" : "inactive");
	}
	seq_puts(m, "}\n");

	seq_printf(m, "round_robin=%d\n", sbq->sb.round_robin);
	seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_show);

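/*
 * The wait helpers below are used by callers that sleep until a bit is
 * freed. As an illustrative sketch only (loosely modelled on the blk-mq
 * tag-wait loop; the local names and the wait_index passed to sbq_wait_ptr()
 * are assumptions, not a definitive caller):
 *
 *	DEFINE_SBQ_WAIT(wait);
 *	struct sbq_wait_state *ws = sbq_wait_ptr(sbq, &sbq->wake_index);
 *	int nr;
 *
 *	do {
 *		sbitmap_prepare_to_wait(sbq, ws, &wait, TASK_UNINTERRUPTIBLE);
 *		nr = __sbitmap_queue_get(sbq);
 *		if (nr >= 0)
 *			break;
 *		io_schedule();
 *	} while (1);
 *	sbitmap_finish_wait(sbq, ws, &wait);
 */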
"active" : "inactive"); 648 } 649 seq_puts(m, "}\n"); 650 651 seq_printf(m, "round_robin=%d\n", sbq->sb.round_robin); 652 seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth); 653 } 654 EXPORT_SYMBOL_GPL(sbitmap_queue_show); 655 656 void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, 657 struct sbq_wait_state *ws, 658 struct sbq_wait *sbq_wait) 659 { 660 if (!sbq_wait->sbq) { 661 sbq_wait->sbq = sbq; 662 atomic_inc(&sbq->ws_active); 663 add_wait_queue(&ws->wait, &sbq_wait->wait); 664 } 665 } 666 EXPORT_SYMBOL_GPL(sbitmap_add_wait_queue); 667 668 void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait) 669 { 670 list_del_init(&sbq_wait->wait.entry); 671 if (sbq_wait->sbq) { 672 atomic_dec(&sbq_wait->sbq->ws_active); 673 sbq_wait->sbq = NULL; 674 } 675 } 676 EXPORT_SYMBOL_GPL(sbitmap_del_wait_queue); 677 678 void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, 679 struct sbq_wait_state *ws, 680 struct sbq_wait *sbq_wait, int state) 681 { 682 if (!sbq_wait->sbq) { 683 atomic_inc(&sbq->ws_active); 684 sbq_wait->sbq = sbq; 685 } 686 prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state); 687 } 688 EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait); 689 690 void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, 691 struct sbq_wait *sbq_wait) 692 { 693 finish_wait(&ws->wait, &sbq_wait->wait); 694 if (sbq_wait->sbq) { 695 atomic_dec(&sbq->ws_active); 696 sbq_wait->sbq = NULL; 697 } 698 } 699 EXPORT_SYMBOL_GPL(sbitmap_finish_wait); 700