1 /* 2 * Primary bucket allocation code 3 * 4 * Copyright 2012 Google, Inc. 5 * 6 * Allocation in bcache is done in terms of buckets: 7 * 8 * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in 9 * btree pointers - they must match for the pointer to be considered valid. 10 * 11 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a 12 * bucket simply by incrementing its gen. 13 * 14 * The gens (along with the priorities; it's really the gens are important but 15 * the code is named as if it's the priorities) are written in an arbitrary list 16 * of buckets on disk, with a pointer to them in the journal header. 17 * 18 * When we invalidate a bucket, we have to write its new gen to disk and wait 19 * for that write to complete before we use it - otherwise after a crash we 20 * could have pointers that appeared to be good but pointed to data that had 21 * been overwritten. 22 * 23 * Since the gens and priorities are all stored contiguously on disk, we can 24 * batch this up: We fill up the free_inc list with freshly invalidated buckets, 25 * call prio_write(), and when prio_write() finishes we pull buckets off the 26 * free_inc list and optionally discard them. 27 * 28 * free_inc isn't the only freelist - if it was, we'd often to sleep while 29 * priorities and gens were being written before we could allocate. c->free is a 30 * smaller freelist, and buckets on that list are always ready to be used. 31 * 32 * If we've got discards enabled, that happens when a bucket moves from the 33 * free_inc list to the free list. 34 * 35 * There is another freelist, because sometimes we have buckets that we know 36 * have nothing pointing into them - these we can reuse without waiting for 37 * priorities to be rewritten. These come from freed btree nodes and buckets 38 * that garbage collection discovered no longer had valid keys pointing into 39 * them (because they were overwritten). That's the unused list - buckets on the 40 * unused list move to the free list, optionally being discarded in the process. 41 * 42 * It's also important to ensure that gens don't wrap around - with respect to 43 * either the oldest gen in the btree or the gen on disk. This is quite 44 * difficult to do in practice, but we explicitly guard against it anyways - if 45 * a bucket is in danger of wrapping around we simply skip invalidating it that 46 * time around, and we garbage collect or rewrite the priorities sooner than we 47 * would have otherwise. 48 * 49 * bch_bucket_alloc() allocates a single bucket from a specific cache. 50 * 51 * bch_bucket_alloc_set() allocates one or more buckets from different caches 52 * out of a cache set. 53 * 54 * free_some_buckets() drives all the processes described above. It's called 55 * from bch_bucket_alloc() and a few other places that need to make sure free 56 * buckets are ready. 57 * 58 * invalidate_buckets_(lru|fifo)() find buckets that are available to be 59 * invalidated, and then invalidate them and stick them on the free_inc list - 60 * in either lru or fifo order. 61 */ 62 63 #include "bcache.h" 64 #include "btree.h" 65 66 #include <linux/blkdev.h> 67 #include <linux/kthread.h> 68 #include <linux/random.h> 69 #include <trace/events/bcache.h> 70 71 #define MAX_OPEN_BUCKETS 128 72 73 /* Bucket heap / gen */ 74 75 uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) 76 { 77 uint8_t ret = ++b->gen; 78 79 ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); 80 WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); 81 82 return ret; 83 } 84 85 void bch_rescale_priorities(struct cache_set *c, int sectors) 86 { 87 struct cache *ca; 88 struct bucket *b; 89 unsigned next = c->nbuckets * c->sb.bucket_size / 1024; 90 unsigned i; 91 int r; 92 93 atomic_sub(sectors, &c->rescale); 94 95 do { 96 r = atomic_read(&c->rescale); 97 98 if (r >= 0) 99 return; 100 } while (atomic_cmpxchg(&c->rescale, r, r + next) != r); 101 102 mutex_lock(&c->bucket_lock); 103 104 c->min_prio = USHRT_MAX; 105 106 for_each_cache(ca, c, i) 107 for_each_bucket(b, ca) 108 if (b->prio && 109 b->prio != BTREE_PRIO && 110 !atomic_read(&b->pin)) { 111 b->prio--; 112 c->min_prio = min(c->min_prio, b->prio); 113 } 114 115 mutex_unlock(&c->bucket_lock); 116 } 117 118 /* 119 * Background allocation thread: scans for buckets to be invalidated, 120 * invalidates them, rewrites prios/gens (marking them as invalidated on disk), 121 * then optionally issues discard commands to the newly free buckets, then puts 122 * them on the various freelists. 123 */ 124 125 static inline bool can_inc_bucket_gen(struct bucket *b) 126 { 127 return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX; 128 } 129 130 bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) 131 { 132 BUG_ON(!ca->set->gc_mark_valid); 133 134 return (!GC_MARK(b) || 135 GC_MARK(b) == GC_MARK_RECLAIMABLE) && 136 !atomic_read(&b->pin) && 137 can_inc_bucket_gen(b); 138 } 139 140 void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) 141 { 142 lockdep_assert_held(&ca->set->bucket_lock); 143 BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE); 144 145 if (GC_SECTORS_USED(b)) 146 trace_bcache_invalidate(ca, b - ca->buckets); 147 148 bch_inc_gen(ca, b); 149 b->prio = INITIAL_PRIO; 150 atomic_inc(&b->pin); 151 } 152 153 static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) 154 { 155 __bch_invalidate_one_bucket(ca, b); 156 157 fifo_push(&ca->free_inc, b - ca->buckets); 158 } 159 160 /* 161 * Determines what order we're going to reuse buckets, smallest bucket_prio() 162 * first: we also take into account the number of sectors of live data in that 163 * bucket, and in order for that multiply to make sense we have to scale bucket 164 * 165 * Thus, we scale the bucket priorities so that the bucket with the smallest 166 * prio is worth 1/8th of what INITIAL_PRIO is worth. 167 */ 168 169 #define bucket_prio(b) \ 170 ({ \ 171 unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ 172 \ 173 (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ 174 }) 175 176 #define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) 177 #define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) 178 179 static void invalidate_buckets_lru(struct cache *ca) 180 { 181 struct bucket *b; 182 ssize_t i; 183 184 ca->heap.used = 0; 185 186 for_each_bucket(b, ca) { 187 if (!bch_can_invalidate_bucket(ca, b)) 188 continue; 189 190 if (!heap_full(&ca->heap)) 191 heap_add(&ca->heap, b, bucket_max_cmp); 192 else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { 193 ca->heap.data[0] = b; 194 heap_sift(&ca->heap, 0, bucket_max_cmp); 195 } 196 } 197 198 for (i = ca->heap.used / 2 - 1; i >= 0; --i) 199 heap_sift(&ca->heap, i, bucket_min_cmp); 200 201 while (!fifo_full(&ca->free_inc)) { 202 if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { 203 /* 204 * We don't want to be calling invalidate_buckets() 205 * multiple times when it can't do anything 206 */ 207 ca->invalidate_needs_gc = 1; 208 wake_up_gc(ca->set); 209 return; 210 } 211 212 bch_invalidate_one_bucket(ca, b); 213 } 214 } 215 216 static void invalidate_buckets_fifo(struct cache *ca) 217 { 218 struct bucket *b; 219 size_t checked = 0; 220 221 while (!fifo_full(&ca->free_inc)) { 222 if (ca->fifo_last_bucket < ca->sb.first_bucket || 223 ca->fifo_last_bucket >= ca->sb.nbuckets) 224 ca->fifo_last_bucket = ca->sb.first_bucket; 225 226 b = ca->buckets + ca->fifo_last_bucket++; 227 228 if (bch_can_invalidate_bucket(ca, b)) 229 bch_invalidate_one_bucket(ca, b); 230 231 if (++checked >= ca->sb.nbuckets) { 232 ca->invalidate_needs_gc = 1; 233 wake_up_gc(ca->set); 234 return; 235 } 236 } 237 } 238 239 static void invalidate_buckets_random(struct cache *ca) 240 { 241 struct bucket *b; 242 size_t checked = 0; 243 244 while (!fifo_full(&ca->free_inc)) { 245 size_t n; 246 get_random_bytes(&n, sizeof(n)); 247 248 n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket); 249 n += ca->sb.first_bucket; 250 251 b = ca->buckets + n; 252 253 if (bch_can_invalidate_bucket(ca, b)) 254 bch_invalidate_one_bucket(ca, b); 255 256 if (++checked >= ca->sb.nbuckets / 2) { 257 ca->invalidate_needs_gc = 1; 258 wake_up_gc(ca->set); 259 return; 260 } 261 } 262 } 263 264 static void invalidate_buckets(struct cache *ca) 265 { 266 BUG_ON(ca->invalidate_needs_gc); 267 268 switch (CACHE_REPLACEMENT(&ca->sb)) { 269 case CACHE_REPLACEMENT_LRU: 270 invalidate_buckets_lru(ca); 271 break; 272 case CACHE_REPLACEMENT_FIFO: 273 invalidate_buckets_fifo(ca); 274 break; 275 case CACHE_REPLACEMENT_RANDOM: 276 invalidate_buckets_random(ca); 277 break; 278 } 279 } 280 281 #define allocator_wait(ca, cond) \ 282 do { \ 283 while (1) { \ 284 set_current_state(TASK_INTERRUPTIBLE); \ 285 if (cond) \ 286 break; \ 287 \ 288 mutex_unlock(&(ca)->set->bucket_lock); \ 289 if (kthread_should_stop()) \ 290 return 0; \ 291 \ 292 schedule(); \ 293 mutex_lock(&(ca)->set->bucket_lock); \ 294 } \ 295 __set_current_state(TASK_RUNNING); \ 296 } while (0) 297 298 static int bch_allocator_push(struct cache *ca, long bucket) 299 { 300 unsigned i; 301 302 /* Prios/gens are actually the most important reserve */ 303 if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) 304 return true; 305 306 for (i = 0; i < RESERVE_NR; i++) 307 if (fifo_push(&ca->free[i], bucket)) 308 return true; 309 310 return false; 311 } 312 313 static int bch_allocator_thread(void *arg) 314 { 315 struct cache *ca = arg; 316 317 mutex_lock(&ca->set->bucket_lock); 318 319 while (1) { 320 /* 321 * First, we pull buckets off of the unused and free_inc lists, 322 * possibly issue discards to them, then we add the bucket to 323 * the free list: 324 */ 325 while (!fifo_empty(&ca->free_inc)) { 326 long bucket; 327 328 fifo_pop(&ca->free_inc, bucket); 329 330 if (ca->discard) { 331 mutex_unlock(&ca->set->bucket_lock); 332 blkdev_issue_discard(ca->bdev, 333 bucket_to_sector(ca->set, bucket), 334 ca->sb.bucket_size, GFP_KERNEL, 0); 335 mutex_lock(&ca->set->bucket_lock); 336 } 337 338 allocator_wait(ca, bch_allocator_push(ca, bucket)); 339 wake_up(&ca->set->btree_cache_wait); 340 wake_up(&ca->set->bucket_wait); 341 } 342 343 /* 344 * We've run out of free buckets, we need to find some buckets 345 * we can invalidate. First, invalidate them in memory and add 346 * them to the free_inc list: 347 */ 348 349 retry_invalidate: 350 allocator_wait(ca, ca->set->gc_mark_valid && 351 !ca->invalidate_needs_gc); 352 invalidate_buckets(ca); 353 354 /* 355 * Now, we write their new gens to disk so we can start writing 356 * new stuff to them: 357 */ 358 allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); 359 if (CACHE_SYNC(&ca->set->sb)) { 360 /* 361 * This could deadlock if an allocation with a btree 362 * node locked ever blocked - having the btree node 363 * locked would block garbage collection, but here we're 364 * waiting on garbage collection before we invalidate 365 * and free anything. 366 * 367 * But this should be safe since the btree code always 368 * uses btree_check_reserve() before allocating now, and 369 * if it fails it blocks without btree nodes locked. 370 */ 371 if (!fifo_full(&ca->free_inc)) 372 goto retry_invalidate; 373 374 bch_prio_write(ca); 375 } 376 } 377 } 378 379 /* Allocation */ 380 381 long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) 382 { 383 DEFINE_WAIT(w); 384 struct bucket *b; 385 long r; 386 387 /* fastpath */ 388 if (fifo_pop(&ca->free[RESERVE_NONE], r) || 389 fifo_pop(&ca->free[reserve], r)) 390 goto out; 391 392 if (!wait) { 393 trace_bcache_alloc_fail(ca, reserve); 394 return -1; 395 } 396 397 do { 398 prepare_to_wait(&ca->set->bucket_wait, &w, 399 TASK_UNINTERRUPTIBLE); 400 401 mutex_unlock(&ca->set->bucket_lock); 402 schedule(); 403 mutex_lock(&ca->set->bucket_lock); 404 } while (!fifo_pop(&ca->free[RESERVE_NONE], r) && 405 !fifo_pop(&ca->free[reserve], r)); 406 407 finish_wait(&ca->set->bucket_wait, &w); 408 out: 409 wake_up_process(ca->alloc_thread); 410 411 trace_bcache_alloc(ca, reserve); 412 413 if (expensive_debug_checks(ca->set)) { 414 size_t iter; 415 long i; 416 unsigned j; 417 418 for (iter = 0; iter < prio_buckets(ca) * 2; iter++) 419 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); 420 421 for (j = 0; j < RESERVE_NR; j++) 422 fifo_for_each(i, &ca->free[j], iter) 423 BUG_ON(i == r); 424 fifo_for_each(i, &ca->free_inc, iter) 425 BUG_ON(i == r); 426 } 427 428 b = ca->buckets + r; 429 430 BUG_ON(atomic_read(&b->pin) != 1); 431 432 SET_GC_SECTORS_USED(b, ca->sb.bucket_size); 433 434 if (reserve <= RESERVE_PRIO) { 435 SET_GC_MARK(b, GC_MARK_METADATA); 436 SET_GC_MOVE(b, 0); 437 b->prio = BTREE_PRIO; 438 } else { 439 SET_GC_MARK(b, GC_MARK_RECLAIMABLE); 440 SET_GC_MOVE(b, 0); 441 b->prio = INITIAL_PRIO; 442 } 443 444 return r; 445 } 446 447 void __bch_bucket_free(struct cache *ca, struct bucket *b) 448 { 449 SET_GC_MARK(b, 0); 450 SET_GC_SECTORS_USED(b, 0); 451 } 452 453 void bch_bucket_free(struct cache_set *c, struct bkey *k) 454 { 455 unsigned i; 456 457 for (i = 0; i < KEY_PTRS(k); i++) 458 __bch_bucket_free(PTR_CACHE(c, k, i), 459 PTR_BUCKET(c, k, i)); 460 } 461 462 int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, 463 struct bkey *k, int n, bool wait) 464 { 465 int i; 466 467 lockdep_assert_held(&c->bucket_lock); 468 BUG_ON(!n || n > c->caches_loaded || n > 8); 469 470 bkey_init(k); 471 472 /* sort by free space/prio of oldest data in caches */ 473 474 for (i = 0; i < n; i++) { 475 struct cache *ca = c->cache_by_alloc[i]; 476 long b = bch_bucket_alloc(ca, reserve, wait); 477 478 if (b == -1) 479 goto err; 480 481 k->ptr[i] = PTR(ca->buckets[b].gen, 482 bucket_to_sector(c, b), 483 ca->sb.nr_this_dev); 484 485 SET_KEY_PTRS(k, i + 1); 486 } 487 488 return 0; 489 err: 490 bch_bucket_free(c, k); 491 bkey_put(c, k); 492 return -1; 493 } 494 495 int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, 496 struct bkey *k, int n, bool wait) 497 { 498 int ret; 499 mutex_lock(&c->bucket_lock); 500 ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); 501 mutex_unlock(&c->bucket_lock); 502 return ret; 503 } 504 505 /* Sector allocator */ 506 507 struct open_bucket { 508 struct list_head list; 509 unsigned last_write_point; 510 unsigned sectors_free; 511 BKEY_PADDED(key); 512 }; 513 514 /* 515 * We keep multiple buckets open for writes, and try to segregate different 516 * write streams for better cache utilization: first we look for a bucket where 517 * the last write to it was sequential with the current write, and failing that 518 * we look for a bucket that was last used by the same task. 519 * 520 * The ideas is if you've got multiple tasks pulling data into the cache at the 521 * same time, you'll get better cache utilization if you try to segregate their 522 * data and preserve locality. 523 * 524 * For example, say you've starting Firefox at the same time you're copying a 525 * bunch of files. Firefox will likely end up being fairly hot and stay in the 526 * cache awhile, but the data you copied might not be; if you wrote all that 527 * data to the same buckets it'd get invalidated at the same time. 528 * 529 * Both of those tasks will be doing fairly random IO so we can't rely on 530 * detecting sequential IO to segregate their data, but going off of the task 531 * should be a sane heuristic. 532 */ 533 static struct open_bucket *pick_data_bucket(struct cache_set *c, 534 const struct bkey *search, 535 unsigned write_point, 536 struct bkey *alloc) 537 { 538 struct open_bucket *ret, *ret_task = NULL; 539 540 list_for_each_entry_reverse(ret, &c->data_buckets, list) 541 if (!bkey_cmp(&ret->key, search)) 542 goto found; 543 else if (ret->last_write_point == write_point) 544 ret_task = ret; 545 546 ret = ret_task ?: list_first_entry(&c->data_buckets, 547 struct open_bucket, list); 548 found: 549 if (!ret->sectors_free && KEY_PTRS(alloc)) { 550 ret->sectors_free = c->sb.bucket_size; 551 bkey_copy(&ret->key, alloc); 552 bkey_init(alloc); 553 } 554 555 if (!ret->sectors_free) 556 ret = NULL; 557 558 return ret; 559 } 560 561 /* 562 * Allocates some space in the cache to write to, and k to point to the newly 563 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the 564 * end of the newly allocated space). 565 * 566 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many 567 * sectors were actually allocated. 568 * 569 * If s->writeback is true, will not fail. 570 */ 571 bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, 572 unsigned write_point, unsigned write_prio, bool wait) 573 { 574 struct open_bucket *b; 575 BKEY_PADDED(key) alloc; 576 unsigned i; 577 578 /* 579 * We might have to allocate a new bucket, which we can't do with a 580 * spinlock held. So if we have to allocate, we drop the lock, allocate 581 * and then retry. KEY_PTRS() indicates whether alloc points to 582 * allocated bucket(s). 583 */ 584 585 bkey_init(&alloc.key); 586 spin_lock(&c->data_bucket_lock); 587 588 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { 589 unsigned watermark = write_prio 590 ? RESERVE_MOVINGGC 591 : RESERVE_NONE; 592 593 spin_unlock(&c->data_bucket_lock); 594 595 if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) 596 return false; 597 598 spin_lock(&c->data_bucket_lock); 599 } 600 601 /* 602 * If we had to allocate, we might race and not need to allocate the 603 * second time we call find_data_bucket(). If we allocated a bucket but 604 * didn't use it, drop the refcount bch_bucket_alloc_set() took: 605 */ 606 if (KEY_PTRS(&alloc.key)) 607 bkey_put(c, &alloc.key); 608 609 for (i = 0; i < KEY_PTRS(&b->key); i++) 610 EBUG_ON(ptr_stale(c, &b->key, i)); 611 612 /* Set up the pointer to the space we're allocating: */ 613 614 for (i = 0; i < KEY_PTRS(&b->key); i++) 615 k->ptr[i] = b->key.ptr[i]; 616 617 sectors = min(sectors, b->sectors_free); 618 619 SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); 620 SET_KEY_SIZE(k, sectors); 621 SET_KEY_PTRS(k, KEY_PTRS(&b->key)); 622 623 /* 624 * Move b to the end of the lru, and keep track of what this bucket was 625 * last used for: 626 */ 627 list_move_tail(&b->list, &c->data_buckets); 628 bkey_copy_key(&b->key, k); 629 b->last_write_point = write_point; 630 631 b->sectors_free -= sectors; 632 633 for (i = 0; i < KEY_PTRS(&b->key); i++) { 634 SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); 635 636 atomic_long_add(sectors, 637 &PTR_CACHE(c, &b->key, i)->sectors_written); 638 } 639 640 if (b->sectors_free < c->sb.block_size) 641 b->sectors_free = 0; 642 643 /* 644 * k takes refcounts on the buckets it points to until it's inserted 645 * into the btree, but if we're done with this bucket we just transfer 646 * get_data_bucket()'s refcount. 647 */ 648 if (b->sectors_free) 649 for (i = 0; i < KEY_PTRS(&b->key); i++) 650 atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); 651 652 spin_unlock(&c->data_bucket_lock); 653 return true; 654 } 655 656 /* Init */ 657 658 void bch_open_buckets_free(struct cache_set *c) 659 { 660 struct open_bucket *b; 661 662 while (!list_empty(&c->data_buckets)) { 663 b = list_first_entry(&c->data_buckets, 664 struct open_bucket, list); 665 list_del(&b->list); 666 kfree(b); 667 } 668 } 669 670 int bch_open_buckets_alloc(struct cache_set *c) 671 { 672 int i; 673 674 spin_lock_init(&c->data_bucket_lock); 675 676 for (i = 0; i < MAX_OPEN_BUCKETS; i++) { 677 struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); 678 if (!b) 679 return -ENOMEM; 680 681 list_add(&b->list, &c->data_buckets); 682 } 683 684 return 0; 685 } 686 687 int bch_cache_allocator_start(struct cache *ca) 688 { 689 struct task_struct *k = kthread_run(bch_allocator_thread, 690 ca, "bcache_allocator"); 691 if (IS_ERR(k)) 692 return PTR_ERR(k); 693 694 ca->alloc_thread = k; 695 return 0; 696 } 697