1 /* 2 * Primary bucket allocation code 3 * 4 * Copyright 2012 Google, Inc. 5 * 6 * Allocation in bcache is done in terms of buckets: 7 * 8 * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in 9 * btree pointers - they must match for the pointer to be considered valid. 10 * 11 * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a 12 * bucket simply by incrementing its gen. 13 * 14 * The gens (along with the priorities; it's really the gens are important but 15 * the code is named as if it's the priorities) are written in an arbitrary list 16 * of buckets on disk, with a pointer to them in the journal header. 17 * 18 * When we invalidate a bucket, we have to write its new gen to disk and wait 19 * for that write to complete before we use it - otherwise after a crash we 20 * could have pointers that appeared to be good but pointed to data that had 21 * been overwritten. 22 * 23 * Since the gens and priorities are all stored contiguously on disk, we can 24 * batch this up: We fill up the free_inc list with freshly invalidated buckets, 25 * call prio_write(), and when prio_write() finishes we pull buckets off the 26 * free_inc list and optionally discard them. 27 * 28 * free_inc isn't the only freelist - if it was, we'd often to sleep while 29 * priorities and gens were being written before we could allocate. c->free is a 30 * smaller freelist, and buckets on that list are always ready to be used. 31 * 32 * If we've got discards enabled, that happens when a bucket moves from the 33 * free_inc list to the free list. 34 * 35 * There is another freelist, because sometimes we have buckets that we know 36 * have nothing pointing into them - these we can reuse without waiting for 37 * priorities to be rewritten. These come from freed btree nodes and buckets 38 * that garbage collection discovered no longer had valid keys pointing into 39 * them (because they were overwritten). That's the unused list - buckets on the 40 * unused list move to the free list, optionally being discarded in the process. 41 * 42 * It's also important to ensure that gens don't wrap around - with respect to 43 * either the oldest gen in the btree or the gen on disk. This is quite 44 * difficult to do in practice, but we explicitly guard against it anyways - if 45 * a bucket is in danger of wrapping around we simply skip invalidating it that 46 * time around, and we garbage collect or rewrite the priorities sooner than we 47 * would have otherwise. 48 * 49 * bch_bucket_alloc() allocates a single bucket from a specific cache. 50 * 51 * bch_bucket_alloc_set() allocates one or more buckets from different caches 52 * out of a cache set. 53 * 54 * free_some_buckets() drives all the processes described above. It's called 55 * from bch_bucket_alloc() and a few other places that need to make sure free 56 * buckets are ready. 57 * 58 * invalidate_buckets_(lru|fifo)() find buckets that are available to be 59 * invalidated, and then invalidate them and stick them on the free_inc list - 60 * in either lru or fifo order. 61 */ 62 63 #include "bcache.h" 64 #include "btree.h" 65 66 #include <linux/blkdev.h> 67 #include <linux/freezer.h> 68 #include <linux/kthread.h> 69 #include <linux/random.h> 70 #include <trace/events/bcache.h> 71 72 /* Bucket heap / gen */ 73 74 uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) 75 { 76 uint8_t ret = ++b->gen; 77 78 ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); 79 WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); 80 81 return ret; 82 } 83 84 void bch_rescale_priorities(struct cache_set *c, int sectors) 85 { 86 struct cache *ca; 87 struct bucket *b; 88 unsigned next = c->nbuckets * c->sb.bucket_size / 1024; 89 unsigned i; 90 int r; 91 92 atomic_sub(sectors, &c->rescale); 93 94 do { 95 r = atomic_read(&c->rescale); 96 97 if (r >= 0) 98 return; 99 } while (atomic_cmpxchg(&c->rescale, r, r + next) != r); 100 101 mutex_lock(&c->bucket_lock); 102 103 c->min_prio = USHRT_MAX; 104 105 for_each_cache(ca, c, i) 106 for_each_bucket(b, ca) 107 if (b->prio && 108 b->prio != BTREE_PRIO && 109 !atomic_read(&b->pin)) { 110 b->prio--; 111 c->min_prio = min(c->min_prio, b->prio); 112 } 113 114 mutex_unlock(&c->bucket_lock); 115 } 116 117 /* 118 * Background allocation thread: scans for buckets to be invalidated, 119 * invalidates them, rewrites prios/gens (marking them as invalidated on disk), 120 * then optionally issues discard commands to the newly free buckets, then puts 121 * them on the various freelists. 122 */ 123 124 static inline bool can_inc_bucket_gen(struct bucket *b) 125 { 126 return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX; 127 } 128 129 bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) 130 { 131 BUG_ON(!ca->set->gc_mark_valid); 132 133 return (!GC_MARK(b) || 134 GC_MARK(b) == GC_MARK_RECLAIMABLE) && 135 !atomic_read(&b->pin) && 136 can_inc_bucket_gen(b); 137 } 138 139 void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) 140 { 141 lockdep_assert_held(&ca->set->bucket_lock); 142 BUG_ON(GC_MARK(b) && GC_MARK(b) != GC_MARK_RECLAIMABLE); 143 144 if (GC_SECTORS_USED(b)) 145 trace_bcache_invalidate(ca, b - ca->buckets); 146 147 bch_inc_gen(ca, b); 148 b->prio = INITIAL_PRIO; 149 atomic_inc(&b->pin); 150 } 151 152 static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) 153 { 154 __bch_invalidate_one_bucket(ca, b); 155 156 fifo_push(&ca->free_inc, b - ca->buckets); 157 } 158 159 /* 160 * Determines what order we're going to reuse buckets, smallest bucket_prio() 161 * first: we also take into account the number of sectors of live data in that 162 * bucket, and in order for that multiply to make sense we have to scale bucket 163 * 164 * Thus, we scale the bucket priorities so that the bucket with the smallest 165 * prio is worth 1/8th of what INITIAL_PRIO is worth. 166 */ 167 168 #define bucket_prio(b) \ 169 ({ \ 170 unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ 171 \ 172 (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ 173 }) 174 175 #define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) 176 #define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) 177 178 static void invalidate_buckets_lru(struct cache *ca) 179 { 180 struct bucket *b; 181 ssize_t i; 182 183 ca->heap.used = 0; 184 185 for_each_bucket(b, ca) { 186 if (!bch_can_invalidate_bucket(ca, b)) 187 continue; 188 189 if (!heap_full(&ca->heap)) 190 heap_add(&ca->heap, b, bucket_max_cmp); 191 else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { 192 ca->heap.data[0] = b; 193 heap_sift(&ca->heap, 0, bucket_max_cmp); 194 } 195 } 196 197 for (i = ca->heap.used / 2 - 1; i >= 0; --i) 198 heap_sift(&ca->heap, i, bucket_min_cmp); 199 200 while (!fifo_full(&ca->free_inc)) { 201 if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { 202 /* 203 * We don't want to be calling invalidate_buckets() 204 * multiple times when it can't do anything 205 */ 206 ca->invalidate_needs_gc = 1; 207 wake_up_gc(ca->set); 208 return; 209 } 210 211 bch_invalidate_one_bucket(ca, b); 212 } 213 } 214 215 static void invalidate_buckets_fifo(struct cache *ca) 216 { 217 struct bucket *b; 218 size_t checked = 0; 219 220 while (!fifo_full(&ca->free_inc)) { 221 if (ca->fifo_last_bucket < ca->sb.first_bucket || 222 ca->fifo_last_bucket >= ca->sb.nbuckets) 223 ca->fifo_last_bucket = ca->sb.first_bucket; 224 225 b = ca->buckets + ca->fifo_last_bucket++; 226 227 if (bch_can_invalidate_bucket(ca, b)) 228 bch_invalidate_one_bucket(ca, b); 229 230 if (++checked >= ca->sb.nbuckets) { 231 ca->invalidate_needs_gc = 1; 232 wake_up_gc(ca->set); 233 return; 234 } 235 } 236 } 237 238 static void invalidate_buckets_random(struct cache *ca) 239 { 240 struct bucket *b; 241 size_t checked = 0; 242 243 while (!fifo_full(&ca->free_inc)) { 244 size_t n; 245 get_random_bytes(&n, sizeof(n)); 246 247 n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket); 248 n += ca->sb.first_bucket; 249 250 b = ca->buckets + n; 251 252 if (bch_can_invalidate_bucket(ca, b)) 253 bch_invalidate_one_bucket(ca, b); 254 255 if (++checked >= ca->sb.nbuckets / 2) { 256 ca->invalidate_needs_gc = 1; 257 wake_up_gc(ca->set); 258 return; 259 } 260 } 261 } 262 263 static void invalidate_buckets(struct cache *ca) 264 { 265 BUG_ON(ca->invalidate_needs_gc); 266 267 switch (CACHE_REPLACEMENT(&ca->sb)) { 268 case CACHE_REPLACEMENT_LRU: 269 invalidate_buckets_lru(ca); 270 break; 271 case CACHE_REPLACEMENT_FIFO: 272 invalidate_buckets_fifo(ca); 273 break; 274 case CACHE_REPLACEMENT_RANDOM: 275 invalidate_buckets_random(ca); 276 break; 277 } 278 } 279 280 #define allocator_wait(ca, cond) \ 281 do { \ 282 while (1) { \ 283 set_current_state(TASK_INTERRUPTIBLE); \ 284 if (cond) \ 285 break; \ 286 \ 287 mutex_unlock(&(ca)->set->bucket_lock); \ 288 if (kthread_should_stop()) \ 289 return 0; \ 290 \ 291 try_to_freeze(); \ 292 schedule(); \ 293 mutex_lock(&(ca)->set->bucket_lock); \ 294 } \ 295 __set_current_state(TASK_RUNNING); \ 296 } while (0) 297 298 static int bch_allocator_push(struct cache *ca, long bucket) 299 { 300 unsigned i; 301 302 /* Prios/gens are actually the most important reserve */ 303 if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) 304 return true; 305 306 for (i = 0; i < RESERVE_NR; i++) 307 if (fifo_push(&ca->free[i], bucket)) 308 return true; 309 310 return false; 311 } 312 313 static int bch_allocator_thread(void *arg) 314 { 315 struct cache *ca = arg; 316 317 mutex_lock(&ca->set->bucket_lock); 318 319 while (1) { 320 /* 321 * First, we pull buckets off of the unused and free_inc lists, 322 * possibly issue discards to them, then we add the bucket to 323 * the free list: 324 */ 325 while (!fifo_empty(&ca->free_inc)) { 326 long bucket; 327 328 fifo_pop(&ca->free_inc, bucket); 329 330 if (ca->discard) { 331 mutex_unlock(&ca->set->bucket_lock); 332 blkdev_issue_discard(ca->bdev, 333 bucket_to_sector(ca->set, bucket), 334 ca->sb.block_size, GFP_KERNEL, 0); 335 mutex_lock(&ca->set->bucket_lock); 336 } 337 338 allocator_wait(ca, bch_allocator_push(ca, bucket)); 339 wake_up(&ca->set->btree_cache_wait); 340 wake_up(&ca->set->bucket_wait); 341 } 342 343 /* 344 * We've run out of free buckets, we need to find some buckets 345 * we can invalidate. First, invalidate them in memory and add 346 * them to the free_inc list: 347 */ 348 349 retry_invalidate: 350 allocator_wait(ca, ca->set->gc_mark_valid && 351 !ca->invalidate_needs_gc); 352 invalidate_buckets(ca); 353 354 /* 355 * Now, we write their new gens to disk so we can start writing 356 * new stuff to them: 357 */ 358 allocator_wait(ca, !atomic_read(&ca->set->prio_blocked)); 359 if (CACHE_SYNC(&ca->set->sb)) { 360 /* 361 * This could deadlock if an allocation with a btree 362 * node locked ever blocked - having the btree node 363 * locked would block garbage collection, but here we're 364 * waiting on garbage collection before we invalidate 365 * and free anything. 366 * 367 * But this should be safe since the btree code always 368 * uses btree_check_reserve() before allocating now, and 369 * if it fails it blocks without btree nodes locked. 370 */ 371 if (!fifo_full(&ca->free_inc)) 372 goto retry_invalidate; 373 374 bch_prio_write(ca); 375 } 376 } 377 } 378 379 /* Allocation */ 380 381 long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) 382 { 383 DEFINE_WAIT(w); 384 struct bucket *b; 385 long r; 386 387 /* fastpath */ 388 if (fifo_pop(&ca->free[RESERVE_NONE], r) || 389 fifo_pop(&ca->free[reserve], r)) 390 goto out; 391 392 if (!wait) { 393 trace_bcache_alloc_fail(ca, reserve); 394 return -1; 395 } 396 397 do { 398 prepare_to_wait(&ca->set->bucket_wait, &w, 399 TASK_UNINTERRUPTIBLE); 400 401 mutex_unlock(&ca->set->bucket_lock); 402 schedule(); 403 mutex_lock(&ca->set->bucket_lock); 404 } while (!fifo_pop(&ca->free[RESERVE_NONE], r) && 405 !fifo_pop(&ca->free[reserve], r)); 406 407 finish_wait(&ca->set->bucket_wait, &w); 408 out: 409 wake_up_process(ca->alloc_thread); 410 411 trace_bcache_alloc(ca, reserve); 412 413 if (expensive_debug_checks(ca->set)) { 414 size_t iter; 415 long i; 416 unsigned j; 417 418 for (iter = 0; iter < prio_buckets(ca) * 2; iter++) 419 BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); 420 421 for (j = 0; j < RESERVE_NR; j++) 422 fifo_for_each(i, &ca->free[j], iter) 423 BUG_ON(i == r); 424 fifo_for_each(i, &ca->free_inc, iter) 425 BUG_ON(i == r); 426 } 427 428 b = ca->buckets + r; 429 430 BUG_ON(atomic_read(&b->pin) != 1); 431 432 SET_GC_SECTORS_USED(b, ca->sb.bucket_size); 433 434 if (reserve <= RESERVE_PRIO) { 435 SET_GC_MARK(b, GC_MARK_METADATA); 436 SET_GC_MOVE(b, 0); 437 b->prio = BTREE_PRIO; 438 } else { 439 SET_GC_MARK(b, GC_MARK_RECLAIMABLE); 440 SET_GC_MOVE(b, 0); 441 b->prio = INITIAL_PRIO; 442 } 443 444 return r; 445 } 446 447 void __bch_bucket_free(struct cache *ca, struct bucket *b) 448 { 449 SET_GC_MARK(b, 0); 450 SET_GC_SECTORS_USED(b, 0); 451 } 452 453 void bch_bucket_free(struct cache_set *c, struct bkey *k) 454 { 455 unsigned i; 456 457 for (i = 0; i < KEY_PTRS(k); i++) 458 __bch_bucket_free(PTR_CACHE(c, k, i), 459 PTR_BUCKET(c, k, i)); 460 } 461 462 int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, 463 struct bkey *k, int n, bool wait) 464 { 465 int i; 466 467 lockdep_assert_held(&c->bucket_lock); 468 BUG_ON(!n || n > c->caches_loaded || n > 8); 469 470 bkey_init(k); 471 472 /* sort by free space/prio of oldest data in caches */ 473 474 for (i = 0; i < n; i++) { 475 struct cache *ca = c->cache_by_alloc[i]; 476 long b = bch_bucket_alloc(ca, reserve, wait); 477 478 if (b == -1) 479 goto err; 480 481 k->ptr[i] = PTR(ca->buckets[b].gen, 482 bucket_to_sector(c, b), 483 ca->sb.nr_this_dev); 484 485 SET_KEY_PTRS(k, i + 1); 486 } 487 488 return 0; 489 err: 490 bch_bucket_free(c, k); 491 bkey_put(c, k); 492 return -1; 493 } 494 495 int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, 496 struct bkey *k, int n, bool wait) 497 { 498 int ret; 499 mutex_lock(&c->bucket_lock); 500 ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); 501 mutex_unlock(&c->bucket_lock); 502 return ret; 503 } 504 505 /* Sector allocator */ 506 507 struct open_bucket { 508 struct list_head list; 509 unsigned last_write_point; 510 unsigned sectors_free; 511 BKEY_PADDED(key); 512 }; 513 514 /* 515 * We keep multiple buckets open for writes, and try to segregate different 516 * write streams for better cache utilization: first we look for a bucket where 517 * the last write to it was sequential with the current write, and failing that 518 * we look for a bucket that was last used by the same task. 519 * 520 * The ideas is if you've got multiple tasks pulling data into the cache at the 521 * same time, you'll get better cache utilization if you try to segregate their 522 * data and preserve locality. 523 * 524 * For example, say you've starting Firefox at the same time you're copying a 525 * bunch of files. Firefox will likely end up being fairly hot and stay in the 526 * cache awhile, but the data you copied might not be; if you wrote all that 527 * data to the same buckets it'd get invalidated at the same time. 528 * 529 * Both of those tasks will be doing fairly random IO so we can't rely on 530 * detecting sequential IO to segregate their data, but going off of the task 531 * should be a sane heuristic. 532 */ 533 static struct open_bucket *pick_data_bucket(struct cache_set *c, 534 const struct bkey *search, 535 unsigned write_point, 536 struct bkey *alloc) 537 { 538 struct open_bucket *ret, *ret_task = NULL; 539 540 list_for_each_entry_reverse(ret, &c->data_buckets, list) 541 if (!bkey_cmp(&ret->key, search)) 542 goto found; 543 else if (ret->last_write_point == write_point) 544 ret_task = ret; 545 546 ret = ret_task ?: list_first_entry(&c->data_buckets, 547 struct open_bucket, list); 548 found: 549 if (!ret->sectors_free && KEY_PTRS(alloc)) { 550 ret->sectors_free = c->sb.bucket_size; 551 bkey_copy(&ret->key, alloc); 552 bkey_init(alloc); 553 } 554 555 if (!ret->sectors_free) 556 ret = NULL; 557 558 return ret; 559 } 560 561 /* 562 * Allocates some space in the cache to write to, and k to point to the newly 563 * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the 564 * end of the newly allocated space). 565 * 566 * May allocate fewer sectors than @sectors, KEY_SIZE(k) indicates how many 567 * sectors were actually allocated. 568 * 569 * If s->writeback is true, will not fail. 570 */ 571 bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, 572 unsigned write_point, unsigned write_prio, bool wait) 573 { 574 struct open_bucket *b; 575 BKEY_PADDED(key) alloc; 576 unsigned i; 577 578 /* 579 * We might have to allocate a new bucket, which we can't do with a 580 * spinlock held. So if we have to allocate, we drop the lock, allocate 581 * and then retry. KEY_PTRS() indicates whether alloc points to 582 * allocated bucket(s). 583 */ 584 585 bkey_init(&alloc.key); 586 spin_lock(&c->data_bucket_lock); 587 588 while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { 589 unsigned watermark = write_prio 590 ? RESERVE_MOVINGGC 591 : RESERVE_NONE; 592 593 spin_unlock(&c->data_bucket_lock); 594 595 if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) 596 return false; 597 598 spin_lock(&c->data_bucket_lock); 599 } 600 601 /* 602 * If we had to allocate, we might race and not need to allocate the 603 * second time we call find_data_bucket(). If we allocated a bucket but 604 * didn't use it, drop the refcount bch_bucket_alloc_set() took: 605 */ 606 if (KEY_PTRS(&alloc.key)) 607 bkey_put(c, &alloc.key); 608 609 for (i = 0; i < KEY_PTRS(&b->key); i++) 610 EBUG_ON(ptr_stale(c, &b->key, i)); 611 612 /* Set up the pointer to the space we're allocating: */ 613 614 for (i = 0; i < KEY_PTRS(&b->key); i++) 615 k->ptr[i] = b->key.ptr[i]; 616 617 sectors = min(sectors, b->sectors_free); 618 619 SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); 620 SET_KEY_SIZE(k, sectors); 621 SET_KEY_PTRS(k, KEY_PTRS(&b->key)); 622 623 /* 624 * Move b to the end of the lru, and keep track of what this bucket was 625 * last used for: 626 */ 627 list_move_tail(&b->list, &c->data_buckets); 628 bkey_copy_key(&b->key, k); 629 b->last_write_point = write_point; 630 631 b->sectors_free -= sectors; 632 633 for (i = 0; i < KEY_PTRS(&b->key); i++) { 634 SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors); 635 636 atomic_long_add(sectors, 637 &PTR_CACHE(c, &b->key, i)->sectors_written); 638 } 639 640 if (b->sectors_free < c->sb.block_size) 641 b->sectors_free = 0; 642 643 /* 644 * k takes refcounts on the buckets it points to until it's inserted 645 * into the btree, but if we're done with this bucket we just transfer 646 * get_data_bucket()'s refcount. 647 */ 648 if (b->sectors_free) 649 for (i = 0; i < KEY_PTRS(&b->key); i++) 650 atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin); 651 652 spin_unlock(&c->data_bucket_lock); 653 return true; 654 } 655 656 /* Init */ 657 658 void bch_open_buckets_free(struct cache_set *c) 659 { 660 struct open_bucket *b; 661 662 while (!list_empty(&c->data_buckets)) { 663 b = list_first_entry(&c->data_buckets, 664 struct open_bucket, list); 665 list_del(&b->list); 666 kfree(b); 667 } 668 } 669 670 int bch_open_buckets_alloc(struct cache_set *c) 671 { 672 int i; 673 674 spin_lock_init(&c->data_bucket_lock); 675 676 for (i = 0; i < 6; i++) { 677 struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); 678 if (!b) 679 return -ENOMEM; 680 681 list_add(&b->list, &c->data_buckets); 682 } 683 684 return 0; 685 } 686 687 int bch_cache_allocator_start(struct cache *ca) 688 { 689 struct task_struct *k = kthread_run(bch_allocator_thread, 690 ca, "bcache_allocator"); 691 if (IS_ERR(k)) 692 return PTR_ERR(k); 693 694 ca->alloc_thread = k; 695 return 0; 696 } 697