/*
 * Main bcache entry point - handle a read or a write request and decide what
 * to do with it; the make_request functions are called by the block layer.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"
#include "writeback.h"

#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/hash.h>
#include <linux/random.h>
#include "blk-cgroup.h"

#include <trace/events/bcache.h>

#define CUTOFF_CACHE_ADD	95
#define CUTOFF_CACHE_READA	90

struct kmem_cache *bch_search_cache;

static void bch_data_insert_start(struct closure *);

/* Cgroup interface */

#ifdef CONFIG_CGROUP_BCACHE
static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 };

static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup)
{
	struct cgroup_subsys_state *css;
	return cgroup &&
		(css = cgroup_subsys_state(cgroup, bcache_subsys_id))
		? container_of(css, struct bch_cgroup, css)
		: &bcache_default_cgroup;
}

struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio)
{
	struct cgroup_subsys_state *css = bio->bi_css
		? cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
		: task_subsys_state(current, bcache_subsys_id);

	return css
		? container_of(css, struct bch_cgroup, css)
		: &bcache_default_cgroup;
}

static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
			       struct file *file,
			       char __user *buf, size_t nbytes, loff_t *ppos)
{
	char tmp[1024];
	int len = bch_snprint_string_list(tmp, sizeof(tmp), bch_cache_modes,
					  cgroup_to_bcache(cgrp)->cache_mode + 1);

	if (len < 0)
		return len;

	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}

static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
			    const char *buf)
{
	int v = bch_read_string_list(buf, bch_cache_modes);
	if (v < 0)
		return v;

	cgroup_to_bcache(cgrp)->cache_mode = v - 1;
	return 0;
}

static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
{
	return cgroup_to_bcache(cgrp)->verify;
}

static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	cgroup_to_bcache(cgrp)->verify = val;
	return 0;
}

static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
{
	struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
	return atomic_read(&bcachecg->stats.cache_hits);
}

static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
{
	struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
	return atomic_read(&bcachecg->stats.cache_misses);
}

static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
				      struct cftype *cft)
{
	struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
	return atomic_read(&bcachecg->stats.cache_bypass_hits);
}

static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
					struct cftype *cft)
{
	struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
	return atomic_read(&bcachecg->stats.cache_bypass_misses);
}

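/*
 * Control files exposed to each cgroup: a writable cache_mode and verify knob
 * plus read-only hit/miss counters.  Note that cache_mode is stored off by
 * one - bcache_default_cgroup uses -1 to mean "no override", and cache_mode()
 * below falls back to the backing device's mode in that case.
 */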
static struct cftype bch_files[] = {
	{
		.name		= "cache_mode",
		.read		= cache_mode_read,
		.write_string	= cache_mode_write,
	},
	{
		.name		= "verify",
		.read_u64	= bch_verify_read,
		.write_u64	= bch_verify_write,
	},
	{
		.name		= "cache_hits",
		.read_u64	= bch_cache_hits_read,
	},
	{
		.name		= "cache_misses",
		.read_u64	= bch_cache_misses_read,
	},
	{
		.name		= "cache_bypass_hits",
		.read_u64	= bch_cache_bypass_hits_read,
	},
	{
		.name		= "cache_bypass_misses",
		.read_u64	= bch_cache_bypass_misses_read,
	},
	{ }	/* terminate */
};

static void init_bch_cgroup(struct bch_cgroup *cg)
{
	cg->cache_mode = -1;
}

static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
{
	struct bch_cgroup *cg;

	cg = kzalloc(sizeof(*cg), GFP_KERNEL);
	if (!cg)
		return ERR_PTR(-ENOMEM);
	init_bch_cgroup(cg);
	return &cg->css;
}

static void bcachecg_destroy(struct cgroup *cgroup)
{
	struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
	kfree(cg);
}

struct cgroup_subsys bcache_subsys = {
	.create		= bcachecg_create,
	.destroy	= bcachecg_destroy,
	.subsys_id	= bcache_subsys_id,
	.name		= "bcache",
	.module		= THIS_MODULE,
};
EXPORT_SYMBOL_GPL(bcache_subsys);
#endif

static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
{
#ifdef CONFIG_CGROUP_BCACHE
	int r = bch_bio_to_cgroup(bio)->cache_mode;
	if (r >= 0)
		return r;
#endif
	return BDEV_CACHE_MODE(&dc->sb);
}

static bool verify(struct cached_dev *dc, struct bio *bio)
{
#ifdef CONFIG_CGROUP_BCACHE
	if (bch_bio_to_cgroup(bio)->verify)
		return true;
#endif
	return dc->verify;
}

static void bio_csum(struct bio *bio, struct bkey *k)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	uint64_t csum = 0;

	bio_for_each_segment(bv, bio, iter) {
		void *d = kmap(bv.bv_page) + bv.bv_offset;
		csum = bch_crc64_update(csum, d, bv.bv_len);
		kunmap(bv.bv_page);
	}

	k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
}

/* Insert data into cache */

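/*
 * Take the keys built up so far in op->insert_keys, journal them (unless this
 * is a replace operation, e.g. promoting data after a cache miss), then insert
 * them into the btree.  If there's still data left to write, bounce back to
 * bch_data_insert_start(); otherwise free the keylist and finish the closure.
 */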
static void bch_data_insert_keys(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
	atomic_t *journal_ref = NULL;
	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
	int ret;

	/*
	 * If we're looping, might already be waiting on
	 * another journal write - can't wait on more than one journal write at
	 * a time
	 *
	 * XXX: this looks wrong
	 */
#if 0
	while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
		closure_sync(&s->cl);
#endif

	if (!op->replace)
		journal_ref = bch_journal(op->c, &op->insert_keys,
					  op->flush_journal ? cl : NULL);

	ret = bch_btree_insert(op->c, &op->insert_keys,
			       journal_ref, replace_key);
	if (ret == -ESRCH) {
		op->replace_collision = true;
	} else if (ret) {
		op->error		= -ENOMEM;
		op->insert_data_done	= true;
	}

	if (journal_ref)
		atomic_dec_bug(journal_ref);

	if (!op->insert_data_done)
		continue_at(cl, bch_data_insert_start, bcache_wq);

	bch_keylist_free(&op->insert_keys);
	closure_return(cl);
}

static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
			       struct cache_set *c)
{
	size_t oldsize = bch_keylist_nkeys(l);
	size_t newsize = oldsize + u64s;

	/*
	 * The journalling code doesn't handle the case where the keys to
	 * insert are bigger than an empty write: if we just return -ENOMEM
	 * here, bch_data_insert_start() and bch_data_invalidate() will insert
	 * the keys created so far and finish the rest when the keylist is
	 * empty.
	 */
	if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
		return -ENOMEM;

	return __bch_keylist_realloc(l, u64s);
}

static void bch_data_invalidate(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
	struct bio *bio = op->bio;

	pr_debug("invalidating %i sectors from %llu",
		 bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);

	while (bio_sectors(bio)) {
		unsigned sectors = min(bio_sectors(bio),
				       1U << (KEY_SIZE_BITS - 1));

		if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
			goto out;

		bio->bi_iter.bi_sector	+= sectors;
		bio->bi_iter.bi_size	-= sectors << 9;

		bch_keylist_add(&op->insert_keys,
				&KEY(op->inode, bio->bi_iter.bi_sector, sectors));
	}

	op->insert_data_done = true;
	bio_put(bio);
out:
	continue_at(cl, bch_data_insert_keys, bcache_wq);
}

static void bch_data_insert_error(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);

	/*
	 * Our data write just errored, which means we've got a bunch of keys
	 * to insert that point to data that wasn't successfully written.
	 *
	 * We don't have to insert those keys but we still have to invalidate
	 * that region of the cache - so, if we just strip off all the pointers
	 * from the keys we'll accomplish just that.
	 */

	struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;

	while (src != op->insert_keys.top) {
		struct bkey *n = bkey_next(src);

		SET_KEY_PTRS(src, 0);
		memmove(dst, src, bkey_bytes(src));

		dst = bkey_next(dst);
		src = n;
	}

	op->insert_keys.top = dst;

	bch_data_insert_keys(cl);
}

static void bch_data_insert_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);

	if (error) {
		/* TODO: We could try to recover from this. */
		if (op->writeback)
			op->error = error;
		else if (!op->replace)
			set_closure_fn(cl, bch_data_insert_error, bcache_wq);
		else
			set_closure_fn(cl, NULL, NULL);
	}

	bch_bbio_endio(op->c, bio, error, "writing data to cache");
}

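/*
 * Allocate space in the cache and build keys for the data being inserted:
 * each pass through the loop below allocates some sectors, splits off a
 * matching chunk of op->bio, and queues that chunk as a write to the cache
 * device; bch_data_insert_keys() then adds the accumulated keys to the btree.
 * If op->bypass is set, the range is invalidated instead of written.
 */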
static void bch_data_insert_start(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
	struct bio *bio = op->bio, *n;

	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
		set_gc_sectors(op->c);
		wake_up_gc(op->c);
	}

	if (op->bypass)
		return bch_data_invalidate(cl);

	/*
	 * Journal writes are marked REQ_FLUSH; if the original write was a
	 * flush, it'll wait on the journal write.
	 */
	bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA);

	do {
		unsigned i;
		struct bkey *k;
		struct bio_set *split = op->c->bio_split;

		/* 1 for the device pointer and 1 for the checksum */
		if (bch_keylist_realloc(&op->insert_keys,
					3 + (op->csum ? 1 : 0),
					op->c))
			continue_at(cl, bch_data_insert_keys, bcache_wq);

		k = op->insert_keys.top;
		bkey_init(k);
		SET_KEY_INODE(k, op->inode);
		SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);

		if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
				       op->write_point, op->write_prio,
				       op->writeback))
			goto err;

		n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);

		n->bi_end_io	= bch_data_insert_endio;
		n->bi_private	= cl;

		if (op->writeback) {
			SET_KEY_DIRTY(k, true);

			for (i = 0; i < KEY_PTRS(k); i++)
				SET_GC_MARK(PTR_BUCKET(op->c, k, i),
					    GC_MARK_DIRTY);
		}

		SET_KEY_CSUM(k, op->csum);
		if (KEY_CSUM(k))
			bio_csum(n, k);

		trace_bcache_cache_insert(k);
		bch_keylist_push(&op->insert_keys);

		n->bi_rw |= REQ_WRITE;
		bch_submit_bbio(n, op->c, k, 0);
	} while (n != bio);

	op->insert_data_done = true;
	continue_at(cl, bch_data_insert_keys, bcache_wq);
err:
	/* bch_alloc_sectors() blocks if op->writeback is true */
	BUG_ON(op->writeback);

	/*
	 * But if it's not a writeback write we'd rather just bail out if
	 * there aren't any buckets ready to write to - it might take a while
	 * and we might be starving btree writes for gc or something.
	 */

	if (!op->replace) {
		/*
		 * Writethrough write: We can't complete the write until we've
		 * updated the index. But we don't want to delay the write
		 * while we wait for buckets to be freed up, so just invalidate
		 * the rest of the write.
		 */
		op->bypass = true;
		return bch_data_invalidate(cl);
	} else {
		/*
		 * From a cache miss, we can just insert the keys for the data
		 * we have written or bail out if we didn't do anything.
		 */
		op->insert_data_done = true;
		bio_put(bio);

		if (!bch_keylist_empty(&op->insert_keys))
			continue_at(cl, bch_data_insert_keys, bcache_wq);
		else
			closure_return(cl);
	}
}

/**
 * bch_data_insert - stick some data in the cache
 *
 * This is the starting point for any data to end up in a cache device; it
 * could be from a normal write, or a writeback write, or a write to a flash
 * only volume - it's also used by the moving garbage collector to compact
 * data in mostly empty buckets.
 *
 * It first writes the data to the cache, creating a list of keys to be
 * inserted (if the data had to be fragmented there will be multiple keys);
 * after the data is written it calls bch_journal, and after the keys have
 * been added to the next journal write they're inserted into the btree.
 *
 * It inserts the data in op->bio; bi_sector is used for the key offset,
 * and op->inode is used for the key inode.
 *
 * If op->bypass is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->bio and op->inode.
 */
void bch_data_insert(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);

	trace_bcache_write(op->bio, op->writeback, op->bypass);

	bch_keylist_init(&op->insert_keys);
	bio_get(op->bio);
	bch_data_insert_start(cl);
}

/* Congested? */

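/*
 * Returns 0 when the cache isn't considered congested.  Otherwise the return
 * value is a cutoff in sectors, derived from c->congested and the time since
 * c->congested_last_us, that check_should_bypass() compares against a task's
 * sequential IO size: the heavier the congestion, the smaller the cutoff and
 * the more IO gets sent straight to the backing device.  A bit of randomness
 * is mixed in so the cutoff isn't a sharp edge.
 */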
unsigned bch_get_congested(struct cache_set *c)
{
	int i;
	long rand;

	if (!c->congested_read_threshold_us &&
	    !c->congested_write_threshold_us)
		return 0;

	i = (local_clock_us() - c->congested_last_us) / 1024;
	if (i < 0)
		return 0;

	i += atomic_read(&c->congested);
	if (i >= 0)
		return 0;

	i += CONGESTED_MAX;

	if (i > 0)
		i = fract_exp_two(i, 6);

	rand = get_random_int();
	i -= bitmap_weight(&rand, BITS_PER_LONG);

	return i > 0 ? i : 1;
}

static void add_sequential(struct task_struct *t)
{
	ewma_add(t->sequential_io_avg,
		 t->sequential_io, 8, 0);

	t->sequential_io = 0;
}

static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
{
	return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
}

static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
{
	struct cache_set *c = dc->disk.c;
	unsigned mode = cache_mode(dc, bio);
	unsigned sectors, congested = bch_get_congested(c);
	struct task_struct *task = current;
	struct io *i;

	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
	    (bio->bi_rw & REQ_DISCARD))
		goto skip;

	if (mode == CACHE_MODE_NONE ||
	    (mode == CACHE_MODE_WRITEAROUND &&
	     (bio->bi_rw & REQ_WRITE)))
		goto skip;

	if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
	    bio_sectors(bio) & (c->sb.block_size - 1)) {
		pr_debug("skipping unaligned io");
		goto skip;
	}

	if (bypass_torture_test(dc)) {
		if ((get_random_int() & 3) == 3)
			goto skip;
		else
			goto rescale;
	}

	if (!congested && !dc->sequential_cutoff)
		goto rescale;

	if (!congested &&
	    mode == CACHE_MODE_WRITEBACK &&
	    (bio->bi_rw & REQ_WRITE) &&
	    (bio->bi_rw & REQ_SYNC))
		goto rescale;

	spin_lock(&dc->io_lock);

	hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
		if (i->last == bio->bi_iter.bi_sector &&
		    time_before(jiffies, i->jiffies))
			goto found;

	i = list_first_entry(&dc->io_lru, struct io, lru);

	add_sequential(task);
	i->sequential = 0;
found:
	if (i->sequential + bio->bi_iter.bi_size > i->sequential)
		i->sequential += bio->bi_iter.bi_size;

	i->last			= bio_end_sector(bio);
	i->jiffies		= jiffies + msecs_to_jiffies(5000);
	task->sequential_io	= i->sequential;

	hlist_del(&i->hash);
	hlist_add_head(&i->hash, iohash(dc, i->last));
	list_move_tail(&i->lru, &dc->io_lru);

	spin_unlock(&dc->io_lock);

	sectors = max(task->sequential_io,
		      task->sequential_io_avg) >> 9;

	if (dc->sequential_cutoff &&
	    sectors >= dc->sequential_cutoff >> 9) {
		trace_bcache_bypass_sequential(bio);
		goto skip;
	}

	if (congested && sectors >= congested) {
		trace_bcache_bypass_congested(bio);
		goto skip;
	}

rescale:
	bch_rescale_priorities(c, bio_sectors(bio));
	return false;
skip:
	bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
	return true;
}

/* Cache lookup */

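/*
 * A struct search is allocated per request from a mempool and lives for the
 * duration of that request: it holds the closure the completion path runs
 * under, a clone of the original bio, and the data_insert_op used when data
 * ends up being written into the cache.
 */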
struct search {
	/* Stack frame for bio_complete */
	struct closure		cl;

	struct bbio		bio;
	struct bio		*orig_bio;
	struct bio		*cache_miss;
	struct bcache_device	*d;

	unsigned		insert_bio_sectors;
	unsigned		recoverable:1;
	unsigned		write:1;
	unsigned		read_dirty_data:1;

	unsigned long		start_time;

	struct btree_op		op;
	struct data_insert_op	iop;
};

static void bch_cache_read_endio(struct bio *bio, int error)
{
	struct bbio *b = container_of(bio, struct bbio, bio);
	struct closure *cl = bio->bi_private;
	struct search *s = container_of(cl, struct search, cl);

	/*
	 * If the bucket was reused while our bio was in flight, we might have
	 * read the wrong data. Set s->iop.error but not error so it doesn't
	 * get counted against the cache device, but we'll still reread the
	 * data from the backing device.
	 */

	if (error)
		s->iop.error = error;
	else if (!KEY_DIRTY(&b->key) &&
		 ptr_stale(s->iop.c, &b->key, 0)) {
		atomic_long_inc(&s->iop.c->cache_read_races);
		s->iop.error = -EINTR;
	}

	bch_bbio_endio(s->iop.c, bio, error, "reading from cache");
}

/*
 * Read from a single key, handling the initial cache miss if the key starts in
 * the middle of the bio
 */
static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
{
	struct search *s = container_of(op, struct search, op);
	struct bio *n, *bio = &s->bio.bio;
	struct bkey *bio_key;
	unsigned ptr;

	if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
		return MAP_CONTINUE;

	if (KEY_INODE(k) != s->iop.inode ||
	    KEY_START(k) > bio->bi_iter.bi_sector) {
		unsigned bio_sectors = bio_sectors(bio);
		unsigned sectors = KEY_INODE(k) == s->iop.inode
			? min_t(uint64_t, INT_MAX,
				KEY_START(k) - bio->bi_iter.bi_sector)
			: INT_MAX;

		int ret = s->d->cache_miss(b, s, bio, sectors);
		if (ret != MAP_CONTINUE)
			return ret;

		/* if this was a complete miss we shouldn't get here */
		BUG_ON(bio_sectors <= sectors);
	}

	if (!KEY_SIZE(k))
		return MAP_CONTINUE;

	/* XXX: figure out best pointer - for multiple cache devices */
	ptr = 0;

	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;

	if (KEY_DIRTY(k))
		s->read_dirty_data = true;

	n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
				      KEY_OFFSET(k) - bio->bi_iter.bi_sector),
			   GFP_NOIO, s->d->bio_split);

	bio_key = &container_of(n, struct bbio, bio)->key;
	bch_bkey_copy_single_ptr(bio_key, k, ptr);

	bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
	bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);

	n->bi_end_io	= bch_cache_read_endio;
	n->bi_private	= &s->cl;

	/*
	 * The bucket we're reading from might be reused while our bio
	 * is in flight, and we could then end up reading the wrong
	 * data.
	 *
	 * We guard against this by checking (in bch_cache_read_endio()) if
	 * the pointer is stale again; if so, we treat it as an error
	 * and reread from the backing device (but we don't pass that
	 * error up anywhere).
	 */

	__bch_submit_bbio(n, b->c);
	return n == bio ? MAP_DONE : MAP_CONTINUE;
}

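/*
 * Walk the btree over the range covered by the (possibly partially completed)
 * bio, calling cache_lookup_fn() for each key that overlaps it; if the walk
 * couldn't finish without blocking (-EAGAIN), reschedule on bcache_wq and
 * pick up where we left off.
 */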
static void cache_lookup(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, iop.cl);
	struct bio *bio = &s->bio.bio;
	int ret;

	bch_btree_op_init(&s->op, -1);

	ret = bch_btree_map_keys(&s->op, s->iop.c,
				 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
				 cache_lookup_fn, MAP_END_KEY);
	if (ret == -EAGAIN)
		continue_at(cl, cache_lookup, bcache_wq);

	closure_return(cl);
}

/* Common code for the make_request functions */

static void request_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;

	if (error) {
		struct search *s = container_of(cl, struct search, cl);
		s->iop.error = error;
		/* Only cache read errors are recoverable */
		s->recoverable = false;
	}

	bio_put(bio);
	closure_put(cl);
}

static void bio_complete(struct search *s)
{
	if (s->orig_bio) {
		int cpu, rw = bio_data_dir(s->orig_bio);
		unsigned long duration = jiffies - s->start_time;

		cpu = part_stat_lock();
		part_round_stats(cpu, &s->d->disk->part0);
		part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration);
		part_stat_unlock();

		trace_bcache_request_end(s->d, s->orig_bio);
		bio_endio(s->orig_bio, s->iop.error);
		s->orig_bio = NULL;
	}
}

static void do_bio_hook(struct search *s, struct bio *orig_bio)
{
	struct bio *bio = &s->bio.bio;

	bio_init(bio);
	__bio_clone_fast(bio, orig_bio);
	bio->bi_end_io		= request_endio;
	bio->bi_private		= &s->cl;

	atomic_set(&bio->bi_cnt, 3);
}

static void search_free(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	bio_complete(s);

	if (s->iop.bio)
		bio_put(s->iop.bio);

	closure_debug_destroy(cl);
	mempool_free(s, s->d->c->search);
}

static inline struct search *search_alloc(struct bio *bio,
					  struct bcache_device *d)
{
	struct search *s;

	s = mempool_alloc(d->c->search, GFP_NOIO);

	closure_init(&s->cl, NULL);
	do_bio_hook(s, bio);

	s->orig_bio		= bio;
	s->cache_miss		= NULL;
	s->d			= d;
	s->recoverable		= 1;
	s->write		= (bio->bi_rw & REQ_WRITE) != 0;
	s->read_dirty_data	= 0;
	s->start_time		= jiffies;

	s->iop.c		= d->c;
	s->iop.bio		= NULL;
	s->iop.inode		= d->id;
	s->iop.write_point	= hash_long((unsigned long) current, 16);
	s->iop.write_prio	= 0;
	s->iop.error		= 0;
	s->iop.flags		= 0;
	s->iop.flush_journal	= (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;

	return s;
}

/* Cached devices */

static void cached_dev_bio_complete(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	search_free(cl);
	cached_dev_put(dc);
}

/* Process reads */

static void cached_dev_cache_miss_done(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);

	if (s->iop.replace_collision)
		bch_mark_cache_miss_collision(s->iop.c, s->d);

	if (s->iop.bio) {
		int i;
		struct bio_vec *bv;

		bio_for_each_segment_all(bv, s->iop.bio, i)
			__free_page(bv->bv_page);
	}

	cached_dev_bio_complete(cl);
}

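/*
 * The read from the cache device failed (or we detected that the bucket was
 * reused underneath us): if the request is still recoverable, clear the error
 * and retry the whole read against the backing device.
 */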
static void cached_dev_read_error(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct bio *bio = &s->bio.bio;

	if (s->recoverable) {
		/* Retry from the backing device: */
		trace_bcache_read_retry(s->orig_bio);

		s->iop.error = 0;
		do_bio_hook(s, s->orig_bio);

		/* XXX: invalidate cache */

		closure_bio_submit(bio, cl, s->d);
	}

	continue_at(cl, cached_dev_cache_miss_done, NULL);
}

static void cached_dev_read_done(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	/*
	 * We had a cache miss; cache_bio now contains data ready to be
	 * inserted into the cache.
	 *
	 * First, we copy the data we just read from cache_bio's bounce
	 * buffers to the buffers the original bio pointed to:
	 */

	if (s->iop.bio) {
		bio_reset(s->iop.bio);
		s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector;
		s->iop.bio->bi_bdev = s->cache_miss->bi_bdev;
		s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
		bch_bio_map(s->iop.bio, NULL);

		bio_copy_data(s->cache_miss, s->iop.bio);

		bio_put(s->cache_miss);
		s->cache_miss = NULL;
	}

	if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data)
		bch_data_verify(dc, s->orig_bio);

	bio_complete(s);

	if (s->iop.bio &&
	    !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
		BUG_ON(!s->iop.replace);
		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	}

	continue_at(cl, cached_dev_cache_miss_done, NULL);
}

static void cached_dev_read_done_bh(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	bch_mark_cache_accounting(s->iop.c, s->d,
				  !s->cache_miss, s->iop.bypass);
	trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);

	if (s->iop.error)
		continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
	else if (s->iop.bio || verify(dc, &s->bio.bio))
		continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
	else
		continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
}

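/*
 * Handle the portion of a read the cache couldn't satisfy: read it (plus
 * optional readahead) from the backing device into a freshly allocated
 * cache_bio, and set things up so cached_dev_read_done() will both copy the
 * data back into the original bio and insert it into the cache.  The
 * replace_key recorded here makes the later insert back off gracefully if the
 * relevant part of the btree changed in the meantime.
 */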
static int cached_dev_cache_miss(struct btree *b, struct search *s,
				 struct bio *bio, unsigned sectors)
{
	int ret = MAP_CONTINUE;
	unsigned reada = 0;
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
	struct bio *miss, *cache_bio;

	if (s->cache_miss || s->iop.bypass) {
		miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
		ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
		goto out_submit;
	}

	if (!(bio->bi_rw & REQ_RAHEAD) &&
	    !(bio->bi_rw & REQ_META) &&
	    s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
		reada = min_t(sector_t, dc->readahead >> 9,
			      bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));

	s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);

	s->iop.replace_key = KEY(s->iop.inode,
				 bio->bi_iter.bi_sector + s->insert_bio_sectors,
				 s->insert_bio_sectors);

	ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
	if (ret)
		return ret;

	s->iop.replace = true;

	miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);

	/* btree_search_recurse()'s btree iterator is no good anymore */
	ret = miss == bio ? MAP_DONE : -EINTR;

	cache_bio = bio_alloc_bioset(GFP_NOWAIT,
			DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
			dc->disk.bio_split);
	if (!cache_bio)
		goto out_submit;

	cache_bio->bi_iter.bi_sector	= miss->bi_iter.bi_sector;
	cache_bio->bi_bdev		= miss->bi_bdev;
	cache_bio->bi_iter.bi_size	= s->insert_bio_sectors << 9;

	cache_bio->bi_end_io	= request_endio;
	cache_bio->bi_private	= &s->cl;

	bch_bio_map(cache_bio, NULL);
	if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
		goto out_put;

	if (reada)
		bch_mark_cache_readahead(s->iop.c, s->d);

	s->cache_miss	= miss;
	s->iop.bio	= cache_bio;
	bio_get(cache_bio);
	closure_bio_submit(cache_bio, &s->cl, s->d);

	return ret;
out_put:
	bio_put(cache_bio);
out_submit:
	miss->bi_end_io		= request_endio;
	miss->bi_private	= &s->cl;
	closure_bio_submit(miss, &s->cl, s->d);
	return ret;
}

static void cached_dev_read(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;

	closure_call(&s->iop.cl, cache_lookup, NULL, cl);
	continue_at(cl, cached_dev_read_done_bh, NULL);
}

/* Process writes */

static void cached_dev_write_complete(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	up_read_non_owner(&dc->writeback_lock);
	cached_dev_bio_complete(cl);
}

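/*
 * Writes take one of three paths, chosen below: bypass (send the write only to
 * the backing device and invalidate any stale data in the cache), writeback
 * (write only into the cache and mark the data dirty), or writethrough (write
 * to the cache and the backing device in parallel).  In every case
 * bch_data_insert() is what updates the cache.
 */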
static void cached_dev_write(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;
	struct bio *bio = &s->bio.bio;
	struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0);
	struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);

	bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);

	down_read_non_owner(&dc->writeback_lock);
	if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
		/*
		 * We overlap with some dirty data undergoing background
		 * writeback, force this write to writeback
		 */
		s->iop.bypass = false;
		s->iop.writeback = true;
	}

	/*
	 * Discards aren't _required_ to do anything, so skipping if
	 * check_overlapping returned true is ok
	 *
	 * But check_overlapping drops dirty keys for which io hasn't started,
	 * so we still want to call it.
	 */
	if (bio->bi_rw & REQ_DISCARD)
		s->iop.bypass = true;

	if (should_writeback(dc, s->orig_bio,
			     cache_mode(dc, bio),
			     s->iop.bypass)) {
		s->iop.bypass = false;
		s->iop.writeback = true;
	}

	if (s->iop.bypass) {
		s->iop.bio = s->orig_bio;
		bio_get(s->iop.bio);

		if (!(bio->bi_rw & REQ_DISCARD) ||
		    blk_queue_discard(bdev_get_queue(dc->bdev)))
			closure_bio_submit(bio, cl, s->d);
	} else if (s->iop.writeback) {
		bch_writeback_add(dc);
		s->iop.bio = bio;

		if (bio->bi_rw & REQ_FLUSH) {
			/* Also need to send a flush to the backing device */
			struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0,
							     dc->disk.bio_split);

			flush->bi_rw	= WRITE_FLUSH;
			flush->bi_bdev	= bio->bi_bdev;
			flush->bi_end_io = request_endio;
			flush->bi_private = cl;

			closure_bio_submit(flush, cl, s->d);
		}
	} else {
		s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split);

		closure_bio_submit(bio, cl, s->d);
	}

	closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	continue_at(cl, cached_dev_write_complete, NULL);
}

static void cached_dev_nodata(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct bio *bio = &s->bio.bio;

	if (s->iop.flush_journal)
		bch_journal_meta(s->iop.c, cl);

	/* If it's a flush, we send the flush to the backing device too */
	closure_bio_submit(bio, cl, s->d);

	continue_at(cl, cached_dev_bio_complete, NULL);
}

/* Cached devices - read & write stuff */

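/*
 * make_request entry point for a bcache (backing) device: account the IO,
 * remap the bio onto the backing device (adding sb.data_offset), and hand it
 * to the read or write paths above.  If we can't take a reference on the
 * cached_dev (it's being torn down), the bio is passed straight through to
 * the backing device instead.
 */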
static void cached_dev_make_request(struct request_queue *q, struct bio *bio)
{
	struct search *s;
	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
	int cpu, rw = bio_data_dir(bio);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &d->disk->part0, ios[rw]);
	part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
	part_stat_unlock();

	bio->bi_bdev = dc->bdev;
	bio->bi_iter.bi_sector += dc->sb.data_offset;

	if (cached_dev_get(dc)) {
		s = search_alloc(bio, d);
		trace_bcache_request_start(s->d, bio);

		if (!bio->bi_iter.bi_size) {
			/*
			 * can't call bch_journal_meta from under
			 * generic_make_request
			 */
			continue_at_nobarrier(&s->cl,
					      cached_dev_nodata,
					      bcache_wq);
		} else {
			s->iop.bypass = check_should_bypass(dc, bio);

			if (rw)
				cached_dev_write(dc, s);
			else
				cached_dev_read(dc, s);
		}
	} else {
		if ((bio->bi_rw & REQ_DISCARD) &&
		    !blk_queue_discard(bdev_get_queue(dc->bdev)))
			bio_endio(bio, 0);
		else
			bch_generic_make_request(bio, &d->bio_split_hook);
	}
}

static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
			    unsigned int cmd, unsigned long arg)
{
	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
	return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
}

static int cached_dev_congested(void *data, int bits)
{
	struct bcache_device *d = data;
	struct cached_dev *dc = container_of(d, struct cached_dev, disk);
	struct request_queue *q = bdev_get_queue(dc->bdev);
	int ret = 0;

	if (bdi_congested(&q->backing_dev_info, bits))
		return 1;

	if (cached_dev_get(dc)) {
		unsigned i;
		struct cache *ca;

		for_each_cache(ca, d->c, i) {
			q = bdev_get_queue(ca->bdev);
			ret |= bdi_congested(&q->backing_dev_info, bits);
		}

		cached_dev_put(dc);
	}

	return ret;
}

void bch_cached_dev_request_init(struct cached_dev *dc)
{
	struct gendisk *g = dc->disk.disk;

	g->queue->make_request_fn		= cached_dev_make_request;
	g->queue->backing_dev_info.congested_fn = cached_dev_congested;
	dc->disk.cache_miss			= cached_dev_cache_miss;
	dc->disk.ioctl				= cached_dev_ioctl;
}

/* Flash backed devices */

static int flash_dev_cache_miss(struct btree *b, struct search *s,
				struct bio *bio, unsigned sectors)
{
	struct bio_vec bv;
	struct bvec_iter iter;

	/* Zero fill bio */

	bio_for_each_segment(bv, bio, iter) {
		unsigned j = min(bv.bv_len >> 9, sectors);

		void *p = kmap(bv.bv_page);
		memset(p + bv.bv_offset, 0, j << 9);
		kunmap(bv.bv_page);

		sectors -= j;
	}

	bio_advance(bio, min(sectors << 9, bio->bi_iter.bi_size));

	if (!bio->bi_iter.bi_size)
		return MAP_DONE;

	return MAP_CONTINUE;
}

static void flash_dev_nodata(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);

	if (s->iop.flush_journal)
		bch_journal_meta(s->iop.c, cl);

	continue_at(cl, search_free, NULL);
}

static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
{
	struct search *s;
	struct closure *cl;
	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
	int cpu, rw = bio_data_dir(bio);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &d->disk->part0, ios[rw]);
	part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio));
	part_stat_unlock();

	s = search_alloc(bio, d);
	cl = &s->cl;
	bio = &s->bio.bio;

	trace_bcache_request_start(s->d, bio);

	if (!bio->bi_iter.bi_size) {
		/*
		 * can't call bch_journal_meta from under
		 * generic_make_request
		 */
		continue_at_nobarrier(&s->cl,
				      flash_dev_nodata,
				      bcache_wq);
	} else if (rw) {
		bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
					&KEY(d->id, bio->bi_iter.bi_sector, 0),
					&KEY(d->id, bio_end_sector(bio), 0));

		s->iop.bypass		= (bio->bi_rw & REQ_DISCARD) != 0;
		s->iop.writeback	= true;
		s->iop.bio		= bio;

		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	} else {
		closure_call(&s->iop.cl, cache_lookup, NULL, cl);
	}

	continue_at(cl, search_free, NULL);
}

static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	return -ENOTTY;
}

static int flash_dev_congested(void *data, int bits)
{
	struct bcache_device *d = data;
	struct request_queue *q;
	struct cache *ca;
	unsigned i;
	int ret = 0;

	for_each_cache(ca, d->c, i) {
		q = bdev_get_queue(ca->bdev);
		ret |= bdi_congested(&q->backing_dev_info, bits);
	}

	return ret;
}

void bch_flash_dev_request_init(struct bcache_device *d)
{
	struct gendisk *g = d->disk;

	g->queue->make_request_fn		= flash_dev_make_request;
	g->queue->backing_dev_info.congested_fn = flash_dev_congested;
	d->cache_miss				= flash_dev_cache_miss;
	d->ioctl				= flash_dev_ioctl;
}

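/*
 * Module init/exit: set up the slab cache used for struct search allocations
 * (and, when CONFIG_CGROUP_BCACHE is enabled, register the cgroup subsystem
 * and its control files); tear both down again on exit.
 */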
void bch_request_exit(void)
{
#ifdef CONFIG_CGROUP_BCACHE
	cgroup_unload_subsys(&bcache_subsys);
#endif
	if (bch_search_cache)
		kmem_cache_destroy(bch_search_cache);
}

int __init bch_request_init(void)
{
	bch_search_cache = KMEM_CACHE(search, 0);
	if (!bch_search_cache)
		return -ENOMEM;

#ifdef CONFIG_CGROUP_BCACHE
	cgroup_load_subsys(&bcache_subsys);
	init_bch_cgroup(&bcache_default_cgroup);

	cgroup_add_cftypes(&bcache_subsys, bch_files);
#endif
	return 0;
}