1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2012 Fusion-io All rights reserved. 4 * Copyright (C) 2012 Intel Corp. All rights reserved. 5 */ 6 7 #include <linux/sched.h> 8 #include <linux/wait.h> 9 #include <linux/bio.h> 10 #include <linux/slab.h> 11 #include <linux/buffer_head.h> 12 #include <linux/blkdev.h> 13 #include <linux/random.h> 14 #include <linux/iocontext.h> 15 #include <linux/capability.h> 16 #include <linux/ratelimit.h> 17 #include <linux/kthread.h> 18 #include <linux/raid/pq.h> 19 #include <linux/hash.h> 20 #include <linux/list_sort.h> 21 #include <linux/raid/xor.h> 22 #include <linux/mm.h> 23 #include <asm/div64.h> 24 #include "ctree.h" 25 #include "extent_map.h" 26 #include "disk-io.h" 27 #include "transaction.h" 28 #include "print-tree.h" 29 #include "volumes.h" 30 #include "raid56.h" 31 #include "async-thread.h" 32 #include "check-integrity.h" 33 #include "rcu-string.h" 34 35 /* set when additional merges to this rbio are not allowed */ 36 #define RBIO_RMW_LOCKED_BIT 1 37 38 /* 39 * set when this rbio is sitting in the hash, but it is just a cache 40 * of past RMW 41 */ 42 #define RBIO_CACHE_BIT 2 43 44 /* 45 * set when it is safe to trust the stripe_pages for caching 46 */ 47 #define RBIO_CACHE_READY_BIT 3 48 49 #define RBIO_CACHE_SIZE 1024 50 51 enum btrfs_rbio_ops { 52 BTRFS_RBIO_WRITE, 53 BTRFS_RBIO_READ_REBUILD, 54 BTRFS_RBIO_PARITY_SCRUB, 55 BTRFS_RBIO_REBUILD_MISSING, 56 }; 57 58 struct btrfs_raid_bio { 59 struct btrfs_fs_info *fs_info; 60 struct btrfs_bio *bbio; 61 62 /* while we're doing rmw on a stripe 63 * we put it into a hash table so we can 64 * lock the stripe and merge more rbios 65 * into it. 66 */ 67 struct list_head hash_list; 68 69 /* 70 * LRU list for the stripe cache 71 */ 72 struct list_head stripe_cache; 73 74 /* 75 * for scheduling work in the helper threads 76 */ 77 struct btrfs_work work; 78 79 /* 80 * bio list and bio_list_lock are used 81 * to add more bios into the stripe 82 * in hopes of avoiding the full rmw 83 */ 84 struct bio_list bio_list; 85 spinlock_t bio_list_lock; 86 87 /* also protected by the bio_list_lock, the 88 * plug list is used by the plugging code 89 * to collect partial bios while plugged. The 90 * stripe locking code also uses it to hand off 91 * the stripe lock to the next pending IO 92 */ 93 struct list_head plug_list; 94 95 /* 96 * flags that tell us if it is safe to 97 * merge with this bio 98 */ 99 unsigned long flags; 100 101 /* size of each individual stripe on disk */ 102 int stripe_len; 103 104 /* number of data stripes (no p/q) */ 105 int nr_data; 106 107 int real_stripes; 108 109 int stripe_npages; 110 /* 111 * set if we're doing a parity rebuild 112 * for a read from higher up, which is handled 113 * differently from a parity rebuild as part of 114 * rmw 115 */ 116 enum btrfs_rbio_ops operation; 117 118 /* first bad stripe */ 119 int faila; 120 121 /* second bad stripe (for raid6 use) */ 122 int failb; 123 124 int scrubp; 125 /* 126 * number of pages needed to represent the full 127 * stripe 128 */ 129 int nr_pages; 130 131 /* 132 * size of all the bios in the bio_list. This 133 * helps us decide if the rbio maps to a full 134 * stripe or not 135 */ 136 int bio_list_bytes; 137 138 int generic_bio_cnt; 139 140 refcount_t refs; 141 142 atomic_t stripes_pending; 143 144 atomic_t error; 145 /* 146 * these are two arrays of pointers. 
We allocate the 147 * rbio big enough to hold them both and setup their 148 * locations when the rbio is allocated 149 */ 150 151 /* pointers to pages that we allocated for 152 * reading/writing stripes directly from the disk (including P/Q) 153 */ 154 struct page **stripe_pages; 155 156 /* 157 * pointers to the pages in the bio_list. Stored 158 * here for faster lookup 159 */ 160 struct page **bio_pages; 161 162 /* 163 * bitmap to record which horizontal stripe has data 164 */ 165 unsigned long *dbitmap; 166 167 /* allocated with real_stripes-many pointers for finish_*() calls */ 168 void **finish_pointers; 169 170 /* allocated with stripe_npages-many bits for finish_*() calls */ 171 unsigned long *finish_pbitmap; 172 }; 173 174 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 175 static noinline void finish_rmw(struct btrfs_raid_bio *rbio); 176 static void rmw_work(struct btrfs_work *work); 177 static void read_rebuild_work(struct btrfs_work *work); 178 static void async_rmw_stripe(struct btrfs_raid_bio *rbio); 179 static void async_read_rebuild(struct btrfs_raid_bio *rbio); 180 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); 181 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); 182 static void __free_raid_bio(struct btrfs_raid_bio *rbio); 183 static void index_rbio_pages(struct btrfs_raid_bio *rbio); 184 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 185 186 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 187 int need_check); 188 static void async_scrub_parity(struct btrfs_raid_bio *rbio); 189 190 /* 191 * the stripe hash table is used for locking, and to collect 192 * bios in hopes of making a full stripe 193 */ 194 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 195 { 196 struct btrfs_stripe_hash_table *table; 197 struct btrfs_stripe_hash_table *x; 198 struct btrfs_stripe_hash *cur; 199 struct btrfs_stripe_hash *h; 200 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; 201 int i; 202 int table_size; 203 204 if (info->stripe_hash_table) 205 return 0; 206 207 /* 208 * The table is large, starting with order 4 and can go as high as 209 * order 7 in case lock debugging is turned on. 210 * 211 * Try harder to allocate and fallback to vmalloc to lower the chance 212 * of a failing mount. 213 */ 214 table_size = sizeof(*table) + sizeof(*h) * num_entries; 215 table = kvzalloc(table_size, GFP_KERNEL); 216 if (!table) 217 return -ENOMEM; 218 219 spin_lock_init(&table->cache_lock); 220 INIT_LIST_HEAD(&table->stripe_cache); 221 222 h = table->table; 223 224 for (i = 0; i < num_entries; i++) { 225 cur = h + i; 226 INIT_LIST_HEAD(&cur->hash_list); 227 spin_lock_init(&cur->lock); 228 } 229 230 x = cmpxchg(&info->stripe_hash_table, NULL, table); 231 if (x) 232 kvfree(x); 233 return 0; 234 } 235 236 /* 237 * caching an rbio means to copy anything from the 238 * bio_pages array into the stripe_pages array. We 239 * use the page uptodate bit in the stripe cache array 240 * to indicate if it has valid data 241 * 242 * once the caching is done, we set the cache ready 243 * bit. 
244 */ 245 static void cache_rbio_pages(struct btrfs_raid_bio *rbio) 246 { 247 int i; 248 char *s; 249 char *d; 250 int ret; 251 252 ret = alloc_rbio_pages(rbio); 253 if (ret) 254 return; 255 256 for (i = 0; i < rbio->nr_pages; i++) { 257 if (!rbio->bio_pages[i]) 258 continue; 259 260 s = kmap(rbio->bio_pages[i]); 261 d = kmap(rbio->stripe_pages[i]); 262 263 memcpy(d, s, PAGE_SIZE); 264 265 kunmap(rbio->bio_pages[i]); 266 kunmap(rbio->stripe_pages[i]); 267 SetPageUptodate(rbio->stripe_pages[i]); 268 } 269 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 270 } 271 272 /* 273 * we hash on the first logical address of the stripe 274 */ 275 static int rbio_bucket(struct btrfs_raid_bio *rbio) 276 { 277 u64 num = rbio->bbio->raid_map[0]; 278 279 /* 280 * we shift down quite a bit. We're using byte 281 * addressing, and most of the lower bits are zeros. 282 * This tends to upset hash_64, and it consistently 283 * returns just one or two different values. 284 * 285 * shifting off the lower bits fixes things. 286 */ 287 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 288 } 289 290 /* 291 * stealing an rbio means taking all the uptodate pages from the stripe 292 * array in the source rbio and putting them into the destination rbio 293 */ 294 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) 295 { 296 int i; 297 struct page *s; 298 struct page *d; 299 300 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) 301 return; 302 303 for (i = 0; i < dest->nr_pages; i++) { 304 s = src->stripe_pages[i]; 305 if (!s || !PageUptodate(s)) { 306 continue; 307 } 308 309 d = dest->stripe_pages[i]; 310 if (d) 311 __free_page(d); 312 313 dest->stripe_pages[i] = s; 314 src->stripe_pages[i] = NULL; 315 } 316 } 317 318 /* 319 * merging means we take the bio_list from the victim and 320 * splice it into the destination. The victim should 321 * be discarded afterwards. 322 * 323 * must be called with dest->rbio_list_lock held 324 */ 325 static void merge_rbio(struct btrfs_raid_bio *dest, 326 struct btrfs_raid_bio *victim) 327 { 328 bio_list_merge(&dest->bio_list, &victim->bio_list); 329 dest->bio_list_bytes += victim->bio_list_bytes; 330 dest->generic_bio_cnt += victim->generic_bio_cnt; 331 bio_list_init(&victim->bio_list); 332 } 333 334 /* 335 * used to prune items that are in the cache. The caller 336 * must hold the hash table lock. 337 */ 338 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 339 { 340 int bucket = rbio_bucket(rbio); 341 struct btrfs_stripe_hash_table *table; 342 struct btrfs_stripe_hash *h; 343 int freeit = 0; 344 345 /* 346 * check the bit again under the hash table lock. 347 */ 348 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 349 return; 350 351 table = rbio->fs_info->stripe_hash_table; 352 h = table->table + bucket; 353 354 /* hold the lock for the bucket because we may be 355 * removing it from the hash table 356 */ 357 spin_lock(&h->lock); 358 359 /* 360 * hold the lock for the bio list because we need 361 * to make sure the bio list is empty 362 */ 363 spin_lock(&rbio->bio_list_lock); 364 365 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { 366 list_del_init(&rbio->stripe_cache); 367 table->cache_size -= 1; 368 freeit = 1; 369 370 /* if the bio list isn't empty, this rbio is 371 * still involved in an IO. We take it out 372 * of the cache list, and drop the ref that 373 * was held for the list. 
374 * 375 * If the bio_list was empty, we also remove 376 * the rbio from the hash_table, and drop 377 * the corresponding ref 378 */ 379 if (bio_list_empty(&rbio->bio_list)) { 380 if (!list_empty(&rbio->hash_list)) { 381 list_del_init(&rbio->hash_list); 382 refcount_dec(&rbio->refs); 383 BUG_ON(!list_empty(&rbio->plug_list)); 384 } 385 } 386 } 387 388 spin_unlock(&rbio->bio_list_lock); 389 spin_unlock(&h->lock); 390 391 if (freeit) 392 __free_raid_bio(rbio); 393 } 394 395 /* 396 * prune a given rbio from the cache 397 */ 398 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 399 { 400 struct btrfs_stripe_hash_table *table; 401 unsigned long flags; 402 403 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 404 return; 405 406 table = rbio->fs_info->stripe_hash_table; 407 408 spin_lock_irqsave(&table->cache_lock, flags); 409 __remove_rbio_from_cache(rbio); 410 spin_unlock_irqrestore(&table->cache_lock, flags); 411 } 412 413 /* 414 * remove everything in the cache 415 */ 416 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) 417 { 418 struct btrfs_stripe_hash_table *table; 419 unsigned long flags; 420 struct btrfs_raid_bio *rbio; 421 422 table = info->stripe_hash_table; 423 424 spin_lock_irqsave(&table->cache_lock, flags); 425 while (!list_empty(&table->stripe_cache)) { 426 rbio = list_entry(table->stripe_cache.next, 427 struct btrfs_raid_bio, 428 stripe_cache); 429 __remove_rbio_from_cache(rbio); 430 } 431 spin_unlock_irqrestore(&table->cache_lock, flags); 432 } 433 434 /* 435 * remove all cached entries and free the hash table 436 * used by unmount 437 */ 438 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 439 { 440 if (!info->stripe_hash_table) 441 return; 442 btrfs_clear_rbio_cache(info); 443 kvfree(info->stripe_hash_table); 444 info->stripe_hash_table = NULL; 445 } 446 447 /* 448 * insert an rbio into the stripe cache. It 449 * must have already been prepared by calling 450 * cache_rbio_pages 451 * 452 * If this rbio was already cached, it gets 453 * moved to the front of the lru. 454 * 455 * If the size of the rbio cache is too big, we 456 * prune an item. 457 */ 458 static void cache_rbio(struct btrfs_raid_bio *rbio) 459 { 460 struct btrfs_stripe_hash_table *table; 461 unsigned long flags; 462 463 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) 464 return; 465 466 table = rbio->fs_info->stripe_hash_table; 467 468 spin_lock_irqsave(&table->cache_lock, flags); 469 spin_lock(&rbio->bio_list_lock); 470 471 /* bump our ref if we were not in the list before */ 472 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) 473 refcount_inc(&rbio->refs); 474 475 if (!list_empty(&rbio->stripe_cache)){ 476 list_move(&rbio->stripe_cache, &table->stripe_cache); 477 } else { 478 list_add(&rbio->stripe_cache, &table->stripe_cache); 479 table->cache_size += 1; 480 } 481 482 spin_unlock(&rbio->bio_list_lock); 483 484 if (table->cache_size > RBIO_CACHE_SIZE) { 485 struct btrfs_raid_bio *found; 486 487 found = list_entry(table->stripe_cache.prev, 488 struct btrfs_raid_bio, 489 stripe_cache); 490 491 if (found != rbio) 492 __remove_rbio_from_cache(found); 493 } 494 495 spin_unlock_irqrestore(&table->cache_lock, flags); 496 } 497 498 /* 499 * helper function to run the xor_blocks api. It is only 500 * able to do MAX_XOR_BLOCKS at a time, so we need to 501 * loop through. 
502 */ 503 static void run_xor(void **pages, int src_cnt, ssize_t len) 504 { 505 int src_off = 0; 506 int xor_src_cnt = 0; 507 void *dest = pages[src_cnt]; 508 509 while(src_cnt > 0) { 510 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); 511 xor_blocks(xor_src_cnt, len, dest, pages + src_off); 512 513 src_cnt -= xor_src_cnt; 514 src_off += xor_src_cnt; 515 } 516 } 517 518 /* 519 * returns true if the bio list inside this rbio 520 * covers an entire stripe (no rmw required). 521 * Must be called with the bio list lock held, or 522 * at a time when you know it is impossible to add 523 * new bios into the list 524 */ 525 static int __rbio_is_full(struct btrfs_raid_bio *rbio) 526 { 527 unsigned long size = rbio->bio_list_bytes; 528 int ret = 1; 529 530 if (size != rbio->nr_data * rbio->stripe_len) 531 ret = 0; 532 533 BUG_ON(size > rbio->nr_data * rbio->stripe_len); 534 return ret; 535 } 536 537 static int rbio_is_full(struct btrfs_raid_bio *rbio) 538 { 539 unsigned long flags; 540 int ret; 541 542 spin_lock_irqsave(&rbio->bio_list_lock, flags); 543 ret = __rbio_is_full(rbio); 544 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 545 return ret; 546 } 547 548 /* 549 * returns 1 if it is safe to merge two rbios together. 550 * The merging is safe if the two rbios correspond to 551 * the same stripe and if they are both going in the same 552 * direction (read vs write), and if neither one is 553 * locked for final IO 554 * 555 * The caller is responsible for locking such that 556 * rmw_locked is safe to test 557 */ 558 static int rbio_can_merge(struct btrfs_raid_bio *last, 559 struct btrfs_raid_bio *cur) 560 { 561 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || 562 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) 563 return 0; 564 565 /* 566 * we can't merge with cached rbios, since the 567 * idea is that when we merge the destination 568 * rbio is going to run our IO for us. We can 569 * steal from cached rbios though, other functions 570 * handle that. 571 */ 572 if (test_bit(RBIO_CACHE_BIT, &last->flags) || 573 test_bit(RBIO_CACHE_BIT, &cur->flags)) 574 return 0; 575 576 if (last->bbio->raid_map[0] != 577 cur->bbio->raid_map[0]) 578 return 0; 579 580 /* we can't merge with different operations */ 581 if (last->operation != cur->operation) 582 return 0; 583 /* 584 * We've need read the full stripe from the drive. 585 * check and repair the parity and write the new results. 586 * 587 * We're not allowed to add any new bios to the 588 * bio list here, anyone else that wants to 589 * change this stripe needs to do their own rmw. 
590 */ 591 if (last->operation == BTRFS_RBIO_PARITY_SCRUB) 592 return 0; 593 594 if (last->operation == BTRFS_RBIO_REBUILD_MISSING) 595 return 0; 596 597 if (last->operation == BTRFS_RBIO_READ_REBUILD) { 598 int fa = last->faila; 599 int fb = last->failb; 600 int cur_fa = cur->faila; 601 int cur_fb = cur->failb; 602 603 if (last->faila >= last->failb) { 604 fa = last->failb; 605 fb = last->faila; 606 } 607 608 if (cur->faila >= cur->failb) { 609 cur_fa = cur->failb; 610 cur_fb = cur->faila; 611 } 612 613 if (fa != cur_fa || fb != cur_fb) 614 return 0; 615 } 616 return 1; 617 } 618 619 static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, 620 int index) 621 { 622 return stripe * rbio->stripe_npages + index; 623 } 624 625 /* 626 * these are just the pages from the rbio array, not from anything 627 * the FS sent down to us 628 */ 629 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, 630 int index) 631 { 632 return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; 633 } 634 635 /* 636 * helper to index into the pstripe 637 */ 638 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) 639 { 640 return rbio_stripe_page(rbio, rbio->nr_data, index); 641 } 642 643 /* 644 * helper to index into the qstripe, returns null 645 * if there is no qstripe 646 */ 647 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 648 { 649 if (rbio->nr_data + 1 == rbio->real_stripes) 650 return NULL; 651 return rbio_stripe_page(rbio, rbio->nr_data + 1, index); 652 } 653 654 /* 655 * The first stripe in the table for a logical address 656 * has the lock. rbios are added in one of three ways: 657 * 658 * 1) Nobody has the stripe locked yet. The rbio is given 659 * the lock and 0 is returned. The caller must start the IO 660 * themselves. 661 * 662 * 2) Someone has the stripe locked, but we're able to merge 663 * with the lock owner. The rbio is freed and the IO will 664 * start automatically along with the existing rbio. 1 is returned. 665 * 666 * 3) Someone has the stripe locked, but we're not able to merge. 667 * The rbio is added to the lock owner's plug list, or merged into 668 * an rbio already on the plug list. When the lock owner unlocks, 669 * the next rbio on the list is run and the IO is started automatically. 670 * 1 is returned 671 * 672 * If we return 0, the caller still owns the rbio and must continue with 673 * IO submission. If we return 1, the caller must assume the rbio has 674 * already been freed. 675 */ 676 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 677 { 678 int bucket = rbio_bucket(rbio); 679 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; 680 struct btrfs_raid_bio *cur; 681 struct btrfs_raid_bio *pending; 682 unsigned long flags; 683 struct btrfs_raid_bio *freeit = NULL; 684 struct btrfs_raid_bio *cache_drop = NULL; 685 int ret = 0; 686 687 spin_lock_irqsave(&h->lock, flags); 688 list_for_each_entry(cur, &h->hash_list, hash_list) { 689 if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) { 690 spin_lock(&cur->bio_list_lock); 691 692 /* can we steal this cached rbio's pages? 
*/ 693 if (bio_list_empty(&cur->bio_list) && 694 list_empty(&cur->plug_list) && 695 test_bit(RBIO_CACHE_BIT, &cur->flags) && 696 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { 697 list_del_init(&cur->hash_list); 698 refcount_dec(&cur->refs); 699 700 steal_rbio(cur, rbio); 701 cache_drop = cur; 702 spin_unlock(&cur->bio_list_lock); 703 704 goto lockit; 705 } 706 707 /* can we merge into the lock owner? */ 708 if (rbio_can_merge(cur, rbio)) { 709 merge_rbio(cur, rbio); 710 spin_unlock(&cur->bio_list_lock); 711 freeit = rbio; 712 ret = 1; 713 goto out; 714 } 715 716 717 /* 718 * we couldn't merge with the running 719 * rbio, see if we can merge with the 720 * pending ones. We don't have to 721 * check for rmw_locked because there 722 * is no way they are inside finish_rmw 723 * right now 724 */ 725 list_for_each_entry(pending, &cur->plug_list, 726 plug_list) { 727 if (rbio_can_merge(pending, rbio)) { 728 merge_rbio(pending, rbio); 729 spin_unlock(&cur->bio_list_lock); 730 freeit = rbio; 731 ret = 1; 732 goto out; 733 } 734 } 735 736 /* no merging, put us on the tail of the plug list, 737 * our rbio will be started with the currently 738 * running rbio unlocks 739 */ 740 list_add_tail(&rbio->plug_list, &cur->plug_list); 741 spin_unlock(&cur->bio_list_lock); 742 ret = 1; 743 goto out; 744 } 745 } 746 lockit: 747 refcount_inc(&rbio->refs); 748 list_add(&rbio->hash_list, &h->hash_list); 749 out: 750 spin_unlock_irqrestore(&h->lock, flags); 751 if (cache_drop) 752 remove_rbio_from_cache(cache_drop); 753 if (freeit) 754 __free_raid_bio(freeit); 755 return ret; 756 } 757 758 /* 759 * called as rmw or parity rebuild is completed. If the plug list has more 760 * rbios waiting for this stripe, the next one on the list will be started 761 */ 762 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 763 { 764 int bucket; 765 struct btrfs_stripe_hash *h; 766 unsigned long flags; 767 int keep_cache = 0; 768 769 bucket = rbio_bucket(rbio); 770 h = rbio->fs_info->stripe_hash_table->table + bucket; 771 772 if (list_empty(&rbio->plug_list)) 773 cache_rbio(rbio); 774 775 spin_lock_irqsave(&h->lock, flags); 776 spin_lock(&rbio->bio_list_lock); 777 778 if (!list_empty(&rbio->hash_list)) { 779 /* 780 * if we're still cached and there is no other IO 781 * to perform, just leave this rbio here for others 782 * to steal from later 783 */ 784 if (list_empty(&rbio->plug_list) && 785 test_bit(RBIO_CACHE_BIT, &rbio->flags)) { 786 keep_cache = 1; 787 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 788 BUG_ON(!bio_list_empty(&rbio->bio_list)); 789 goto done; 790 } 791 792 list_del_init(&rbio->hash_list); 793 refcount_dec(&rbio->refs); 794 795 /* 796 * we use the plug list to hold all the rbios 797 * waiting for the chance to lock this stripe. 798 * hand the lock over to one of them. 
799 */ 800 if (!list_empty(&rbio->plug_list)) { 801 struct btrfs_raid_bio *next; 802 struct list_head *head = rbio->plug_list.next; 803 804 next = list_entry(head, struct btrfs_raid_bio, 805 plug_list); 806 807 list_del_init(&rbio->plug_list); 808 809 list_add(&next->hash_list, &h->hash_list); 810 refcount_inc(&next->refs); 811 spin_unlock(&rbio->bio_list_lock); 812 spin_unlock_irqrestore(&h->lock, flags); 813 814 if (next->operation == BTRFS_RBIO_READ_REBUILD) 815 async_read_rebuild(next); 816 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { 817 steal_rbio(rbio, next); 818 async_read_rebuild(next); 819 } else if (next->operation == BTRFS_RBIO_WRITE) { 820 steal_rbio(rbio, next); 821 async_rmw_stripe(next); 822 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { 823 steal_rbio(rbio, next); 824 async_scrub_parity(next); 825 } 826 827 goto done_nolock; 828 } 829 } 830 done: 831 spin_unlock(&rbio->bio_list_lock); 832 spin_unlock_irqrestore(&h->lock, flags); 833 834 done_nolock: 835 if (!keep_cache) 836 remove_rbio_from_cache(rbio); 837 } 838 839 static void __free_raid_bio(struct btrfs_raid_bio *rbio) 840 { 841 int i; 842 843 if (!refcount_dec_and_test(&rbio->refs)) 844 return; 845 846 WARN_ON(!list_empty(&rbio->stripe_cache)); 847 WARN_ON(!list_empty(&rbio->hash_list)); 848 WARN_ON(!bio_list_empty(&rbio->bio_list)); 849 850 for (i = 0; i < rbio->nr_pages; i++) { 851 if (rbio->stripe_pages[i]) { 852 __free_page(rbio->stripe_pages[i]); 853 rbio->stripe_pages[i] = NULL; 854 } 855 } 856 857 btrfs_put_bbio(rbio->bbio); 858 kfree(rbio); 859 } 860 861 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) 862 { 863 struct bio *next; 864 865 while (cur) { 866 next = cur->bi_next; 867 cur->bi_next = NULL; 868 cur->bi_status = err; 869 bio_endio(cur); 870 cur = next; 871 } 872 } 873 874 /* 875 * this frees the rbio and runs through all the bios in the 876 * bio_list and calls end_io on them 877 */ 878 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) 879 { 880 struct bio *cur = bio_list_get(&rbio->bio_list); 881 struct bio *extra; 882 883 if (rbio->generic_bio_cnt) 884 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); 885 886 /* 887 * At this moment, rbio->bio_list is empty, however since rbio does not 888 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the 889 * hash list, rbio may be merged with others so that rbio->bio_list 890 * becomes non-empty. 891 * Once unlock_stripe() is done, rbio->bio_list will not be updated any 892 * more and we can call bio_endio() on all queued bios. 893 */ 894 unlock_stripe(rbio); 895 extra = bio_list_get(&rbio->bio_list); 896 __free_raid_bio(rbio); 897 898 rbio_endio_bio_list(cur, err); 899 if (extra) 900 rbio_endio_bio_list(extra, err); 901 } 902 903 /* 904 * end io function used by finish_rmw. When we finally 905 * get here, we've written a full stripe 906 */ 907 static void raid_write_end_io(struct bio *bio) 908 { 909 struct btrfs_raid_bio *rbio = bio->bi_private; 910 blk_status_t err = bio->bi_status; 911 int max_errors; 912 913 if (err) 914 fail_bio_stripe(rbio, bio); 915 916 bio_put(bio); 917 918 if (!atomic_dec_and_test(&rbio->stripes_pending)) 919 return; 920 921 err = BLK_STS_OK; 922 923 /* OK, we have read all the stripes we need to. */ 924 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 
925 0 : rbio->bbio->max_errors; 926 if (atomic_read(&rbio->error) > max_errors) 927 err = BLK_STS_IOERR; 928 929 rbio_orig_end_io(rbio, err); 930 } 931 932 /* 933 * the read/modify/write code wants to use the original bio for 934 * any pages it included, and then use the rbio for everything 935 * else. This function decides if a given index (stripe number) 936 * and page number in that stripe fall inside the original bio 937 * or the rbio. 938 * 939 * if you set bio_list_only, you'll get a NULL back for any ranges 940 * that are outside the bio_list 941 * 942 * This doesn't take any refs on anything, you get a bare page pointer 943 * and the caller must bump refs as required. 944 * 945 * You must call index_rbio_pages once before you can trust 946 * the answers from this function. 947 */ 948 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, 949 int index, int pagenr, int bio_list_only) 950 { 951 int chunk_page; 952 struct page *p = NULL; 953 954 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; 955 956 spin_lock_irq(&rbio->bio_list_lock); 957 p = rbio->bio_pages[chunk_page]; 958 spin_unlock_irq(&rbio->bio_list_lock); 959 960 if (p || bio_list_only) 961 return p; 962 963 return rbio->stripe_pages[chunk_page]; 964 } 965 966 /* 967 * number of pages we need for the entire stripe across all the 968 * drives 969 */ 970 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) 971 { 972 return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes; 973 } 974 975 /* 976 * allocation and initial setup for the btrfs_raid_bio. Not 977 * this does not allocate any pages for rbio->pages. 978 */ 979 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, 980 struct btrfs_bio *bbio, 981 u64 stripe_len) 982 { 983 struct btrfs_raid_bio *rbio; 984 int nr_data = 0; 985 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; 986 int num_pages = rbio_nr_pages(stripe_len, real_stripes); 987 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); 988 void *p; 989 990 rbio = kzalloc(sizeof(*rbio) + 991 sizeof(*rbio->stripe_pages) * num_pages + 992 sizeof(*rbio->bio_pages) * num_pages + 993 sizeof(*rbio->finish_pointers) * real_stripes + 994 sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) + 995 sizeof(*rbio->finish_pbitmap) * 996 BITS_TO_LONGS(stripe_npages), 997 GFP_NOFS); 998 if (!rbio) 999 return ERR_PTR(-ENOMEM); 1000 1001 bio_list_init(&rbio->bio_list); 1002 INIT_LIST_HEAD(&rbio->plug_list); 1003 spin_lock_init(&rbio->bio_list_lock); 1004 INIT_LIST_HEAD(&rbio->stripe_cache); 1005 INIT_LIST_HEAD(&rbio->hash_list); 1006 rbio->bbio = bbio; 1007 rbio->fs_info = fs_info; 1008 rbio->stripe_len = stripe_len; 1009 rbio->nr_pages = num_pages; 1010 rbio->real_stripes = real_stripes; 1011 rbio->stripe_npages = stripe_npages; 1012 rbio->faila = -1; 1013 rbio->failb = -1; 1014 refcount_set(&rbio->refs, 1); 1015 atomic_set(&rbio->error, 0); 1016 atomic_set(&rbio->stripes_pending, 0); 1017 1018 /* 1019 * the stripe_pages, bio_pages, etc arrays point to the extra 1020 * memory we allocated past the end of the rbio 1021 */ 1022 p = rbio + 1; 1023 #define CONSUME_ALLOC(ptr, count) do { \ 1024 ptr = p; \ 1025 p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ 1026 } while (0) 1027 CONSUME_ALLOC(rbio->stripe_pages, num_pages); 1028 CONSUME_ALLOC(rbio->bio_pages, num_pages); 1029 CONSUME_ALLOC(rbio->finish_pointers, real_stripes); 1030 CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages)); 1031 CONSUME_ALLOC(rbio->finish_pbitmap, 
BITS_TO_LONGS(stripe_npages)); 1032 #undef CONSUME_ALLOC 1033 1034 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) 1035 nr_data = real_stripes - 1; 1036 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) 1037 nr_data = real_stripes - 2; 1038 else 1039 BUG(); 1040 1041 rbio->nr_data = nr_data; 1042 return rbio; 1043 } 1044 1045 /* allocate pages for all the stripes in the bio, including parity */ 1046 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 1047 { 1048 int i; 1049 struct page *page; 1050 1051 for (i = 0; i < rbio->nr_pages; i++) { 1052 if (rbio->stripe_pages[i]) 1053 continue; 1054 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1055 if (!page) 1056 return -ENOMEM; 1057 rbio->stripe_pages[i] = page; 1058 } 1059 return 0; 1060 } 1061 1062 /* only allocate pages for p/q stripes */ 1063 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 1064 { 1065 int i; 1066 struct page *page; 1067 1068 i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); 1069 1070 for (; i < rbio->nr_pages; i++) { 1071 if (rbio->stripe_pages[i]) 1072 continue; 1073 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1074 if (!page) 1075 return -ENOMEM; 1076 rbio->stripe_pages[i] = page; 1077 } 1078 return 0; 1079 } 1080 1081 /* 1082 * add a single page from a specific stripe into our list of bios for IO 1083 * this will try to merge into existing bios if possible, and returns 1084 * zero if all went well. 1085 */ 1086 static int rbio_add_io_page(struct btrfs_raid_bio *rbio, 1087 struct bio_list *bio_list, 1088 struct page *page, 1089 int stripe_nr, 1090 unsigned long page_index, 1091 unsigned long bio_max_len) 1092 { 1093 struct bio *last = bio_list->tail; 1094 u64 last_end = 0; 1095 int ret; 1096 struct bio *bio; 1097 struct btrfs_bio_stripe *stripe; 1098 u64 disk_start; 1099 1100 stripe = &rbio->bbio->stripes[stripe_nr]; 1101 disk_start = stripe->physical + (page_index << PAGE_SHIFT); 1102 1103 /* if the device is missing, just fail this stripe */ 1104 if (!stripe->dev->bdev) 1105 return fail_rbio_index(rbio, stripe_nr); 1106 1107 /* see if we can add this page onto our existing bio */ 1108 if (last) { 1109 last_end = (u64)last->bi_iter.bi_sector << 9; 1110 last_end += last->bi_iter.bi_size; 1111 1112 /* 1113 * we can't merge these if they are from different 1114 * devices or if they are not contiguous 1115 */ 1116 if (last_end == disk_start && stripe->dev->bdev && 1117 !last->bi_status && 1118 last->bi_disk == stripe->dev->bdev->bd_disk && 1119 last->bi_partno == stripe->dev->bdev->bd_partno) { 1120 ret = bio_add_page(last, page, PAGE_SIZE, 0); 1121 if (ret == PAGE_SIZE) 1122 return 0; 1123 } 1124 } 1125 1126 /* put a new bio on the list */ 1127 bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); 1128 bio->bi_iter.bi_size = 0; 1129 bio_set_dev(bio, stripe->dev->bdev); 1130 bio->bi_iter.bi_sector = disk_start >> 9; 1131 1132 bio_add_page(bio, page, PAGE_SIZE, 0); 1133 bio_list_add(bio_list, bio); 1134 return 0; 1135 } 1136 1137 /* 1138 * while we're doing the read/modify/write cycle, we could 1139 * have errors in reading pages off the disk. This checks 1140 * for errors and if we're not able to read the page it'll 1141 * trigger parity reconstruction. 
The rmw will be finished 1142 * after we've reconstructed the failed stripes 1143 */ 1144 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1145 { 1146 if (rbio->faila >= 0 || rbio->failb >= 0) { 1147 BUG_ON(rbio->faila == rbio->real_stripes - 1); 1148 __raid56_parity_recover(rbio); 1149 } else { 1150 finish_rmw(rbio); 1151 } 1152 } 1153 1154 /* 1155 * helper function to walk our bio list and populate the bio_pages array with 1156 * the result. This seems expensive, but it is faster than constantly 1157 * searching through the bio list as we setup the IO in finish_rmw or stripe 1158 * reconstruction. 1159 * 1160 * This must be called before you trust the answers from page_in_rbio 1161 */ 1162 static void index_rbio_pages(struct btrfs_raid_bio *rbio) 1163 { 1164 struct bio *bio; 1165 u64 start; 1166 unsigned long stripe_offset; 1167 unsigned long page_index; 1168 1169 spin_lock_irq(&rbio->bio_list_lock); 1170 bio_list_for_each(bio, &rbio->bio_list) { 1171 struct bio_vec bvec; 1172 struct bvec_iter iter; 1173 int i = 0; 1174 1175 start = (u64)bio->bi_iter.bi_sector << 9; 1176 stripe_offset = start - rbio->bbio->raid_map[0]; 1177 page_index = stripe_offset >> PAGE_SHIFT; 1178 1179 if (bio_flagged(bio, BIO_CLONED)) 1180 bio->bi_iter = btrfs_io_bio(bio)->iter; 1181 1182 bio_for_each_segment(bvec, bio, iter) { 1183 rbio->bio_pages[page_index + i] = bvec.bv_page; 1184 i++; 1185 } 1186 } 1187 spin_unlock_irq(&rbio->bio_list_lock); 1188 } 1189 1190 /* 1191 * this is called from one of two situations. We either 1192 * have a full stripe from the higher layers, or we've read all 1193 * the missing bits off disk. 1194 * 1195 * This will calculate the parity and then send down any 1196 * changed blocks. 1197 */ 1198 static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1199 { 1200 struct btrfs_bio *bbio = rbio->bbio; 1201 void **pointers = rbio->finish_pointers; 1202 int nr_data = rbio->nr_data; 1203 int stripe; 1204 int pagenr; 1205 int p_stripe = -1; 1206 int q_stripe = -1; 1207 struct bio_list bio_list; 1208 struct bio *bio; 1209 int ret; 1210 1211 bio_list_init(&bio_list); 1212 1213 if (rbio->real_stripes - rbio->nr_data == 1) { 1214 p_stripe = rbio->real_stripes - 1; 1215 } else if (rbio->real_stripes - rbio->nr_data == 2) { 1216 p_stripe = rbio->real_stripes - 2; 1217 q_stripe = rbio->real_stripes - 1; 1218 } else { 1219 BUG(); 1220 } 1221 1222 /* at this point we either have a full stripe, 1223 * or we've read the full stripe from the drive. 1224 * recalculate the parity and write the new results. 1225 * 1226 * We're not allowed to add any new bios to the 1227 * bio list here, anyone else that wants to 1228 * change this stripe needs to do their own rmw. 1229 */ 1230 spin_lock_irq(&rbio->bio_list_lock); 1231 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1232 spin_unlock_irq(&rbio->bio_list_lock); 1233 1234 atomic_set(&rbio->error, 0); 1235 1236 /* 1237 * now that we've set rmw_locked, run through the 1238 * bio list one last time and map the page pointers 1239 * 1240 * We don't cache full rbios because we're assuming 1241 * the higher layers are unlikely to use this area of 1242 * the disk again soon. If they do use it again, 1243 * hopefully they will send another full bio. 
1244 */ 1245 index_rbio_pages(rbio); 1246 if (!rbio_is_full(rbio)) 1247 cache_rbio_pages(rbio); 1248 else 1249 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1250 1251 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1252 struct page *p; 1253 /* first collect one page from each data stripe */ 1254 for (stripe = 0; stripe < nr_data; stripe++) { 1255 p = page_in_rbio(rbio, stripe, pagenr, 0); 1256 pointers[stripe] = kmap(p); 1257 } 1258 1259 /* then add the parity stripe */ 1260 p = rbio_pstripe_page(rbio, pagenr); 1261 SetPageUptodate(p); 1262 pointers[stripe++] = kmap(p); 1263 1264 if (q_stripe != -1) { 1265 1266 /* 1267 * raid6, add the qstripe and call the 1268 * library function to fill in our p/q 1269 */ 1270 p = rbio_qstripe_page(rbio, pagenr); 1271 SetPageUptodate(p); 1272 pointers[stripe++] = kmap(p); 1273 1274 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 1275 pointers); 1276 } else { 1277 /* raid5 */ 1278 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); 1279 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); 1280 } 1281 1282 1283 for (stripe = 0; stripe < rbio->real_stripes; stripe++) 1284 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 1285 } 1286 1287 /* 1288 * time to start writing. Make bios for everything from the 1289 * higher layers (the bio_list in our rbio) and our p/q. Ignore 1290 * everything else. 1291 */ 1292 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1293 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1294 struct page *page; 1295 if (stripe < rbio->nr_data) { 1296 page = page_in_rbio(rbio, stripe, pagenr, 1); 1297 if (!page) 1298 continue; 1299 } else { 1300 page = rbio_stripe_page(rbio, stripe, pagenr); 1301 } 1302 1303 ret = rbio_add_io_page(rbio, &bio_list, 1304 page, stripe, pagenr, rbio->stripe_len); 1305 if (ret) 1306 goto cleanup; 1307 } 1308 } 1309 1310 if (likely(!bbio->num_tgtdevs)) 1311 goto write_data; 1312 1313 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1314 if (!bbio->tgtdev_map[stripe]) 1315 continue; 1316 1317 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1318 struct page *page; 1319 if (stripe < rbio->nr_data) { 1320 page = page_in_rbio(rbio, stripe, pagenr, 1); 1321 if (!page) 1322 continue; 1323 } else { 1324 page = rbio_stripe_page(rbio, stripe, pagenr); 1325 } 1326 1327 ret = rbio_add_io_page(rbio, &bio_list, page, 1328 rbio->bbio->tgtdev_map[stripe], 1329 pagenr, rbio->stripe_len); 1330 if (ret) 1331 goto cleanup; 1332 } 1333 } 1334 1335 write_data: 1336 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); 1337 BUG_ON(atomic_read(&rbio->stripes_pending) == 0); 1338 1339 while (1) { 1340 bio = bio_list_pop(&bio_list); 1341 if (!bio) 1342 break; 1343 1344 bio->bi_private = rbio; 1345 bio->bi_end_io = raid_write_end_io; 1346 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 1347 1348 submit_bio(bio); 1349 } 1350 return; 1351 1352 cleanup: 1353 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1354 1355 while ((bio = bio_list_pop(&bio_list))) 1356 bio_put(bio); 1357 } 1358 1359 /* 1360 * helper to find the stripe number for a given bio. Used to figure out which 1361 * stripe has failed. This expects the bio to correspond to a physical disk, 1362 * so it looks up based on physical sector numbers. 
1363 */ 1364 static int find_bio_stripe(struct btrfs_raid_bio *rbio, 1365 struct bio *bio) 1366 { 1367 u64 physical = bio->bi_iter.bi_sector; 1368 u64 stripe_start; 1369 int i; 1370 struct btrfs_bio_stripe *stripe; 1371 1372 physical <<= 9; 1373 1374 for (i = 0; i < rbio->bbio->num_stripes; i++) { 1375 stripe = &rbio->bbio->stripes[i]; 1376 stripe_start = stripe->physical; 1377 if (physical >= stripe_start && 1378 physical < stripe_start + rbio->stripe_len && 1379 stripe->dev->bdev && 1380 bio->bi_disk == stripe->dev->bdev->bd_disk && 1381 bio->bi_partno == stripe->dev->bdev->bd_partno) { 1382 return i; 1383 } 1384 } 1385 return -1; 1386 } 1387 1388 /* 1389 * helper to find the stripe number for a given 1390 * bio (before mapping). Used to figure out which stripe has 1391 * failed. This looks up based on logical block numbers. 1392 */ 1393 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 1394 struct bio *bio) 1395 { 1396 u64 logical = bio->bi_iter.bi_sector; 1397 u64 stripe_start; 1398 int i; 1399 1400 logical <<= 9; 1401 1402 for (i = 0; i < rbio->nr_data; i++) { 1403 stripe_start = rbio->bbio->raid_map[i]; 1404 if (logical >= stripe_start && 1405 logical < stripe_start + rbio->stripe_len) { 1406 return i; 1407 } 1408 } 1409 return -1; 1410 } 1411 1412 /* 1413 * returns -EIO if we had too many failures 1414 */ 1415 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 1416 { 1417 unsigned long flags; 1418 int ret = 0; 1419 1420 spin_lock_irqsave(&rbio->bio_list_lock, flags); 1421 1422 /* we already know this stripe is bad, move on */ 1423 if (rbio->faila == failed || rbio->failb == failed) 1424 goto out; 1425 1426 if (rbio->faila == -1) { 1427 /* first failure on this rbio */ 1428 rbio->faila = failed; 1429 atomic_inc(&rbio->error); 1430 } else if (rbio->failb == -1) { 1431 /* second failure on this rbio */ 1432 rbio->failb = failed; 1433 atomic_inc(&rbio->error); 1434 } else { 1435 ret = -EIO; 1436 } 1437 out: 1438 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 1439 1440 return ret; 1441 } 1442 1443 /* 1444 * helper to fail a stripe based on a physical disk 1445 * bio. 1446 */ 1447 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 1448 struct bio *bio) 1449 { 1450 int failed = find_bio_stripe(rbio, bio); 1451 1452 if (failed < 0) 1453 return -EIO; 1454 1455 return fail_rbio_index(rbio, failed); 1456 } 1457 1458 /* 1459 * this sets each page in the bio uptodate. It should only be used on private 1460 * rbio pages, nothing that comes in from the higher layers 1461 */ 1462 static void set_bio_pages_uptodate(struct bio *bio) 1463 { 1464 struct bio_vec *bvec; 1465 int i; 1466 1467 ASSERT(!bio_flagged(bio, BIO_CLONED)); 1468 1469 bio_for_each_segment_all(bvec, bio, i) 1470 SetPageUptodate(bvec->bv_page); 1471 } 1472 1473 /* 1474 * end io for the read phase of the rmw cycle. All the bios here are physical 1475 * stripe bios we've read from the disk so we can recalculate the parity of the 1476 * stripe. 
1477 * 1478 * This will usually kick off finish_rmw once all the bios are read in, but it 1479 * may trigger parity reconstruction if we had any errors along the way 1480 */ 1481 static void raid_rmw_end_io(struct bio *bio) 1482 { 1483 struct btrfs_raid_bio *rbio = bio->bi_private; 1484 1485 if (bio->bi_status) 1486 fail_bio_stripe(rbio, bio); 1487 else 1488 set_bio_pages_uptodate(bio); 1489 1490 bio_put(bio); 1491 1492 if (!atomic_dec_and_test(&rbio->stripes_pending)) 1493 return; 1494 1495 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 1496 goto cleanup; 1497 1498 /* 1499 * this will normally call finish_rmw to start our write 1500 * but if there are any failed stripes we'll reconstruct 1501 * from parity first 1502 */ 1503 validate_rbio_for_rmw(rbio); 1504 return; 1505 1506 cleanup: 1507 1508 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1509 } 1510 1511 static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1512 { 1513 btrfs_init_work(&rbio->work, btrfs_rmw_helper, rmw_work, NULL, NULL); 1514 btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); 1515 } 1516 1517 static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1518 { 1519 btrfs_init_work(&rbio->work, btrfs_rmw_helper, 1520 read_rebuild_work, NULL, NULL); 1521 1522 btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); 1523 } 1524 1525 /* 1526 * the stripe must be locked by the caller. It will 1527 * unlock after all the writes are done 1528 */ 1529 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1530 { 1531 int bios_to_read = 0; 1532 struct bio_list bio_list; 1533 int ret; 1534 int pagenr; 1535 int stripe; 1536 struct bio *bio; 1537 1538 bio_list_init(&bio_list); 1539 1540 ret = alloc_rbio_pages(rbio); 1541 if (ret) 1542 goto cleanup; 1543 1544 index_rbio_pages(rbio); 1545 1546 atomic_set(&rbio->error, 0); 1547 /* 1548 * build a list of bios to read all the missing parts of this 1549 * stripe 1550 */ 1551 for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1552 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1553 struct page *page; 1554 /* 1555 * we want to find all the pages missing from 1556 * the rbio and read them from the disk. If 1557 * page_in_rbio finds a page in the bio list 1558 * we don't need to read it off the stripe. 1559 */ 1560 page = page_in_rbio(rbio, stripe, pagenr, 1); 1561 if (page) 1562 continue; 1563 1564 page = rbio_stripe_page(rbio, stripe, pagenr); 1565 /* 1566 * the bio cache may have handed us an uptodate 1567 * page. If so, be happy and use it 1568 */ 1569 if (PageUptodate(page)) 1570 continue; 1571 1572 ret = rbio_add_io_page(rbio, &bio_list, page, 1573 stripe, pagenr, rbio->stripe_len); 1574 if (ret) 1575 goto cleanup; 1576 } 1577 } 1578 1579 bios_to_read = bio_list_size(&bio_list); 1580 if (!bios_to_read) { 1581 /* 1582 * this can happen if others have merged with 1583 * us, it means there is nothing left to read. 1584 * But if there are missing devices it may not be 1585 * safe to do the full stripe write yet. 1586 */ 1587 goto finish; 1588 } 1589 1590 /* 1591 * the bbio may be freed once we submit the last bio. 
Make sure 1592 * not to touch it after that 1593 */ 1594 atomic_set(&rbio->stripes_pending, bios_to_read); 1595 while (1) { 1596 bio = bio_list_pop(&bio_list); 1597 if (!bio) 1598 break; 1599 1600 bio->bi_private = rbio; 1601 bio->bi_end_io = raid_rmw_end_io; 1602 bio_set_op_attrs(bio, REQ_OP_READ, 0); 1603 1604 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 1605 1606 submit_bio(bio); 1607 } 1608 /* the actual write will happen once the reads are done */ 1609 return 0; 1610 1611 cleanup: 1612 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1613 1614 while ((bio = bio_list_pop(&bio_list))) 1615 bio_put(bio); 1616 1617 return -EIO; 1618 1619 finish: 1620 validate_rbio_for_rmw(rbio); 1621 return 0; 1622 } 1623 1624 /* 1625 * if the upper layers pass in a full stripe, we thank them by only allocating 1626 * enough pages to hold the parity, and sending it all down quickly. 1627 */ 1628 static int full_stripe_write(struct btrfs_raid_bio *rbio) 1629 { 1630 int ret; 1631 1632 ret = alloc_rbio_parity_pages(rbio); 1633 if (ret) { 1634 __free_raid_bio(rbio); 1635 return ret; 1636 } 1637 1638 ret = lock_stripe_add(rbio); 1639 if (ret == 0) 1640 finish_rmw(rbio); 1641 return 0; 1642 } 1643 1644 /* 1645 * partial stripe writes get handed over to async helpers. 1646 * We're really hoping to merge a few more writes into this 1647 * rbio before calculating new parity 1648 */ 1649 static int partial_stripe_write(struct btrfs_raid_bio *rbio) 1650 { 1651 int ret; 1652 1653 ret = lock_stripe_add(rbio); 1654 if (ret == 0) 1655 async_rmw_stripe(rbio); 1656 return 0; 1657 } 1658 1659 /* 1660 * sometimes while we were reading from the drive to 1661 * recalculate parity, enough new bios come into create 1662 * a full stripe. So we do a check here to see if we can 1663 * go directly to finish_rmw 1664 */ 1665 static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 1666 { 1667 /* head off into rmw land if we don't have a full stripe */ 1668 if (!rbio_is_full(rbio)) 1669 return partial_stripe_write(rbio); 1670 return full_stripe_write(rbio); 1671 } 1672 1673 /* 1674 * We use plugging call backs to collect full stripes. 1675 * Any time we get a partial stripe write while plugged 1676 * we collect it into a list. When the unplug comes down, 1677 * we sort the list by logical block number and merge 1678 * everything we can into the same rbios 1679 */ 1680 struct btrfs_plug_cb { 1681 struct blk_plug_cb cb; 1682 struct btrfs_fs_info *info; 1683 struct list_head rbio_list; 1684 struct btrfs_work work; 1685 }; 1686 1687 /* 1688 * rbios on the plug list are sorted for easier merging. 1689 */ 1690 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) 1691 { 1692 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 1693 plug_list); 1694 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 1695 plug_list); 1696 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 1697 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 1698 1699 if (a_sector < b_sector) 1700 return -1; 1701 if (a_sector > b_sector) 1702 return 1; 1703 return 0; 1704 } 1705 1706 static void run_plug(struct btrfs_plug_cb *plug) 1707 { 1708 struct btrfs_raid_bio *cur; 1709 struct btrfs_raid_bio *last = NULL; 1710 1711 /* 1712 * sort our plug list then try to merge 1713 * everything we can in hopes of creating full 1714 * stripes. 
1715 */ 1716 list_sort(NULL, &plug->rbio_list, plug_cmp); 1717 while (!list_empty(&plug->rbio_list)) { 1718 cur = list_entry(plug->rbio_list.next, 1719 struct btrfs_raid_bio, plug_list); 1720 list_del_init(&cur->plug_list); 1721 1722 if (rbio_is_full(cur)) { 1723 /* we have a full stripe, send it down */ 1724 full_stripe_write(cur); 1725 continue; 1726 } 1727 if (last) { 1728 if (rbio_can_merge(last, cur)) { 1729 merge_rbio(last, cur); 1730 __free_raid_bio(cur); 1731 continue; 1732 1733 } 1734 __raid56_parity_write(last); 1735 } 1736 last = cur; 1737 } 1738 if (last) { 1739 __raid56_parity_write(last); 1740 } 1741 kfree(plug); 1742 } 1743 1744 /* 1745 * if the unplug comes from schedule, we have to push the 1746 * work off to a helper thread 1747 */ 1748 static void unplug_work(struct btrfs_work *work) 1749 { 1750 struct btrfs_plug_cb *plug; 1751 plug = container_of(work, struct btrfs_plug_cb, work); 1752 run_plug(plug); 1753 } 1754 1755 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 1756 { 1757 struct btrfs_plug_cb *plug; 1758 plug = container_of(cb, struct btrfs_plug_cb, cb); 1759 1760 if (from_schedule) { 1761 btrfs_init_work(&plug->work, btrfs_rmw_helper, 1762 unplug_work, NULL, NULL); 1763 btrfs_queue_work(plug->info->rmw_workers, 1764 &plug->work); 1765 return; 1766 } 1767 run_plug(plug); 1768 } 1769 1770 /* 1771 * our main entry point for writes from the rest of the FS. 1772 */ 1773 int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, 1774 struct btrfs_bio *bbio, u64 stripe_len) 1775 { 1776 struct btrfs_raid_bio *rbio; 1777 struct btrfs_plug_cb *plug = NULL; 1778 struct blk_plug_cb *cb; 1779 int ret; 1780 1781 rbio = alloc_rbio(fs_info, bbio, stripe_len); 1782 if (IS_ERR(rbio)) { 1783 btrfs_put_bbio(bbio); 1784 return PTR_ERR(rbio); 1785 } 1786 bio_list_add(&rbio->bio_list, bio); 1787 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1788 rbio->operation = BTRFS_RBIO_WRITE; 1789 1790 btrfs_bio_counter_inc_noblocked(fs_info); 1791 rbio->generic_bio_cnt = 1; 1792 1793 /* 1794 * don't plug on full rbios, just get them out the door 1795 * as quickly as we can 1796 */ 1797 if (rbio_is_full(rbio)) { 1798 ret = full_stripe_write(rbio); 1799 if (ret) 1800 btrfs_bio_counter_dec(fs_info); 1801 return ret; 1802 } 1803 1804 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); 1805 if (cb) { 1806 plug = container_of(cb, struct btrfs_plug_cb, cb); 1807 if (!plug->info) { 1808 plug->info = fs_info; 1809 INIT_LIST_HEAD(&plug->rbio_list); 1810 } 1811 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1812 ret = 0; 1813 } else { 1814 ret = __raid56_parity_write(rbio); 1815 if (ret) 1816 btrfs_bio_counter_dec(fs_info); 1817 } 1818 return ret; 1819 } 1820 1821 /* 1822 * all parity reconstruction happens here. We've read in everything 1823 * we can find from the drives and this does the heavy lifting of 1824 * sorting the good from the bad. 
1825 */ 1826 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 1827 { 1828 int pagenr, stripe; 1829 void **pointers; 1830 int faila = -1, failb = -1; 1831 struct page *page; 1832 blk_status_t err; 1833 int i; 1834 1835 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1836 if (!pointers) { 1837 err = BLK_STS_RESOURCE; 1838 goto cleanup_io; 1839 } 1840 1841 faila = rbio->faila; 1842 failb = rbio->failb; 1843 1844 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1845 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 1846 spin_lock_irq(&rbio->bio_list_lock); 1847 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1848 spin_unlock_irq(&rbio->bio_list_lock); 1849 } 1850 1851 index_rbio_pages(rbio); 1852 1853 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1854 /* 1855 * Now we just use bitmap to mark the horizontal stripes in 1856 * which we have data when doing parity scrub. 1857 */ 1858 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 1859 !test_bit(pagenr, rbio->dbitmap)) 1860 continue; 1861 1862 /* setup our array of pointers with pages 1863 * from each stripe 1864 */ 1865 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1866 /* 1867 * if we're rebuilding a read, we have to use 1868 * pages from the bio list 1869 */ 1870 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1871 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 1872 (stripe == faila || stripe == failb)) { 1873 page = page_in_rbio(rbio, stripe, pagenr, 0); 1874 } else { 1875 page = rbio_stripe_page(rbio, stripe, pagenr); 1876 } 1877 pointers[stripe] = kmap(page); 1878 } 1879 1880 /* all raid6 handling here */ 1881 if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { 1882 /* 1883 * single failure, rebuild from parity raid5 1884 * style 1885 */ 1886 if (failb < 0) { 1887 if (faila == rbio->nr_data) { 1888 /* 1889 * Just the P stripe has failed, without 1890 * a bad data or Q stripe. 1891 * TODO, we should redo the xor here. 1892 */ 1893 err = BLK_STS_IOERR; 1894 goto cleanup; 1895 } 1896 /* 1897 * a single failure in raid6 is rebuilt 1898 * in the pstripe code below 1899 */ 1900 goto pstripe; 1901 } 1902 1903 /* make sure our ps and qs are in order */ 1904 if (faila > failb) { 1905 int tmp = failb; 1906 failb = faila; 1907 faila = tmp; 1908 } 1909 1910 /* if the q stripe is failed, do a pstripe reconstruction 1911 * from the xors. 1912 * If both the q stripe and the P stripe are failed, we're 1913 * here due to a crc mismatch and we can't give them the 1914 * data they want 1915 */ 1916 if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { 1917 if (rbio->bbio->raid_map[faila] == 1918 RAID5_P_STRIPE) { 1919 err = BLK_STS_IOERR; 1920 goto cleanup; 1921 } 1922 /* 1923 * otherwise we have one bad data stripe and 1924 * a good P stripe. raid5! 
1925 */ 1926 goto pstripe; 1927 } 1928 1929 if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { 1930 raid6_datap_recov(rbio->real_stripes, 1931 PAGE_SIZE, faila, pointers); 1932 } else { 1933 raid6_2data_recov(rbio->real_stripes, 1934 PAGE_SIZE, faila, failb, 1935 pointers); 1936 } 1937 } else { 1938 void *p; 1939 1940 /* rebuild from P stripe here (raid5 or raid6) */ 1941 BUG_ON(failb != -1); 1942 pstripe: 1943 /* Copy parity block into failed block to start with */ 1944 memcpy(pointers[faila], 1945 pointers[rbio->nr_data], 1946 PAGE_SIZE); 1947 1948 /* rearrange the pointer array */ 1949 p = pointers[faila]; 1950 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) 1951 pointers[stripe] = pointers[stripe + 1]; 1952 pointers[rbio->nr_data - 1] = p; 1953 1954 /* xor in the rest */ 1955 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE); 1956 } 1957 /* if we're doing this rebuild as part of an rmw, go through 1958 * and set all of our private rbio pages in the 1959 * failed stripes as uptodate. This way finish_rmw will 1960 * know they can be trusted. If this was a read reconstruction, 1961 * other endio functions will fiddle the uptodate bits 1962 */ 1963 if (rbio->operation == BTRFS_RBIO_WRITE) { 1964 for (i = 0; i < rbio->stripe_npages; i++) { 1965 if (faila != -1) { 1966 page = rbio_stripe_page(rbio, faila, i); 1967 SetPageUptodate(page); 1968 } 1969 if (failb != -1) { 1970 page = rbio_stripe_page(rbio, failb, i); 1971 SetPageUptodate(page); 1972 } 1973 } 1974 } 1975 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1976 /* 1977 * if we're rebuilding a read, we have to use 1978 * pages from the bio list 1979 */ 1980 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1981 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 1982 (stripe == faila || stripe == failb)) { 1983 page = page_in_rbio(rbio, stripe, pagenr, 0); 1984 } else { 1985 page = rbio_stripe_page(rbio, stripe, pagenr); 1986 } 1987 kunmap(page); 1988 } 1989 } 1990 1991 err = BLK_STS_OK; 1992 cleanup: 1993 kfree(pointers); 1994 1995 cleanup_io: 1996 /* 1997 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a 1998 * valid rbio which is consistent with ondisk content, thus such a 1999 * valid rbio can be cached to avoid further disk reads. 2000 */ 2001 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2002 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 2003 /* 2004 * - In case of two failures, where rbio->failb != -1: 2005 * 2006 * Do not cache this rbio since the above read reconstruction 2007 * (raid6_datap_recov() or raid6_2data_recov()) may have 2008 * changed some content of stripes which are not identical to 2009 * on-disk content any more, otherwise, a later write/recover 2010 * may steal stripe_pages from this rbio and end up with 2011 * corruptions or rebuild failures. 2012 * 2013 * - In case of single failure, where rbio->failb == -1: 2014 * 2015 * Cache this rbio iff the above read reconstruction is 2016 * excuted without problems. 
2017 */ 2018 if (err == BLK_STS_OK && rbio->failb < 0) 2019 cache_rbio_pages(rbio); 2020 else 2021 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2022 2023 rbio_orig_end_io(rbio, err); 2024 } else if (err == BLK_STS_OK) { 2025 rbio->faila = -1; 2026 rbio->failb = -1; 2027 2028 if (rbio->operation == BTRFS_RBIO_WRITE) 2029 finish_rmw(rbio); 2030 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) 2031 finish_parity_scrub(rbio, 0); 2032 else 2033 BUG(); 2034 } else { 2035 rbio_orig_end_io(rbio, err); 2036 } 2037 } 2038 2039 /* 2040 * This is called only for stripes we've read from disk to 2041 * reconstruct the parity. 2042 */ 2043 static void raid_recover_end_io(struct bio *bio) 2044 { 2045 struct btrfs_raid_bio *rbio = bio->bi_private; 2046 2047 /* 2048 * we only read stripe pages off the disk, set them 2049 * up to date if there were no errors 2050 */ 2051 if (bio->bi_status) 2052 fail_bio_stripe(rbio, bio); 2053 else 2054 set_bio_pages_uptodate(bio); 2055 bio_put(bio); 2056 2057 if (!atomic_dec_and_test(&rbio->stripes_pending)) 2058 return; 2059 2060 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 2061 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2062 else 2063 __raid_recover_end_io(rbio); 2064 } 2065 2066 /* 2067 * reads everything we need off the disk to reconstruct 2068 * the parity. endio handlers trigger final reconstruction 2069 * when the IO is done. 2070 * 2071 * This is used both for reads from the higher layers and for 2072 * parity construction required to finish a rmw cycle. 2073 */ 2074 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 2075 { 2076 int bios_to_read = 0; 2077 struct bio_list bio_list; 2078 int ret; 2079 int pagenr; 2080 int stripe; 2081 struct bio *bio; 2082 2083 bio_list_init(&bio_list); 2084 2085 ret = alloc_rbio_pages(rbio); 2086 if (ret) 2087 goto cleanup; 2088 2089 atomic_set(&rbio->error, 0); 2090 2091 /* 2092 * read everything that hasn't failed. Thanks to the 2093 * stripe cache, it is possible that some or all of these 2094 * pages are going to be uptodate. 2095 */ 2096 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 2097 if (rbio->faila == stripe || rbio->failb == stripe) { 2098 atomic_inc(&rbio->error); 2099 continue; 2100 } 2101 2102 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 2103 struct page *p; 2104 2105 /* 2106 * the rmw code may have already read this 2107 * page in 2108 */ 2109 p = rbio_stripe_page(rbio, stripe, pagenr); 2110 if (PageUptodate(p)) 2111 continue; 2112 2113 ret = rbio_add_io_page(rbio, &bio_list, 2114 rbio_stripe_page(rbio, stripe, pagenr), 2115 stripe, pagenr, rbio->stripe_len); 2116 if (ret < 0) 2117 goto cleanup; 2118 } 2119 } 2120 2121 bios_to_read = bio_list_size(&bio_list); 2122 if (!bios_to_read) { 2123 /* 2124 * we might have no bios to read just because the pages 2125 * were up to date, or we might have no bios to read because 2126 * the devices were gone. 2127 */ 2128 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { 2129 __raid_recover_end_io(rbio); 2130 goto out; 2131 } else { 2132 goto cleanup; 2133 } 2134 } 2135 2136 /* 2137 * the bbio may be freed once we submit the last bio. 
Make sure 2138 * not to touch it after that 2139 */ 2140 atomic_set(&rbio->stripes_pending, bios_to_read); 2141 while (1) { 2142 bio = bio_list_pop(&bio_list); 2143 if (!bio) 2144 break; 2145 2146 bio->bi_private = rbio; 2147 bio->bi_end_io = raid_recover_end_io; 2148 bio_set_op_attrs(bio, REQ_OP_READ, 0); 2149 2150 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 2151 2152 submit_bio(bio); 2153 } 2154 out: 2155 return 0; 2156 2157 cleanup: 2158 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2159 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) 2160 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2161 2162 while ((bio = bio_list_pop(&bio_list))) 2163 bio_put(bio); 2164 2165 return -EIO; 2166 } 2167 2168 /* 2169 * the main entry point for reads from the higher layers. This 2170 * is really only called when the normal read path had a failure, 2171 * so we assume the bio they send down corresponds to a failed part 2172 * of the drive. 2173 */ 2174 int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, 2175 struct btrfs_bio *bbio, u64 stripe_len, 2176 int mirror_num, int generic_io) 2177 { 2178 struct btrfs_raid_bio *rbio; 2179 int ret; 2180 2181 if (generic_io) { 2182 ASSERT(bbio->mirror_num == mirror_num); 2183 btrfs_io_bio(bio)->mirror_num = mirror_num; 2184 } 2185 2186 rbio = alloc_rbio(fs_info, bbio, stripe_len); 2187 if (IS_ERR(rbio)) { 2188 if (generic_io) 2189 btrfs_put_bbio(bbio); 2190 return PTR_ERR(rbio); 2191 } 2192 2193 rbio->operation = BTRFS_RBIO_READ_REBUILD; 2194 bio_list_add(&rbio->bio_list, bio); 2195 rbio->bio_list_bytes = bio->bi_iter.bi_size; 2196 2197 rbio->faila = find_logical_bio_stripe(rbio, bio); 2198 if (rbio->faila == -1) { 2199 btrfs_warn(fs_info, 2200 "%s could not find the bad stripe in raid56, so this read cannot be recovered (bio has logical %llu len %llu, bbio has map_type %llu)", 2201 __func__, (u64)bio->bi_iter.bi_sector << 9, 2202 (u64)bio->bi_iter.bi_size, bbio->map_type); 2203 if (generic_io) 2204 btrfs_put_bbio(bbio); 2205 kfree(rbio); 2206 return -EIO; 2207 } 2208 2209 if (generic_io) { 2210 btrfs_bio_counter_inc_noblocked(fs_info); 2211 rbio->generic_bio_cnt = 1; 2212 } else { 2213 btrfs_get_bbio(bbio); 2214 } 2215 2216 /* 2217 * Loop retry: 2218 * for 'mirror_num == 2', reconstruct from all other stripes. 2219 * for 'mirror_num > 2', select a stripe to fail on every retry. 2220 */ 2221 if (mirror_num > 2) { 2222 /* 2223 * 'mirror_num == 3' fails the p stripe and 2224 * reconstructs from the q stripe. 'mirror_num > 3' fails 2225 * a data stripe and reconstructs from the p+q stripes. 2226 */ 2227 rbio->failb = rbio->real_stripes - (mirror_num - 1); 2228 ASSERT(rbio->failb > 0); 2229 if (rbio->failb <= rbio->faila) 2230 rbio->failb--; 2231 } 2232 2233 ret = lock_stripe_add(rbio); 2234 2235 /* 2236 * __raid56_parity_recover will end the bio with 2237 * any errors it hits.
We don't want to return 2238 * its error value up the stack, because our caller 2239 * will end up calling bio_endio with any nonzero 2240 * return value 2241 */ 2242 if (ret == 0) 2243 __raid56_parity_recover(rbio); 2244 /* 2245 * our rbio has been added to the list of 2246 * rbios that will be handled after the 2247 * current lock owner is done 2248 */ 2249 return 0; 2250 2251 } 2252 2253 static void rmw_work(struct btrfs_work *work) 2254 { 2255 struct btrfs_raid_bio *rbio; 2256 2257 rbio = container_of(work, struct btrfs_raid_bio, work); 2258 raid56_rmw_stripe(rbio); 2259 } 2260 2261 static void read_rebuild_work(struct btrfs_work *work) 2262 { 2263 struct btrfs_raid_bio *rbio; 2264 2265 rbio = container_of(work, struct btrfs_raid_bio, work); 2266 __raid56_parity_recover(rbio); 2267 } 2268 2269 /* 2270 * The following code is used to scrub/replace the parity stripe 2271 * 2272 * Caller must have already increased bio_counter for getting @bbio. 2273 * 2274 * Note: We need to make sure that all the pages added to the scrub/replace 2275 * raid bio are correct and will not change during the scrub/replace; that 2276 * is, they hold only metadata or file data protected by a checksum. 2277 */ 2278 2279 struct btrfs_raid_bio * 2280 raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, 2281 struct btrfs_bio *bbio, u64 stripe_len, 2282 struct btrfs_device *scrub_dev, 2283 unsigned long *dbitmap, int stripe_nsectors) 2284 { 2285 struct btrfs_raid_bio *rbio; 2286 int i; 2287 2288 rbio = alloc_rbio(fs_info, bbio, stripe_len); 2289 if (IS_ERR(rbio)) 2290 return NULL; 2291 bio_list_add(&rbio->bio_list, bio); 2292 /* 2293 * This is a special bio which is used to hold the completion handler 2294 * and make the scrub rbio look like the other rbio types 2295 */ 2296 ASSERT(!bio->bi_iter.bi_size); 2297 rbio->operation = BTRFS_RBIO_PARITY_SCRUB; 2298 2299 /* 2300 * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted 2301 * to the end position, so this search can start from the first parity 2302 * stripe. 2303 */ 2304 for (i = rbio->nr_data; i < rbio->real_stripes; i++) { 2305 if (bbio->stripes[i].dev == scrub_dev) { 2306 rbio->scrubp = i; 2307 break; 2308 } 2309 } 2310 ASSERT(i < rbio->real_stripes); 2311 2312 /* For now we only support the case where sectorsize equals page size */ 2313 ASSERT(fs_info->sectorsize == PAGE_SIZE); 2314 ASSERT(rbio->stripe_npages == stripe_nsectors); 2315 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); 2316 2317 /* 2318 * We have already increased bio_counter when getting bbio, record it 2319 * so we can free it at rbio_orig_end_io(). 2320 */ 2321 rbio->generic_bio_cnt = 1; 2322 2323 return rbio; 2324 } 2325 2326 /* Used for both parity scrub and missing-device rebuild. */ 2327 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, 2328 u64 logical) 2329 { 2330 int stripe_offset; 2331 int index; 2332 2333 ASSERT(logical >= rbio->bbio->raid_map[0]); 2334 ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + 2335 rbio->stripe_len * rbio->nr_data); 2336 stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); 2337 index = stripe_offset >> PAGE_SHIFT; 2338 rbio->bio_pages[index] = page; 2339 } 2340 2341 /* 2342 * We only scrub the parity for which we have correct data in the same 2343 * horizontal stripe, so we don't need to allocate all pages for all the stripes.
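* Concretely, for every bit set in dbitmap we allocate page 'bit' of every stripe (stripe_pages[i * rbio->stripe_npages + bit] for each stripe i) and leave the remaining slots NULL.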
2344 */ 2345 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) 2346 { 2347 int i; 2348 int bit; 2349 int index; 2350 struct page *page; 2351 2352 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { 2353 for (i = 0; i < rbio->real_stripes; i++) { 2354 index = i * rbio->stripe_npages + bit; 2355 if (rbio->stripe_pages[index]) 2356 continue; 2357 2358 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2359 if (!page) 2360 return -ENOMEM; 2361 rbio->stripe_pages[index] = page; 2362 } 2363 } 2364 return 0; 2365 } 2366 2367 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 2368 int need_check) 2369 { 2370 struct btrfs_bio *bbio = rbio->bbio; 2371 void **pointers = rbio->finish_pointers; 2372 unsigned long *pbitmap = rbio->finish_pbitmap; 2373 int nr_data = rbio->nr_data; 2374 int stripe; 2375 int pagenr; 2376 int p_stripe = -1; 2377 int q_stripe = -1; 2378 struct page *p_page = NULL; 2379 struct page *q_page = NULL; 2380 struct bio_list bio_list; 2381 struct bio *bio; 2382 int is_replace = 0; 2383 int ret; 2384 2385 bio_list_init(&bio_list); 2386 2387 if (rbio->real_stripes - rbio->nr_data == 1) { 2388 p_stripe = rbio->real_stripes - 1; 2389 } else if (rbio->real_stripes - rbio->nr_data == 2) { 2390 p_stripe = rbio->real_stripes - 2; 2391 q_stripe = rbio->real_stripes - 1; 2392 } else { 2393 BUG(); 2394 } 2395 2396 if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { 2397 is_replace = 1; 2398 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); 2399 } 2400 2401 /* 2402 * The higher layers (the scrubber) are unlikely to use this 2403 * area of the disk again soon, so don't bother caching 2404 * it. 2405 */ 2406 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2407 2408 if (!need_check) 2409 goto writeback; 2410 2411 p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2412 if (!p_page) 2413 goto cleanup; 2414 SetPageUptodate(p_page); 2415 2416 if (q_stripe != -1) { 2417 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2418 if (!q_page) { 2419 __free_page(p_page); 2420 goto cleanup; 2421 } 2422 SetPageUptodate(q_page); 2423 } 2424 2425 atomic_set(&rbio->error, 0); 2426 2427 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2428 struct page *p; 2429 void *parity; 2430 /* first collect one page from each data stripe */ 2431 for (stripe = 0; stripe < nr_data; stripe++) { 2432 p = page_in_rbio(rbio, stripe, pagenr, 0); 2433 pointers[stripe] = kmap(p); 2434 } 2435 2436 /* then add the parity stripe */ 2437 pointers[stripe++] = kmap(p_page); 2438 2439 if (q_stripe != -1) { 2440 2441 /* 2442 * raid6, add the qstripe and call the 2443 * library function to fill in our p/q 2444 */ 2445 pointers[stripe++] = kmap(q_page); 2446 2447 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 2448 pointers); 2449 } else { 2450 /* raid5 */ 2451 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); 2452 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); 2453 } 2454 2455 /* Check the scrubbing parity and repair it */ 2456 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 2457 parity = kmap(p); 2458 if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) 2459 memcpy(parity, pointers[rbio->scrubp], PAGE_SIZE); 2460 else 2461 /* Parity is correct, no need to write it back */ 2462 bitmap_clear(rbio->dbitmap, pagenr, 1); 2463 kunmap(p); 2464 2465 for (stripe = 0; stripe < rbio->real_stripes; stripe++) 2466 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 2467 } 2468 2469 __free_page(p_page); 2470 if (q_page) 2471 __free_page(q_page); 2472 2473 writeback: 2474 /* 2475 * time to start writing.
Make bios only for the pages of 2476 * the scrub stripe (rbio->scrubp) that still need repair according 2477 * to dbitmap, and for the replace target if there is one. 2478 */ 2479 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2480 struct page *page; 2481 2482 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 2483 ret = rbio_add_io_page(rbio, &bio_list, 2484 page, rbio->scrubp, pagenr, rbio->stripe_len); 2485 if (ret) 2486 goto cleanup; 2487 } 2488 2489 if (!is_replace) 2490 goto submit_write; 2491 2492 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { 2493 struct page *page; 2494 2495 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 2496 ret = rbio_add_io_page(rbio, &bio_list, page, 2497 bbio->tgtdev_map[rbio->scrubp], 2498 pagenr, rbio->stripe_len); 2499 if (ret) 2500 goto cleanup; 2501 } 2502 2503 submit_write: 2504 nr_data = bio_list_size(&bio_list); 2505 if (!nr_data) { 2506 /* Every parity was already correct, nothing to write back */ 2507 rbio_orig_end_io(rbio, BLK_STS_OK); 2508 return; 2509 } 2510 2511 atomic_set(&rbio->stripes_pending, nr_data); 2512 2513 while (1) { 2514 bio = bio_list_pop(&bio_list); 2515 if (!bio) 2516 break; 2517 2518 bio->bi_private = rbio; 2519 bio->bi_end_io = raid_write_end_io; 2520 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 2521 2522 submit_bio(bio); 2523 } 2524 return; 2525 2526 cleanup: 2527 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2528 2529 while ((bio = bio_list_pop(&bio_list))) 2530 bio_put(bio); 2531 } 2532 2533 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) 2534 { 2535 if (stripe >= 0 && stripe < rbio->nr_data) 2536 return 1; 2537 return 0; 2538 } 2539 2540 /* 2541 * While we're doing the parity check and repair, we could have errors 2542 * in reading pages off the disk. This checks for errors, and if we were 2543 * not able to read a page it triggers parity reconstruction. The 2544 * parity scrub is finished once the failed stripes have been 2545 * reconstructed 2546 */ 2547 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) 2548 { 2549 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 2550 goto cleanup; 2551 2552 if (rbio->faila >= 0 || rbio->failb >= 0) { 2553 int dfail = 0, failp = -1; 2554 2555 if (is_data_stripe(rbio, rbio->faila)) 2556 dfail++; 2557 else if (is_parity_stripe(rbio->faila)) 2558 failp = rbio->faila; 2559 2560 if (is_data_stripe(rbio, rbio->failb)) 2561 dfail++; 2562 else if (is_parity_stripe(rbio->failb)) 2563 failp = rbio->failb; 2564 2565 /* 2566 * Because we cannot use the parity being scrubbed to repair 2567 * the data, our ability to repair is reduced by one device. 2568 * (In the RAID5 case we cannot repair anything.) 2569 */ 2570 if (dfail > rbio->bbio->max_errors - 1) 2571 goto cleanup; 2572 2573 /* 2574 * If all the data is good and only a parity stripe failed, 2575 * just repair the parity. 2576 */ 2577 if (dfail == 0) { 2578 finish_parity_scrub(rbio, 0); 2579 return; 2580 } 2581 2582 /* 2583 * Here we have one corrupted data stripe and one 2584 * corrupted parity on RAID6. If the corrupted parity 2585 * is the one being scrubbed, we can use the other parity 2586 * to repair the data; otherwise the data stripe cannot be repaired. 2587 */ 2588 if (failp != rbio->scrubp) 2589 goto cleanup; 2590 2591 __raid_recover_end_io(rbio); 2592 } else { 2593 finish_parity_scrub(rbio, 1); 2594 } 2595 return; 2596 2597 cleanup: 2598 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2599 } 2600 2601 /* 2602 * end io for the read phase of the parity scrub.
All the bios here are physical 2603 * stripe bios we've read from the disk so we can recalculate the parity of the 2604 * stripe. 2605 * 2606 * This will usually kick off finish_parity_scrub once all the bios are read in, 2607 * but it may trigger parity reconstruction if we had any errors along the way 2608 */ 2609 static void raid56_parity_scrub_end_io(struct bio *bio) 2610 { 2611 struct btrfs_raid_bio *rbio = bio->bi_private; 2612 2613 if (bio->bi_status) 2614 fail_bio_stripe(rbio, bio); 2615 else 2616 set_bio_pages_uptodate(bio); 2617 2618 bio_put(bio); 2619 2620 if (!atomic_dec_and_test(&rbio->stripes_pending)) 2621 return; 2622 2623 /* 2624 * this will normally call finish_parity_scrub to start our write, 2625 * but if there are any failed stripes we'll reconstruct 2626 * from parity first 2627 */ 2628 validate_rbio_for_parity_scrub(rbio); 2629 } 2630 2631 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) 2632 { 2633 int bios_to_read = 0; 2634 struct bio_list bio_list; 2635 int ret; 2636 int pagenr; 2637 int stripe; 2638 struct bio *bio; 2639 2640 bio_list_init(&bio_list); 2641 2642 ret = alloc_rbio_essential_pages(rbio); 2643 if (ret) 2644 goto cleanup; 2645 2646 atomic_set(&rbio->error, 0); 2647 /* 2648 * build a list of bios to read all the missing parts of this 2649 * stripe 2650 */ 2651 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 2652 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2653 struct page *page; 2654 /* 2655 * we want to find all the pages missing from 2656 * the rbio and read them from the disk. If 2657 * page_in_rbio finds a page in the bio list 2658 * we don't need to read it off the stripe. 2659 */ 2660 page = page_in_rbio(rbio, stripe, pagenr, 1); 2661 if (page) 2662 continue; 2663 2664 page = rbio_stripe_page(rbio, stripe, pagenr); 2665 /* 2666 * the bio cache may have handed us an uptodate 2667 * page. If so, be happy and use it 2668 */ 2669 if (PageUptodate(page)) 2670 continue; 2671 2672 ret = rbio_add_io_page(rbio, &bio_list, page, 2673 stripe, pagenr, rbio->stripe_len); 2674 if (ret) 2675 goto cleanup; 2676 } 2677 } 2678 2679 bios_to_read = bio_list_size(&bio_list); 2680 if (!bios_to_read) { 2681 /* 2682 * this can happen if others have merged with 2683 * us; it means there is nothing left to read. 2684 * But if there are missing devices it may not be 2685 * safe to do the full stripe write yet. 2686 */ 2687 goto finish; 2688 } 2689 2690 /* 2691 * the bbio may be freed once we submit the last bio.
Make sure 2692 * not to touch it after that 2693 */ 2694 atomic_set(&rbio->stripes_pending, bios_to_read); 2695 while (1) { 2696 bio = bio_list_pop(&bio_list); 2697 if (!bio) 2698 break; 2699 2700 bio->bi_private = rbio; 2701 bio->bi_end_io = raid56_parity_scrub_end_io; 2702 bio_set_op_attrs(bio, REQ_OP_READ, 0); 2703 2704 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 2705 2706 submit_bio(bio); 2707 } 2708 /* the actual write will happen once the reads are done */ 2709 return; 2710 2711 cleanup: 2712 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2713 2714 while ((bio = bio_list_pop(&bio_list))) 2715 bio_put(bio); 2716 2717 return; 2718 2719 finish: 2720 validate_rbio_for_parity_scrub(rbio); 2721 } 2722 2723 static void scrub_parity_work(struct btrfs_work *work) 2724 { 2725 struct btrfs_raid_bio *rbio; 2726 2727 rbio = container_of(work, struct btrfs_raid_bio, work); 2728 raid56_parity_scrub_stripe(rbio); 2729 } 2730 2731 static void async_scrub_parity(struct btrfs_raid_bio *rbio) 2732 { 2733 btrfs_init_work(&rbio->work, btrfs_rmw_helper, 2734 scrub_parity_work, NULL, NULL); 2735 2736 btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); 2737 } 2738 2739 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) 2740 { 2741 if (!lock_stripe_add(rbio)) 2742 async_scrub_parity(rbio); 2743 } 2744 2745 /* The following code is used for dev replace of a missing RAID 5/6 device. */ 2746 2747 struct btrfs_raid_bio * 2748 raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, 2749 struct btrfs_bio *bbio, u64 length) 2750 { 2751 struct btrfs_raid_bio *rbio; 2752 2753 rbio = alloc_rbio(fs_info, bbio, length); 2754 if (IS_ERR(rbio)) 2755 return NULL; 2756 2757 rbio->operation = BTRFS_RBIO_REBUILD_MISSING; 2758 bio_list_add(&rbio->bio_list, bio); 2759 /* 2760 * This is a special bio which is used to hold the completion handler 2761 * and make this rbio look like the other rbio types 2762 */ 2763 ASSERT(!bio->bi_iter.bi_size); 2764 2765 rbio->faila = find_logical_bio_stripe(rbio, bio); 2766 if (rbio->faila == -1) { 2767 BUG(); 2768 kfree(rbio); 2769 return NULL; 2770 } 2771 2772 /* 2773 * When we get the bbio, bio_counter has already been increased; record 2774 * it so we can drop it at rbio_orig_end_io() 2775 */ 2776 rbio->generic_bio_cnt = 1; 2777 2778 return rbio; 2779 } 2780 2781 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) 2782 { 2783 if (!lock_stripe_add(rbio)) 2784 async_read_rebuild(rbio); 2785 } 2786
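/*
 * Illustrative sketch only (not part of the original file): a minimal view of
 * how the missing-device helpers above are expected to be combined by a
 * caller such as scrub, assuming the caller already holds a bio_counter
 * reference for the bbio and has prepared a zero-length bio that only carries
 * its completion handler:
 *
 *	rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
 *	if (!rbio)
 *		return -ENOMEM;
 *	// hand over each page we already have a good copy of
 *	raid56_add_scrub_pages(rbio, page, logical);
 *	raid56_submit_missing_rbio(rbio);
 */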