1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2012 Fusion-io All rights reserved. 4 * Copyright (C) 2012 Intel Corp. All rights reserved. 5 */ 6 7 #include <linux/sched.h> 8 #include <linux/bio.h> 9 #include <linux/slab.h> 10 #include <linux/blkdev.h> 11 #include <linux/raid/pq.h> 12 #include <linux/hash.h> 13 #include <linux/list_sort.h> 14 #include <linux/raid/xor.h> 15 #include <linux/mm.h> 16 #include "ctree.h" 17 #include "disk-io.h" 18 #include "volumes.h" 19 #include "raid56.h" 20 #include "async-thread.h" 21 22 /* set when additional merges to this rbio are not allowed */ 23 #define RBIO_RMW_LOCKED_BIT 1 24 25 /* 26 * set when this rbio is sitting in the hash, but it is just a cache 27 * of past RMW 28 */ 29 #define RBIO_CACHE_BIT 2 30 31 /* 32 * set when it is safe to trust the stripe_pages for caching 33 */ 34 #define RBIO_CACHE_READY_BIT 3 35 36 #define RBIO_CACHE_SIZE 1024 37 38 #define BTRFS_STRIPE_HASH_TABLE_BITS 11 39 40 /* Used by the raid56 code to lock stripes for read/modify/write */ 41 struct btrfs_stripe_hash { 42 struct list_head hash_list; 43 spinlock_t lock; 44 }; 45 46 /* Used by the raid56 code to lock stripes for read/modify/write */ 47 struct btrfs_stripe_hash_table { 48 struct list_head stripe_cache; 49 spinlock_t cache_lock; 50 int cache_size; 51 struct btrfs_stripe_hash table[]; 52 }; 53 54 enum btrfs_rbio_ops { 55 BTRFS_RBIO_WRITE, 56 BTRFS_RBIO_READ_REBUILD, 57 BTRFS_RBIO_PARITY_SCRUB, 58 BTRFS_RBIO_REBUILD_MISSING, 59 }; 60 61 struct btrfs_raid_bio { 62 struct btrfs_fs_info *fs_info; 63 struct btrfs_bio *bbio; 64 65 /* while we're doing rmw on a stripe 66 * we put it into a hash table so we can 67 * lock the stripe and merge more rbios 68 * into it. 69 */ 70 struct list_head hash_list; 71 72 /* 73 * LRU list for the stripe cache 74 */ 75 struct list_head stripe_cache; 76 77 /* 78 * for scheduling work in the helper threads 79 */ 80 struct btrfs_work work; 81 82 /* 83 * bio list and bio_list_lock are used 84 * to add more bios into the stripe 85 * in hopes of avoiding the full rmw 86 */ 87 struct bio_list bio_list; 88 spinlock_t bio_list_lock; 89 90 /* also protected by the bio_list_lock, the 91 * plug list is used by the plugging code 92 * to collect partial bios while plugged. The 93 * stripe locking code also uses it to hand off 94 * the stripe lock to the next pending IO 95 */ 96 struct list_head plug_list; 97 98 /* 99 * flags that tell us if it is safe to 100 * merge with this bio 101 */ 102 unsigned long flags; 103 104 /* size of each individual stripe on disk */ 105 int stripe_len; 106 107 /* number of data stripes (no p/q) */ 108 int nr_data; 109 110 int real_stripes; 111 112 int stripe_npages; 113 /* 114 * set if we're doing a parity rebuild 115 * for a read from higher up, which is handled 116 * differently from a parity rebuild as part of 117 * rmw 118 */ 119 enum btrfs_rbio_ops operation; 120 121 /* first bad stripe */ 122 int faila; 123 124 /* second bad stripe (for raid6 use) */ 125 int failb; 126 127 int scrubp; 128 /* 129 * number of pages needed to represent the full 130 * stripe 131 */ 132 int nr_pages; 133 134 /* 135 * size of all the bios in the bio_list. This 136 * helps us decide if the rbio maps to a full 137 * stripe or not 138 */ 139 int bio_list_bytes; 140 141 int generic_bio_cnt; 142 143 refcount_t refs; 144 145 atomic_t stripes_pending; 146 147 atomic_t error; 148 /* 149 * these are two arrays of pointers. 
We allocate the 150 * rbio big enough to hold them both and setup their 151 * locations when the rbio is allocated 152 */ 153 154 /* pointers to pages that we allocated for 155 * reading/writing stripes directly from the disk (including P/Q) 156 */ 157 struct page **stripe_pages; 158 159 /* 160 * pointers to the pages in the bio_list. Stored 161 * here for faster lookup 162 */ 163 struct page **bio_pages; 164 165 /* 166 * bitmap to record which horizontal stripe has data 167 */ 168 unsigned long *dbitmap; 169 170 /* allocated with real_stripes-many pointers for finish_*() calls */ 171 void **finish_pointers; 172 173 /* allocated with stripe_npages-many bits for finish_*() calls */ 174 unsigned long *finish_pbitmap; 175 }; 176 177 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 178 static noinline void finish_rmw(struct btrfs_raid_bio *rbio); 179 static void rmw_work(struct btrfs_work *work); 180 static void read_rebuild_work(struct btrfs_work *work); 181 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); 182 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); 183 static void __free_raid_bio(struct btrfs_raid_bio *rbio); 184 static void index_rbio_pages(struct btrfs_raid_bio *rbio); 185 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 186 187 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 188 int need_check); 189 static void scrub_parity_work(struct btrfs_work *work); 190 191 static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) 192 { 193 btrfs_init_work(&rbio->work, work_func, NULL, NULL); 194 btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); 195 } 196 197 /* 198 * the stripe hash table is used for locking, and to collect 199 * bios in hopes of making a full stripe 200 */ 201 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 202 { 203 struct btrfs_stripe_hash_table *table; 204 struct btrfs_stripe_hash_table *x; 205 struct btrfs_stripe_hash *cur; 206 struct btrfs_stripe_hash *h; 207 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; 208 int i; 209 210 if (info->stripe_hash_table) 211 return 0; 212 213 /* 214 * The table is large, starting with order 4 and can go as high as 215 * order 7 in case lock debugging is turned on. 216 * 217 * Try harder to allocate and fallback to vmalloc to lower the chance 218 * of a failing mount. 219 */ 220 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); 221 if (!table) 222 return -ENOMEM; 223 224 spin_lock_init(&table->cache_lock); 225 INIT_LIST_HEAD(&table->stripe_cache); 226 227 h = table->table; 228 229 for (i = 0; i < num_entries; i++) { 230 cur = h + i; 231 INIT_LIST_HEAD(&cur->hash_list); 232 spin_lock_init(&cur->lock); 233 } 234 235 x = cmpxchg(&info->stripe_hash_table, NULL, table); 236 if (x) 237 kvfree(x); 238 return 0; 239 } 240 241 /* 242 * caching an rbio means to copy anything from the 243 * bio_pages array into the stripe_pages array. We 244 * use the page uptodate bit in the stripe cache array 245 * to indicate if it has valid data 246 * 247 * once the caching is done, we set the cache ready 248 * bit. 
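 *
 * Pages that never appeared in the bio list are left alone; only the
 * freshly copied stripe pages are marked uptodate, and that uptodate
 * bit is what steal_rbio() and the rmw read path trust later on.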
249 */ 250 static void cache_rbio_pages(struct btrfs_raid_bio *rbio) 251 { 252 int i; 253 char *s; 254 char *d; 255 int ret; 256 257 ret = alloc_rbio_pages(rbio); 258 if (ret) 259 return; 260 261 for (i = 0; i < rbio->nr_pages; i++) { 262 if (!rbio->bio_pages[i]) 263 continue; 264 265 s = kmap(rbio->bio_pages[i]); 266 d = kmap(rbio->stripe_pages[i]); 267 268 copy_page(d, s); 269 270 kunmap(rbio->bio_pages[i]); 271 kunmap(rbio->stripe_pages[i]); 272 SetPageUptodate(rbio->stripe_pages[i]); 273 } 274 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 275 } 276 277 /* 278 * we hash on the first logical address of the stripe 279 */ 280 static int rbio_bucket(struct btrfs_raid_bio *rbio) 281 { 282 u64 num = rbio->bbio->raid_map[0]; 283 284 /* 285 * we shift down quite a bit. We're using byte 286 * addressing, and most of the lower bits are zeros. 287 * This tends to upset hash_64, and it consistently 288 * returns just one or two different values. 289 * 290 * shifting off the lower bits fixes things. 291 */ 292 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 293 } 294 295 /* 296 * stealing an rbio means taking all the uptodate pages from the stripe 297 * array in the source rbio and putting them into the destination rbio 298 */ 299 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) 300 { 301 int i; 302 struct page *s; 303 struct page *d; 304 305 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) 306 return; 307 308 for (i = 0; i < dest->nr_pages; i++) { 309 s = src->stripe_pages[i]; 310 if (!s || !PageUptodate(s)) { 311 continue; 312 } 313 314 d = dest->stripe_pages[i]; 315 if (d) 316 __free_page(d); 317 318 dest->stripe_pages[i] = s; 319 src->stripe_pages[i] = NULL; 320 } 321 } 322 323 /* 324 * merging means we take the bio_list from the victim and 325 * splice it into the destination. The victim should 326 * be discarded afterwards. 327 * 328 * must be called with dest->rbio_list_lock held 329 */ 330 static void merge_rbio(struct btrfs_raid_bio *dest, 331 struct btrfs_raid_bio *victim) 332 { 333 bio_list_merge(&dest->bio_list, &victim->bio_list); 334 dest->bio_list_bytes += victim->bio_list_bytes; 335 dest->generic_bio_cnt += victim->generic_bio_cnt; 336 bio_list_init(&victim->bio_list); 337 } 338 339 /* 340 * used to prune items that are in the cache. The caller 341 * must hold the hash table lock. 342 */ 343 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 344 { 345 int bucket = rbio_bucket(rbio); 346 struct btrfs_stripe_hash_table *table; 347 struct btrfs_stripe_hash *h; 348 int freeit = 0; 349 350 /* 351 * check the bit again under the hash table lock. 352 */ 353 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 354 return; 355 356 table = rbio->fs_info->stripe_hash_table; 357 h = table->table + bucket; 358 359 /* hold the lock for the bucket because we may be 360 * removing it from the hash table 361 */ 362 spin_lock(&h->lock); 363 364 /* 365 * hold the lock for the bio list because we need 366 * to make sure the bio list is empty 367 */ 368 spin_lock(&rbio->bio_list_lock); 369 370 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { 371 list_del_init(&rbio->stripe_cache); 372 table->cache_size -= 1; 373 freeit = 1; 374 375 /* if the bio list isn't empty, this rbio is 376 * still involved in an IO. We take it out 377 * of the cache list, and drop the ref that 378 * was held for the list. 
379 * 380 * If the bio_list was empty, we also remove 381 * the rbio from the hash_table, and drop 382 * the corresponding ref 383 */ 384 if (bio_list_empty(&rbio->bio_list)) { 385 if (!list_empty(&rbio->hash_list)) { 386 list_del_init(&rbio->hash_list); 387 refcount_dec(&rbio->refs); 388 BUG_ON(!list_empty(&rbio->plug_list)); 389 } 390 } 391 } 392 393 spin_unlock(&rbio->bio_list_lock); 394 spin_unlock(&h->lock); 395 396 if (freeit) 397 __free_raid_bio(rbio); 398 } 399 400 /* 401 * prune a given rbio from the cache 402 */ 403 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 404 { 405 struct btrfs_stripe_hash_table *table; 406 unsigned long flags; 407 408 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 409 return; 410 411 table = rbio->fs_info->stripe_hash_table; 412 413 spin_lock_irqsave(&table->cache_lock, flags); 414 __remove_rbio_from_cache(rbio); 415 spin_unlock_irqrestore(&table->cache_lock, flags); 416 } 417 418 /* 419 * remove everything in the cache 420 */ 421 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) 422 { 423 struct btrfs_stripe_hash_table *table; 424 unsigned long flags; 425 struct btrfs_raid_bio *rbio; 426 427 table = info->stripe_hash_table; 428 429 spin_lock_irqsave(&table->cache_lock, flags); 430 while (!list_empty(&table->stripe_cache)) { 431 rbio = list_entry(table->stripe_cache.next, 432 struct btrfs_raid_bio, 433 stripe_cache); 434 __remove_rbio_from_cache(rbio); 435 } 436 spin_unlock_irqrestore(&table->cache_lock, flags); 437 } 438 439 /* 440 * remove all cached entries and free the hash table 441 * used by unmount 442 */ 443 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 444 { 445 if (!info->stripe_hash_table) 446 return; 447 btrfs_clear_rbio_cache(info); 448 kvfree(info->stripe_hash_table); 449 info->stripe_hash_table = NULL; 450 } 451 452 /* 453 * insert an rbio into the stripe cache. It 454 * must have already been prepared by calling 455 * cache_rbio_pages 456 * 457 * If this rbio was already cached, it gets 458 * moved to the front of the lru. 459 * 460 * If the size of the rbio cache is too big, we 461 * prune an item. 462 */ 463 static void cache_rbio(struct btrfs_raid_bio *rbio) 464 { 465 struct btrfs_stripe_hash_table *table; 466 unsigned long flags; 467 468 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) 469 return; 470 471 table = rbio->fs_info->stripe_hash_table; 472 473 spin_lock_irqsave(&table->cache_lock, flags); 474 spin_lock(&rbio->bio_list_lock); 475 476 /* bump our ref if we were not in the list before */ 477 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) 478 refcount_inc(&rbio->refs); 479 480 if (!list_empty(&rbio->stripe_cache)){ 481 list_move(&rbio->stripe_cache, &table->stripe_cache); 482 } else { 483 list_add(&rbio->stripe_cache, &table->stripe_cache); 484 table->cache_size += 1; 485 } 486 487 spin_unlock(&rbio->bio_list_lock); 488 489 if (table->cache_size > RBIO_CACHE_SIZE) { 490 struct btrfs_raid_bio *found; 491 492 found = list_entry(table->stripe_cache.prev, 493 struct btrfs_raid_bio, 494 stripe_cache); 495 496 if (found != rbio) 497 __remove_rbio_from_cache(found); 498 } 499 500 spin_unlock_irqrestore(&table->cache_lock, flags); 501 } 502 503 /* 504 * helper function to run the xor_blocks api. It is only 505 * able to do MAX_XOR_BLOCKS at a time, so we need to 506 * loop through. 
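 *
 * raid5 parity is just the xor of every data block in a horizontal
 * stripe: P = D0 ^ D1 ^ ... ^ D(n-1).  As an illustrative sketch (d0-d2
 * and p stand for kmap'd page addresses), finish_rmw() uses this helper
 * for a stripe with three data pages roughly like so:
 *
 *	void *ptrs[4] = { d0, d1, d2, p };
 *
 *	copy_page(p, d0);
 *	run_xor(ptrs + 1, 2, PAGE_SIZE);
 *
 * after which p holds d0 ^ d1 ^ d2.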
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;
	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us. We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bbio->raid_map[0] !=
	    cur->bbio->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * A parity scrub has already read the full stripe from the drive;
	 * it checks and repairs the parity and writes back the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
584 */ 585 if (last->operation == BTRFS_RBIO_PARITY_SCRUB) 586 return 0; 587 588 if (last->operation == BTRFS_RBIO_REBUILD_MISSING) 589 return 0; 590 591 if (last->operation == BTRFS_RBIO_READ_REBUILD) { 592 int fa = last->faila; 593 int fb = last->failb; 594 int cur_fa = cur->faila; 595 int cur_fb = cur->failb; 596 597 if (last->faila >= last->failb) { 598 fa = last->failb; 599 fb = last->faila; 600 } 601 602 if (cur->faila >= cur->failb) { 603 cur_fa = cur->failb; 604 cur_fb = cur->faila; 605 } 606 607 if (fa != cur_fa || fb != cur_fb) 608 return 0; 609 } 610 return 1; 611 } 612 613 static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, 614 int index) 615 { 616 return stripe * rbio->stripe_npages + index; 617 } 618 619 /* 620 * these are just the pages from the rbio array, not from anything 621 * the FS sent down to us 622 */ 623 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, 624 int index) 625 { 626 return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; 627 } 628 629 /* 630 * helper to index into the pstripe 631 */ 632 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) 633 { 634 return rbio_stripe_page(rbio, rbio->nr_data, index); 635 } 636 637 /* 638 * helper to index into the qstripe, returns null 639 * if there is no qstripe 640 */ 641 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 642 { 643 if (rbio->nr_data + 1 == rbio->real_stripes) 644 return NULL; 645 return rbio_stripe_page(rbio, rbio->nr_data + 1, index); 646 } 647 648 /* 649 * The first stripe in the table for a logical address 650 * has the lock. rbios are added in one of three ways: 651 * 652 * 1) Nobody has the stripe locked yet. The rbio is given 653 * the lock and 0 is returned. The caller must start the IO 654 * themselves. 655 * 656 * 2) Someone has the stripe locked, but we're able to merge 657 * with the lock owner. The rbio is freed and the IO will 658 * start automatically along with the existing rbio. 1 is returned. 659 * 660 * 3) Someone has the stripe locked, but we're not able to merge. 661 * The rbio is added to the lock owner's plug list, or merged into 662 * an rbio already on the plug list. When the lock owner unlocks, 663 * the next rbio on the list is run and the IO is started automatically. 664 * 1 is returned 665 * 666 * If we return 0, the caller still owns the rbio and must continue with 667 * IO submission. If we return 1, the caller must assume the rbio has 668 * already been freed. 669 */ 670 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 671 { 672 struct btrfs_stripe_hash *h; 673 struct btrfs_raid_bio *cur; 674 struct btrfs_raid_bio *pending; 675 unsigned long flags; 676 struct btrfs_raid_bio *freeit = NULL; 677 struct btrfs_raid_bio *cache_drop = NULL; 678 int ret = 0; 679 680 h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio); 681 682 spin_lock_irqsave(&h->lock, flags); 683 list_for_each_entry(cur, &h->hash_list, hash_list) { 684 if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0]) 685 continue; 686 687 spin_lock(&cur->bio_list_lock); 688 689 /* Can we steal this cached rbio's pages? 
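		 *
		 * Stealing is only safe when the cached rbio is completely
		 * idle: no bios queued on it, nothing waiting on its plug
		 * list, it really is cached, and it is not locked for rmw.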
*/ 690 if (bio_list_empty(&cur->bio_list) && 691 list_empty(&cur->plug_list) && 692 test_bit(RBIO_CACHE_BIT, &cur->flags) && 693 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { 694 list_del_init(&cur->hash_list); 695 refcount_dec(&cur->refs); 696 697 steal_rbio(cur, rbio); 698 cache_drop = cur; 699 spin_unlock(&cur->bio_list_lock); 700 701 goto lockit; 702 } 703 704 /* Can we merge into the lock owner? */ 705 if (rbio_can_merge(cur, rbio)) { 706 merge_rbio(cur, rbio); 707 spin_unlock(&cur->bio_list_lock); 708 freeit = rbio; 709 ret = 1; 710 goto out; 711 } 712 713 714 /* 715 * We couldn't merge with the running rbio, see if we can merge 716 * with the pending ones. We don't have to check for rmw_locked 717 * because there is no way they are inside finish_rmw right now 718 */ 719 list_for_each_entry(pending, &cur->plug_list, plug_list) { 720 if (rbio_can_merge(pending, rbio)) { 721 merge_rbio(pending, rbio); 722 spin_unlock(&cur->bio_list_lock); 723 freeit = rbio; 724 ret = 1; 725 goto out; 726 } 727 } 728 729 /* 730 * No merging, put us on the tail of the plug list, our rbio 731 * will be started with the currently running rbio unlocks 732 */ 733 list_add_tail(&rbio->plug_list, &cur->plug_list); 734 spin_unlock(&cur->bio_list_lock); 735 ret = 1; 736 goto out; 737 } 738 lockit: 739 refcount_inc(&rbio->refs); 740 list_add(&rbio->hash_list, &h->hash_list); 741 out: 742 spin_unlock_irqrestore(&h->lock, flags); 743 if (cache_drop) 744 remove_rbio_from_cache(cache_drop); 745 if (freeit) 746 __free_raid_bio(freeit); 747 return ret; 748 } 749 750 /* 751 * called as rmw or parity rebuild is completed. If the plug list has more 752 * rbios waiting for this stripe, the next one on the list will be started 753 */ 754 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 755 { 756 int bucket; 757 struct btrfs_stripe_hash *h; 758 unsigned long flags; 759 int keep_cache = 0; 760 761 bucket = rbio_bucket(rbio); 762 h = rbio->fs_info->stripe_hash_table->table + bucket; 763 764 if (list_empty(&rbio->plug_list)) 765 cache_rbio(rbio); 766 767 spin_lock_irqsave(&h->lock, flags); 768 spin_lock(&rbio->bio_list_lock); 769 770 if (!list_empty(&rbio->hash_list)) { 771 /* 772 * if we're still cached and there is no other IO 773 * to perform, just leave this rbio here for others 774 * to steal from later 775 */ 776 if (list_empty(&rbio->plug_list) && 777 test_bit(RBIO_CACHE_BIT, &rbio->flags)) { 778 keep_cache = 1; 779 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 780 BUG_ON(!bio_list_empty(&rbio->bio_list)); 781 goto done; 782 } 783 784 list_del_init(&rbio->hash_list); 785 refcount_dec(&rbio->refs); 786 787 /* 788 * we use the plug list to hold all the rbios 789 * waiting for the chance to lock this stripe. 790 * hand the lock over to one of them. 
791 */ 792 if (!list_empty(&rbio->plug_list)) { 793 struct btrfs_raid_bio *next; 794 struct list_head *head = rbio->plug_list.next; 795 796 next = list_entry(head, struct btrfs_raid_bio, 797 plug_list); 798 799 list_del_init(&rbio->plug_list); 800 801 list_add(&next->hash_list, &h->hash_list); 802 refcount_inc(&next->refs); 803 spin_unlock(&rbio->bio_list_lock); 804 spin_unlock_irqrestore(&h->lock, flags); 805 806 if (next->operation == BTRFS_RBIO_READ_REBUILD) 807 start_async_work(next, read_rebuild_work); 808 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { 809 steal_rbio(rbio, next); 810 start_async_work(next, read_rebuild_work); 811 } else if (next->operation == BTRFS_RBIO_WRITE) { 812 steal_rbio(rbio, next); 813 start_async_work(next, rmw_work); 814 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { 815 steal_rbio(rbio, next); 816 start_async_work(next, scrub_parity_work); 817 } 818 819 goto done_nolock; 820 } 821 } 822 done: 823 spin_unlock(&rbio->bio_list_lock); 824 spin_unlock_irqrestore(&h->lock, flags); 825 826 done_nolock: 827 if (!keep_cache) 828 remove_rbio_from_cache(rbio); 829 } 830 831 static void __free_raid_bio(struct btrfs_raid_bio *rbio) 832 { 833 int i; 834 835 if (!refcount_dec_and_test(&rbio->refs)) 836 return; 837 838 WARN_ON(!list_empty(&rbio->stripe_cache)); 839 WARN_ON(!list_empty(&rbio->hash_list)); 840 WARN_ON(!bio_list_empty(&rbio->bio_list)); 841 842 for (i = 0; i < rbio->nr_pages; i++) { 843 if (rbio->stripe_pages[i]) { 844 __free_page(rbio->stripe_pages[i]); 845 rbio->stripe_pages[i] = NULL; 846 } 847 } 848 849 btrfs_put_bbio(rbio->bbio); 850 kfree(rbio); 851 } 852 853 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) 854 { 855 struct bio *next; 856 857 while (cur) { 858 next = cur->bi_next; 859 cur->bi_next = NULL; 860 cur->bi_status = err; 861 bio_endio(cur); 862 cur = next; 863 } 864 } 865 866 /* 867 * this frees the rbio and runs through all the bios in the 868 * bio_list and calls end_io on them 869 */ 870 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) 871 { 872 struct bio *cur = bio_list_get(&rbio->bio_list); 873 struct bio *extra; 874 875 if (rbio->generic_bio_cnt) 876 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); 877 878 /* 879 * At this moment, rbio->bio_list is empty, however since rbio does not 880 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the 881 * hash list, rbio may be merged with others so that rbio->bio_list 882 * becomes non-empty. 883 * Once unlock_stripe() is done, rbio->bio_list will not be updated any 884 * more and we can call bio_endio() on all queued bios. 885 */ 886 unlock_stripe(rbio); 887 extra = bio_list_get(&rbio->bio_list); 888 __free_raid_bio(rbio); 889 890 rbio_endio_bio_list(cur, err); 891 if (extra) 892 rbio_endio_bio_list(extra, err); 893 } 894 895 /* 896 * end io function used by finish_rmw. When we finally 897 * get here, we've written a full stripe 898 */ 899 static void raid_write_end_io(struct bio *bio) 900 { 901 struct btrfs_raid_bio *rbio = bio->bi_private; 902 blk_status_t err = bio->bi_status; 903 int max_errors; 904 905 if (err) 906 fail_bio_stripe(rbio, bio); 907 908 bio_put(bio); 909 910 if (!atomic_dec_and_test(&rbio->stripes_pending)) 911 return; 912 913 err = BLK_STS_OK; 914 915 /* OK, we have read all the stripes we need to. */ 916 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 
		0 : rbio->bbio->max_errors;
	if (atomic_read(&rbio->error) > max_errors)
		err = BLK_STS_IOERR;

	rbio_orig_end_io(rbio, err);
}

/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else. This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
				 int index, int pagenr, int bio_list_only)
{
	int chunk_page;
	struct page *p = NULL;

	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

	spin_lock_irq(&rbio->bio_list_lock);
	p = rbio->bio_pages[chunk_page];
	spin_unlock_irq(&rbio->bio_list_lock);

	if (p || bio_list_only)
		return p;

	return rbio->stripe_pages[chunk_page];
}

/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
}

/*
 * allocation and initial setup for the btrfs_raid_bio. Note that this
 * does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *bbio,
					 u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
	void *p;

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_pages) * num_pages +
		       sizeof(*rbio->finish_pointers) * real_stripes +
		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
		       sizeof(*rbio->finish_pbitmap) *
				BITS_TO_LONGS(stripe_npages),
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bbio = bbio;
	rbio->fs_info = fs_info;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * the stripe_pages, bio_pages, etc arrays point to the extra
	 * memory we allocated past the end of the rbio
	 */
	p = rbio + 1;
#define CONSUME_ALLOC(ptr, count)	do {				\
		ptr = p;						\
		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
	} while (0)
	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
	CONSUME_ALLOC(rbio->bio_pages, num_pages);
	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
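	/*
	 * At this point the single allocation has been carved up as
	 * (illustrative layout, one contiguous kzalloc):
	 *
	 *   [struct btrfs_raid_bio][stripe_pages][bio_pages]
	 *   [finish_pointers][dbitmap][finish_pbitmap]
	 */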
#undef CONSUME_ALLOC 1025 1026 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) 1027 nr_data = real_stripes - 1; 1028 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) 1029 nr_data = real_stripes - 2; 1030 else 1031 BUG(); 1032 1033 rbio->nr_data = nr_data; 1034 return rbio; 1035 } 1036 1037 /* allocate pages for all the stripes in the bio, including parity */ 1038 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 1039 { 1040 int i; 1041 struct page *page; 1042 1043 for (i = 0; i < rbio->nr_pages; i++) { 1044 if (rbio->stripe_pages[i]) 1045 continue; 1046 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1047 if (!page) 1048 return -ENOMEM; 1049 rbio->stripe_pages[i] = page; 1050 } 1051 return 0; 1052 } 1053 1054 /* only allocate pages for p/q stripes */ 1055 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 1056 { 1057 int i; 1058 struct page *page; 1059 1060 i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); 1061 1062 for (; i < rbio->nr_pages; i++) { 1063 if (rbio->stripe_pages[i]) 1064 continue; 1065 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1066 if (!page) 1067 return -ENOMEM; 1068 rbio->stripe_pages[i] = page; 1069 } 1070 return 0; 1071 } 1072 1073 /* 1074 * add a single page from a specific stripe into our list of bios for IO 1075 * this will try to merge into existing bios if possible, and returns 1076 * zero if all went well. 1077 */ 1078 static int rbio_add_io_page(struct btrfs_raid_bio *rbio, 1079 struct bio_list *bio_list, 1080 struct page *page, 1081 int stripe_nr, 1082 unsigned long page_index, 1083 unsigned long bio_max_len) 1084 { 1085 struct bio *last = bio_list->tail; 1086 int ret; 1087 struct bio *bio; 1088 struct btrfs_bio_stripe *stripe; 1089 u64 disk_start; 1090 1091 stripe = &rbio->bbio->stripes[stripe_nr]; 1092 disk_start = stripe->physical + (page_index << PAGE_SHIFT); 1093 1094 /* if the device is missing, just fail this stripe */ 1095 if (!stripe->dev->bdev) 1096 return fail_rbio_index(rbio, stripe_nr); 1097 1098 /* see if we can add this page onto our existing bio */ 1099 if (last) { 1100 u64 last_end = last->bi_iter.bi_sector << 9; 1101 last_end += last->bi_iter.bi_size; 1102 1103 /* 1104 * we can't merge these if they are from different 1105 * devices or if they are not contiguous 1106 */ 1107 if (last_end == disk_start && !last->bi_status && 1108 last->bi_disk == stripe->dev->bdev->bd_disk && 1109 last->bi_partno == stripe->dev->bdev->bd_partno) { 1110 ret = bio_add_page(last, page, PAGE_SIZE, 0); 1111 if (ret == PAGE_SIZE) 1112 return 0; 1113 } 1114 } 1115 1116 /* put a new bio on the list */ 1117 bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); 1118 btrfs_io_bio(bio)->device = stripe->dev; 1119 bio->bi_iter.bi_size = 0; 1120 bio_set_dev(bio, stripe->dev->bdev); 1121 bio->bi_iter.bi_sector = disk_start >> 9; 1122 1123 bio_add_page(bio, page, PAGE_SIZE, 0); 1124 bio_list_add(bio_list, bio); 1125 return 0; 1126 } 1127 1128 /* 1129 * while we're doing the read/modify/write cycle, we could 1130 * have errors in reading pages off the disk. This checks 1131 * for errors and if we're not able to read the page it'll 1132 * trigger parity reconstruction. 
The rmw will be finished 1133 * after we've reconstructed the failed stripes 1134 */ 1135 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1136 { 1137 if (rbio->faila >= 0 || rbio->failb >= 0) { 1138 BUG_ON(rbio->faila == rbio->real_stripes - 1); 1139 __raid56_parity_recover(rbio); 1140 } else { 1141 finish_rmw(rbio); 1142 } 1143 } 1144 1145 /* 1146 * helper function to walk our bio list and populate the bio_pages array with 1147 * the result. This seems expensive, but it is faster than constantly 1148 * searching through the bio list as we setup the IO in finish_rmw or stripe 1149 * reconstruction. 1150 * 1151 * This must be called before you trust the answers from page_in_rbio 1152 */ 1153 static void index_rbio_pages(struct btrfs_raid_bio *rbio) 1154 { 1155 struct bio *bio; 1156 u64 start; 1157 unsigned long stripe_offset; 1158 unsigned long page_index; 1159 1160 spin_lock_irq(&rbio->bio_list_lock); 1161 bio_list_for_each(bio, &rbio->bio_list) { 1162 struct bio_vec bvec; 1163 struct bvec_iter iter; 1164 int i = 0; 1165 1166 start = bio->bi_iter.bi_sector << 9; 1167 stripe_offset = start - rbio->bbio->raid_map[0]; 1168 page_index = stripe_offset >> PAGE_SHIFT; 1169 1170 if (bio_flagged(bio, BIO_CLONED)) 1171 bio->bi_iter = btrfs_io_bio(bio)->iter; 1172 1173 bio_for_each_segment(bvec, bio, iter) { 1174 rbio->bio_pages[page_index + i] = bvec.bv_page; 1175 i++; 1176 } 1177 } 1178 spin_unlock_irq(&rbio->bio_list_lock); 1179 } 1180 1181 /* 1182 * this is called from one of two situations. We either 1183 * have a full stripe from the higher layers, or we've read all 1184 * the missing bits off disk. 1185 * 1186 * This will calculate the parity and then send down any 1187 * changed blocks. 1188 */ 1189 static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1190 { 1191 struct btrfs_bio *bbio = rbio->bbio; 1192 void **pointers = rbio->finish_pointers; 1193 int nr_data = rbio->nr_data; 1194 int stripe; 1195 int pagenr; 1196 bool has_qstripe; 1197 struct bio_list bio_list; 1198 struct bio *bio; 1199 int ret; 1200 1201 bio_list_init(&bio_list); 1202 1203 if (rbio->real_stripes - rbio->nr_data == 1) 1204 has_qstripe = false; 1205 else if (rbio->real_stripes - rbio->nr_data == 2) 1206 has_qstripe = true; 1207 else 1208 BUG(); 1209 1210 /* at this point we either have a full stripe, 1211 * or we've read the full stripe from the drive. 1212 * recalculate the parity and write the new results. 1213 * 1214 * We're not allowed to add any new bios to the 1215 * bio list here, anyone else that wants to 1216 * change this stripe needs to do their own rmw. 1217 */ 1218 spin_lock_irq(&rbio->bio_list_lock); 1219 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1220 spin_unlock_irq(&rbio->bio_list_lock); 1221 1222 atomic_set(&rbio->error, 0); 1223 1224 /* 1225 * now that we've set rmw_locked, run through the 1226 * bio list one last time and map the page pointers 1227 * 1228 * We don't cache full rbios because we're assuming 1229 * the higher layers are unlikely to use this area of 1230 * the disk again soon. If they do use it again, 1231 * hopefully they will send another full bio. 
1232 */ 1233 index_rbio_pages(rbio); 1234 if (!rbio_is_full(rbio)) 1235 cache_rbio_pages(rbio); 1236 else 1237 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1238 1239 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1240 struct page *p; 1241 /* first collect one page from each data stripe */ 1242 for (stripe = 0; stripe < nr_data; stripe++) { 1243 p = page_in_rbio(rbio, stripe, pagenr, 0); 1244 pointers[stripe] = kmap(p); 1245 } 1246 1247 /* then add the parity stripe */ 1248 p = rbio_pstripe_page(rbio, pagenr); 1249 SetPageUptodate(p); 1250 pointers[stripe++] = kmap(p); 1251 1252 if (has_qstripe) { 1253 1254 /* 1255 * raid6, add the qstripe and call the 1256 * library function to fill in our p/q 1257 */ 1258 p = rbio_qstripe_page(rbio, pagenr); 1259 SetPageUptodate(p); 1260 pointers[stripe++] = kmap(p); 1261 1262 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 1263 pointers); 1264 } else { 1265 /* raid5 */ 1266 copy_page(pointers[nr_data], pointers[0]); 1267 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); 1268 } 1269 1270 1271 for (stripe = 0; stripe < rbio->real_stripes; stripe++) 1272 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 1273 } 1274 1275 /* 1276 * time to start writing. Make bios for everything from the 1277 * higher layers (the bio_list in our rbio) and our p/q. Ignore 1278 * everything else. 1279 */ 1280 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1281 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1282 struct page *page; 1283 if (stripe < rbio->nr_data) { 1284 page = page_in_rbio(rbio, stripe, pagenr, 1); 1285 if (!page) 1286 continue; 1287 } else { 1288 page = rbio_stripe_page(rbio, stripe, pagenr); 1289 } 1290 1291 ret = rbio_add_io_page(rbio, &bio_list, 1292 page, stripe, pagenr, rbio->stripe_len); 1293 if (ret) 1294 goto cleanup; 1295 } 1296 } 1297 1298 if (likely(!bbio->num_tgtdevs)) 1299 goto write_data; 1300 1301 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1302 if (!bbio->tgtdev_map[stripe]) 1303 continue; 1304 1305 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1306 struct page *page; 1307 if (stripe < rbio->nr_data) { 1308 page = page_in_rbio(rbio, stripe, pagenr, 1); 1309 if (!page) 1310 continue; 1311 } else { 1312 page = rbio_stripe_page(rbio, stripe, pagenr); 1313 } 1314 1315 ret = rbio_add_io_page(rbio, &bio_list, page, 1316 rbio->bbio->tgtdev_map[stripe], 1317 pagenr, rbio->stripe_len); 1318 if (ret) 1319 goto cleanup; 1320 } 1321 } 1322 1323 write_data: 1324 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); 1325 BUG_ON(atomic_read(&rbio->stripes_pending) == 0); 1326 1327 while ((bio = bio_list_pop(&bio_list))) { 1328 bio->bi_private = rbio; 1329 bio->bi_end_io = raid_write_end_io; 1330 bio->bi_opf = REQ_OP_WRITE; 1331 1332 submit_bio(bio); 1333 } 1334 return; 1335 1336 cleanup: 1337 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1338 1339 while ((bio = bio_list_pop(&bio_list))) 1340 bio_put(bio); 1341 } 1342 1343 /* 1344 * helper to find the stripe number for a given bio. Used to figure out which 1345 * stripe has failed. This expects the bio to correspond to a physical disk, 1346 * so it looks up based on physical sector numbers. 
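 *
 * Note that bio->bi_iter.bi_sector counts 512-byte sectors, which is why
 * it is shifted left by 9 below to get a byte address before comparing
 * against each stripe's physical start.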
1347 */ 1348 static int find_bio_stripe(struct btrfs_raid_bio *rbio, 1349 struct bio *bio) 1350 { 1351 u64 physical = bio->bi_iter.bi_sector; 1352 int i; 1353 struct btrfs_bio_stripe *stripe; 1354 1355 physical <<= 9; 1356 1357 for (i = 0; i < rbio->bbio->num_stripes; i++) { 1358 stripe = &rbio->bbio->stripes[i]; 1359 if (in_range(physical, stripe->physical, rbio->stripe_len) && 1360 stripe->dev->bdev && 1361 bio->bi_disk == stripe->dev->bdev->bd_disk && 1362 bio->bi_partno == stripe->dev->bdev->bd_partno) { 1363 return i; 1364 } 1365 } 1366 return -1; 1367 } 1368 1369 /* 1370 * helper to find the stripe number for a given 1371 * bio (before mapping). Used to figure out which stripe has 1372 * failed. This looks up based on logical block numbers. 1373 */ 1374 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 1375 struct bio *bio) 1376 { 1377 u64 logical = bio->bi_iter.bi_sector << 9; 1378 int i; 1379 1380 for (i = 0; i < rbio->nr_data; i++) { 1381 u64 stripe_start = rbio->bbio->raid_map[i]; 1382 1383 if (in_range(logical, stripe_start, rbio->stripe_len)) 1384 return i; 1385 } 1386 return -1; 1387 } 1388 1389 /* 1390 * returns -EIO if we had too many failures 1391 */ 1392 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 1393 { 1394 unsigned long flags; 1395 int ret = 0; 1396 1397 spin_lock_irqsave(&rbio->bio_list_lock, flags); 1398 1399 /* we already know this stripe is bad, move on */ 1400 if (rbio->faila == failed || rbio->failb == failed) 1401 goto out; 1402 1403 if (rbio->faila == -1) { 1404 /* first failure on this rbio */ 1405 rbio->faila = failed; 1406 atomic_inc(&rbio->error); 1407 } else if (rbio->failb == -1) { 1408 /* second failure on this rbio */ 1409 rbio->failb = failed; 1410 atomic_inc(&rbio->error); 1411 } else { 1412 ret = -EIO; 1413 } 1414 out: 1415 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 1416 1417 return ret; 1418 } 1419 1420 /* 1421 * helper to fail a stripe based on a physical disk 1422 * bio. 1423 */ 1424 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 1425 struct bio *bio) 1426 { 1427 int failed = find_bio_stripe(rbio, bio); 1428 1429 if (failed < 0) 1430 return -EIO; 1431 1432 return fail_rbio_index(rbio, failed); 1433 } 1434 1435 /* 1436 * this sets each page in the bio uptodate. It should only be used on private 1437 * rbio pages, nothing that comes in from the higher layers 1438 */ 1439 static void set_bio_pages_uptodate(struct bio *bio) 1440 { 1441 struct bio_vec *bvec; 1442 struct bvec_iter_all iter_all; 1443 1444 ASSERT(!bio_flagged(bio, BIO_CLONED)); 1445 1446 bio_for_each_segment_all(bvec, bio, iter_all) 1447 SetPageUptodate(bvec->bv_page); 1448 } 1449 1450 /* 1451 * end io for the read phase of the rmw cycle. All the bios here are physical 1452 * stripe bios we've read from the disk so we can recalculate the parity of the 1453 * stripe. 
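 *
 * Each completed bio drops rbio->stripes_pending; only the last bio to
 * finish looks at the error count and decides how to proceed.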
1454 * 1455 * This will usually kick off finish_rmw once all the bios are read in, but it 1456 * may trigger parity reconstruction if we had any errors along the way 1457 */ 1458 static void raid_rmw_end_io(struct bio *bio) 1459 { 1460 struct btrfs_raid_bio *rbio = bio->bi_private; 1461 1462 if (bio->bi_status) 1463 fail_bio_stripe(rbio, bio); 1464 else 1465 set_bio_pages_uptodate(bio); 1466 1467 bio_put(bio); 1468 1469 if (!atomic_dec_and_test(&rbio->stripes_pending)) 1470 return; 1471 1472 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 1473 goto cleanup; 1474 1475 /* 1476 * this will normally call finish_rmw to start our write 1477 * but if there are any failed stripes we'll reconstruct 1478 * from parity first 1479 */ 1480 validate_rbio_for_rmw(rbio); 1481 return; 1482 1483 cleanup: 1484 1485 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1486 } 1487 1488 /* 1489 * the stripe must be locked by the caller. It will 1490 * unlock after all the writes are done 1491 */ 1492 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1493 { 1494 int bios_to_read = 0; 1495 struct bio_list bio_list; 1496 int ret; 1497 int pagenr; 1498 int stripe; 1499 struct bio *bio; 1500 1501 bio_list_init(&bio_list); 1502 1503 ret = alloc_rbio_pages(rbio); 1504 if (ret) 1505 goto cleanup; 1506 1507 index_rbio_pages(rbio); 1508 1509 atomic_set(&rbio->error, 0); 1510 /* 1511 * build a list of bios to read all the missing parts of this 1512 * stripe 1513 */ 1514 for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1515 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1516 struct page *page; 1517 /* 1518 * we want to find all the pages missing from 1519 * the rbio and read them from the disk. If 1520 * page_in_rbio finds a page in the bio list 1521 * we don't need to read it off the stripe. 1522 */ 1523 page = page_in_rbio(rbio, stripe, pagenr, 1); 1524 if (page) 1525 continue; 1526 1527 page = rbio_stripe_page(rbio, stripe, pagenr); 1528 /* 1529 * the bio cache may have handed us an uptodate 1530 * page. If so, be happy and use it 1531 */ 1532 if (PageUptodate(page)) 1533 continue; 1534 1535 ret = rbio_add_io_page(rbio, &bio_list, page, 1536 stripe, pagenr, rbio->stripe_len); 1537 if (ret) 1538 goto cleanup; 1539 } 1540 } 1541 1542 bios_to_read = bio_list_size(&bio_list); 1543 if (!bios_to_read) { 1544 /* 1545 * this can happen if others have merged with 1546 * us, it means there is nothing left to read. 1547 * But if there are missing devices it may not be 1548 * safe to do the full stripe write yet. 1549 */ 1550 goto finish; 1551 } 1552 1553 /* 1554 * the bbio may be freed once we submit the last bio. Make sure 1555 * not to touch it after that 1556 */ 1557 atomic_set(&rbio->stripes_pending, bios_to_read); 1558 while ((bio = bio_list_pop(&bio_list))) { 1559 bio->bi_private = rbio; 1560 bio->bi_end_io = raid_rmw_end_io; 1561 bio->bi_opf = REQ_OP_READ; 1562 1563 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 1564 1565 submit_bio(bio); 1566 } 1567 /* the actual write will happen once the reads are done */ 1568 return 0; 1569 1570 cleanup: 1571 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1572 1573 while ((bio = bio_list_pop(&bio_list))) 1574 bio_put(bio); 1575 1576 return -EIO; 1577 1578 finish: 1579 validate_rbio_for_rmw(rbio); 1580 return 0; 1581 } 1582 1583 /* 1584 * if the upper layers pass in a full stripe, we thank them by only allocating 1585 * enough pages to hold the parity, and sending it all down quickly. 
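 *
 * In that case every data page is already supplied by the bio list, so
 * nothing needs to be read back from disk first; only the p/q pages have
 * to be allocated before finish_rmw() computes and writes the parity.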
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = alloc_rbio_parity_pages(rbio);
	if (ret) {
		__free_raid_bio(rbio);
		return ret;
	}

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		finish_rmw(rbio);
	return 0;
}

/*
 * partial stripe writes get handed over to async helpers.
 * We're really hoping to merge a few more writes into this
 * rbio before calculating new parity
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		start_async_work(rbio, rmw_work);
	return 0;
}

/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come in to create
 * a full stripe. So we do a check here to see if we can
 * go directly to finish_rmw
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	/* head off into rmw land if we don't have a full stripe */
	if (!rbio_is_full(rbio))
		return partial_stripe_write(rbio);
	return full_stripe_write(rbio);
}

/*
 * We use plugging callbacks to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list. When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
	struct btrfs_work work;
};

/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						 plug_list);
	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						 plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}

static void run_plug(struct btrfs_plug_cb *plug)
{
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	/*
	 * sort our plug list then try to merge
	 * everything we can in hopes of creating full
	 * stripes.
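	 *
	 * plug_cmp() orders the rbios by the logical sector of their first
	 * bio, so writes that land in the same full stripe end up next to
	 * each other and can be merged below.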
1674 */ 1675 list_sort(NULL, &plug->rbio_list, plug_cmp); 1676 while (!list_empty(&plug->rbio_list)) { 1677 cur = list_entry(plug->rbio_list.next, 1678 struct btrfs_raid_bio, plug_list); 1679 list_del_init(&cur->plug_list); 1680 1681 if (rbio_is_full(cur)) { 1682 int ret; 1683 1684 /* we have a full stripe, send it down */ 1685 ret = full_stripe_write(cur); 1686 BUG_ON(ret); 1687 continue; 1688 } 1689 if (last) { 1690 if (rbio_can_merge(last, cur)) { 1691 merge_rbio(last, cur); 1692 __free_raid_bio(cur); 1693 continue; 1694 1695 } 1696 __raid56_parity_write(last); 1697 } 1698 last = cur; 1699 } 1700 if (last) { 1701 __raid56_parity_write(last); 1702 } 1703 kfree(plug); 1704 } 1705 1706 /* 1707 * if the unplug comes from schedule, we have to push the 1708 * work off to a helper thread 1709 */ 1710 static void unplug_work(struct btrfs_work *work) 1711 { 1712 struct btrfs_plug_cb *plug; 1713 plug = container_of(work, struct btrfs_plug_cb, work); 1714 run_plug(plug); 1715 } 1716 1717 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 1718 { 1719 struct btrfs_plug_cb *plug; 1720 plug = container_of(cb, struct btrfs_plug_cb, cb); 1721 1722 if (from_schedule) { 1723 btrfs_init_work(&plug->work, unplug_work, NULL, NULL); 1724 btrfs_queue_work(plug->info->rmw_workers, 1725 &plug->work); 1726 return; 1727 } 1728 run_plug(plug); 1729 } 1730 1731 /* 1732 * our main entry point for writes from the rest of the FS. 1733 */ 1734 int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, 1735 struct btrfs_bio *bbio, u64 stripe_len) 1736 { 1737 struct btrfs_raid_bio *rbio; 1738 struct btrfs_plug_cb *plug = NULL; 1739 struct blk_plug_cb *cb; 1740 int ret; 1741 1742 rbio = alloc_rbio(fs_info, bbio, stripe_len); 1743 if (IS_ERR(rbio)) { 1744 btrfs_put_bbio(bbio); 1745 return PTR_ERR(rbio); 1746 } 1747 bio_list_add(&rbio->bio_list, bio); 1748 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1749 rbio->operation = BTRFS_RBIO_WRITE; 1750 1751 btrfs_bio_counter_inc_noblocked(fs_info); 1752 rbio->generic_bio_cnt = 1; 1753 1754 /* 1755 * don't plug on full rbios, just get them out the door 1756 * as quickly as we can 1757 */ 1758 if (rbio_is_full(rbio)) { 1759 ret = full_stripe_write(rbio); 1760 if (ret) 1761 btrfs_bio_counter_dec(fs_info); 1762 return ret; 1763 } 1764 1765 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); 1766 if (cb) { 1767 plug = container_of(cb, struct btrfs_plug_cb, cb); 1768 if (!plug->info) { 1769 plug->info = fs_info; 1770 INIT_LIST_HEAD(&plug->rbio_list); 1771 } 1772 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1773 ret = 0; 1774 } else { 1775 ret = __raid56_parity_write(rbio); 1776 if (ret) 1777 btrfs_bio_counter_dec(fs_info); 1778 } 1779 return ret; 1780 } 1781 1782 /* 1783 * all parity reconstruction happens here. We've read in everything 1784 * we can find from the drives and this does the heavy lifting of 1785 * sorting the good from the bad. 
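 *
 * A single missing data block, or a data block missing together with Q,
 * is rebuilt with a plain xor of P and the surviving data blocks (the
 * pstripe: path below).  raid6_datap_recov() handles a data block lost
 * together with P, and raid6_2data_recov() handles two lost data blocks.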
1786 */ 1787 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 1788 { 1789 int pagenr, stripe; 1790 void **pointers; 1791 int faila = -1, failb = -1; 1792 struct page *page; 1793 blk_status_t err; 1794 int i; 1795 1796 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1797 if (!pointers) { 1798 err = BLK_STS_RESOURCE; 1799 goto cleanup_io; 1800 } 1801 1802 faila = rbio->faila; 1803 failb = rbio->failb; 1804 1805 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1806 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 1807 spin_lock_irq(&rbio->bio_list_lock); 1808 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1809 spin_unlock_irq(&rbio->bio_list_lock); 1810 } 1811 1812 index_rbio_pages(rbio); 1813 1814 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1815 /* 1816 * Now we just use bitmap to mark the horizontal stripes in 1817 * which we have data when doing parity scrub. 1818 */ 1819 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 1820 !test_bit(pagenr, rbio->dbitmap)) 1821 continue; 1822 1823 /* setup our array of pointers with pages 1824 * from each stripe 1825 */ 1826 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1827 /* 1828 * if we're rebuilding a read, we have to use 1829 * pages from the bio list 1830 */ 1831 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1832 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 1833 (stripe == faila || stripe == failb)) { 1834 page = page_in_rbio(rbio, stripe, pagenr, 0); 1835 } else { 1836 page = rbio_stripe_page(rbio, stripe, pagenr); 1837 } 1838 pointers[stripe] = kmap(page); 1839 } 1840 1841 /* all raid6 handling here */ 1842 if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { 1843 /* 1844 * single failure, rebuild from parity raid5 1845 * style 1846 */ 1847 if (failb < 0) { 1848 if (faila == rbio->nr_data) { 1849 /* 1850 * Just the P stripe has failed, without 1851 * a bad data or Q stripe. 1852 * TODO, we should redo the xor here. 1853 */ 1854 err = BLK_STS_IOERR; 1855 goto cleanup; 1856 } 1857 /* 1858 * a single failure in raid6 is rebuilt 1859 * in the pstripe code below 1860 */ 1861 goto pstripe; 1862 } 1863 1864 /* make sure our ps and qs are in order */ 1865 if (faila > failb) 1866 swap(faila, failb); 1867 1868 /* if the q stripe is failed, do a pstripe reconstruction 1869 * from the xors. 1870 * If both the q stripe and the P stripe are failed, we're 1871 * here due to a crc mismatch and we can't give them the 1872 * data they want 1873 */ 1874 if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { 1875 if (rbio->bbio->raid_map[faila] == 1876 RAID5_P_STRIPE) { 1877 err = BLK_STS_IOERR; 1878 goto cleanup; 1879 } 1880 /* 1881 * otherwise we have one bad data stripe and 1882 * a good P stripe. raid5! 
1883 */ 1884 goto pstripe; 1885 } 1886 1887 if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { 1888 raid6_datap_recov(rbio->real_stripes, 1889 PAGE_SIZE, faila, pointers); 1890 } else { 1891 raid6_2data_recov(rbio->real_stripes, 1892 PAGE_SIZE, faila, failb, 1893 pointers); 1894 } 1895 } else { 1896 void *p; 1897 1898 /* rebuild from P stripe here (raid5 or raid6) */ 1899 BUG_ON(failb != -1); 1900 pstripe: 1901 /* Copy parity block into failed block to start with */ 1902 copy_page(pointers[faila], pointers[rbio->nr_data]); 1903 1904 /* rearrange the pointer array */ 1905 p = pointers[faila]; 1906 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) 1907 pointers[stripe] = pointers[stripe + 1]; 1908 pointers[rbio->nr_data - 1] = p; 1909 1910 /* xor in the rest */ 1911 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE); 1912 } 1913 /* if we're doing this rebuild as part of an rmw, go through 1914 * and set all of our private rbio pages in the 1915 * failed stripes as uptodate. This way finish_rmw will 1916 * know they can be trusted. If this was a read reconstruction, 1917 * other endio functions will fiddle the uptodate bits 1918 */ 1919 if (rbio->operation == BTRFS_RBIO_WRITE) { 1920 for (i = 0; i < rbio->stripe_npages; i++) { 1921 if (faila != -1) { 1922 page = rbio_stripe_page(rbio, faila, i); 1923 SetPageUptodate(page); 1924 } 1925 if (failb != -1) { 1926 page = rbio_stripe_page(rbio, failb, i); 1927 SetPageUptodate(page); 1928 } 1929 } 1930 } 1931 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1932 /* 1933 * if we're rebuilding a read, we have to use 1934 * pages from the bio list 1935 */ 1936 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1937 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 1938 (stripe == faila || stripe == failb)) { 1939 page = page_in_rbio(rbio, stripe, pagenr, 0); 1940 } else { 1941 page = rbio_stripe_page(rbio, stripe, pagenr); 1942 } 1943 kunmap(page); 1944 } 1945 } 1946 1947 err = BLK_STS_OK; 1948 cleanup: 1949 kfree(pointers); 1950 1951 cleanup_io: 1952 /* 1953 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a 1954 * valid rbio which is consistent with ondisk content, thus such a 1955 * valid rbio can be cached to avoid further disk reads. 1956 */ 1957 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1958 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 1959 /* 1960 * - In case of two failures, where rbio->failb != -1: 1961 * 1962 * Do not cache this rbio since the above read reconstruction 1963 * (raid6_datap_recov() or raid6_2data_recov()) may have 1964 * changed some content of stripes which are not identical to 1965 * on-disk content any more, otherwise, a later write/recover 1966 * may steal stripe_pages from this rbio and end up with 1967 * corruptions or rebuild failures. 1968 * 1969 * - In case of single failure, where rbio->failb == -1: 1970 * 1971 * Cache this rbio iff the above read reconstruction is 1972 * executed without problems. 
1973 */ 1974 if (err == BLK_STS_OK && rbio->failb < 0) 1975 cache_rbio_pages(rbio); 1976 else 1977 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1978 1979 rbio_orig_end_io(rbio, err); 1980 } else if (err == BLK_STS_OK) { 1981 rbio->faila = -1; 1982 rbio->failb = -1; 1983 1984 if (rbio->operation == BTRFS_RBIO_WRITE) 1985 finish_rmw(rbio); 1986 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) 1987 finish_parity_scrub(rbio, 0); 1988 else 1989 BUG(); 1990 } else { 1991 rbio_orig_end_io(rbio, err); 1992 } 1993 } 1994 1995 /* 1996 * This is called only for stripes we've read from disk to 1997 * reconstruct the parity. 1998 */ 1999 static void raid_recover_end_io(struct bio *bio) 2000 { 2001 struct btrfs_raid_bio *rbio = bio->bi_private; 2002 2003 /* 2004 * we only read stripe pages off the disk, set them 2005 * up to date if there were no errors 2006 */ 2007 if (bio->bi_status) 2008 fail_bio_stripe(rbio, bio); 2009 else 2010 set_bio_pages_uptodate(bio); 2011 bio_put(bio); 2012 2013 if (!atomic_dec_and_test(&rbio->stripes_pending)) 2014 return; 2015 2016 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 2017 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2018 else 2019 __raid_recover_end_io(rbio); 2020 } 2021 2022 /* 2023 * reads everything we need off the disk to reconstruct 2024 * the parity. endio handlers trigger final reconstruction 2025 * when the IO is done. 2026 * 2027 * This is used both for reads from the higher layers and for 2028 * parity construction required to finish a rmw cycle. 2029 */ 2030 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 2031 { 2032 int bios_to_read = 0; 2033 struct bio_list bio_list; 2034 int ret; 2035 int pagenr; 2036 int stripe; 2037 struct bio *bio; 2038 2039 bio_list_init(&bio_list); 2040 2041 ret = alloc_rbio_pages(rbio); 2042 if (ret) 2043 goto cleanup; 2044 2045 atomic_set(&rbio->error, 0); 2046 2047 /* 2048 * read everything that hasn't failed. Thanks to the 2049 * stripe cache, it is possible that some or all of these 2050 * pages are going to be uptodate. 2051 */ 2052 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 2053 if (rbio->faila == stripe || rbio->failb == stripe) { 2054 atomic_inc(&rbio->error); 2055 continue; 2056 } 2057 2058 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 2059 struct page *p; 2060 2061 /* 2062 * the rmw code may have already read this 2063 * page in 2064 */ 2065 p = rbio_stripe_page(rbio, stripe, pagenr); 2066 if (PageUptodate(p)) 2067 continue; 2068 2069 ret = rbio_add_io_page(rbio, &bio_list, 2070 rbio_stripe_page(rbio, stripe, pagenr), 2071 stripe, pagenr, rbio->stripe_len); 2072 if (ret < 0) 2073 goto cleanup; 2074 } 2075 } 2076 2077 bios_to_read = bio_list_size(&bio_list); 2078 if (!bios_to_read) { 2079 /* 2080 * we might have no bios to read just because the pages 2081 * were up to date, or we might have no bios to read because 2082 * the devices were gone. 2083 */ 2084 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { 2085 __raid_recover_end_io(rbio); 2086 return 0; 2087 } else { 2088 goto cleanup; 2089 } 2090 } 2091 2092 /* 2093 * the bbio may be freed once we submit the last bio. 
/*
 * the main entry point for reads from the higher layers. This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */
int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
			  struct btrfs_bio *bbio, u64 stripe_len,
			  int mirror_num, int generic_io)
{
	struct btrfs_raid_bio *rbio;
	int ret;

	if (generic_io) {
		ASSERT(bbio->mirror_num == mirror_num);
		btrfs_io_bio(bio)->mirror_num = mirror_num;
	}

	rbio = alloc_rbio(fs_info, bbio, stripe_len);
	if (IS_ERR(rbio)) {
		if (generic_io)
			btrfs_put_bbio(bbio);
		return PTR_ERR(rbio);
	}

	rbio->operation = BTRFS_RBIO_READ_REBUILD;
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		btrfs_warn(fs_info,
"%s could not find the bad stripe in raid56, so we cannot recover it (bio has logical %llu len %llu, bbio has map_type %llu)",
			   __func__, bio->bi_iter.bi_sector << 9,
			   (u64)bio->bi_iter.bi_size, bbio->map_type);
		if (generic_io)
			btrfs_put_bbio(bbio);
		kfree(rbio);
		return -EIO;
	}

	if (generic_io) {
		btrfs_bio_counter_inc_noblocked(fs_info);
		rbio->generic_bio_cnt = 1;
	} else {
		btrfs_get_bbio(bbio);
	}

	/*
	 * Loop retry:
	 * for 'mirror == 2', reconstruct from all other stripes.
	 * for 'mirror_num > 2', select a stripe to fail on every retry.
	 */
	if (mirror_num > 2) {
		/*
		 * 'mirror == 3' is to fail the p stripe and
		 * reconstruct from the q stripe.  'mirror > 3' is to
		 * fail a data stripe and reconstruct from p+q stripe.
		 */
		rbio->failb = rbio->real_stripes - (mirror_num - 1);
		ASSERT(rbio->failb > 0);
		if (rbio->failb <= rbio->faila)
			rbio->failb--;
	}

	ret = lock_stripe_add(rbio);

	/*
	 * __raid56_parity_recover will end the bio with
	 * any errors it hits.  We don't want to return
	 * its error value up the stack because our caller
	 * will end up calling bio_endio with any nonzero
	 * return
	 */
	if (ret == 0)
		__raid56_parity_recover(rbio);
	/*
	 * our rbio has been added to the list of
	 * rbios that will be handled after the
	 * current lock owner is done
	 */
	return 0;
}

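/*
 * Illustration only (not compiled): the mirror_num -> failb mapping used by
 * the retry logic above, for a hypothetical RAID6 layout with 4 data stripes
 * (real_stripes == 6, P at index 4, Q at index 5).  demo_pick_failb() is not
 * a btrfs function; it just repeats the arithmetic so the retry sequence is
 * easy to see.
 */
#if 0
static int demo_pick_failb(int real_stripes, int mirror_num, int faila)
{
	int failb = real_stripes - (mirror_num - 1);

	if (failb <= faila)
		failb--;	/* skip over the stripe that already failed */
	return failb;
}

/*
 * With faila == 1:
 *   mirror 3 -> failb 4 (P): rebuild from Q
 *   mirror 4 -> failb 3    : fail a data stripe, rebuild from P+Q
 *   mirror 5 -> failb 2    : next data stripe
 *   mirror 6 -> failb 0    : 6 - 5 == 1 collides with faila, decremented to 0
 */
#endif
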
static void rmw_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}

/*
 * The following code is used to scrub/replace the parity stripe
 *
 * Caller must have already increased bio_counter for getting @bbio.
 *
 * Note: We need to make sure all the pages that are added to the scrub/replace
 * raid bio are correct and are not changed during the scrub/replace; that is,
 * those pages hold only metadata or file data covered by a checksum.
 */

struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
			       struct btrfs_bio *bbio, u64 stripe_len,
			       struct btrfs_device *scrub_dev,
			       unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(fs_info, bbio, stripe_len);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/*
	 * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
	 * to the end position, so this search can start from the first parity
	 * stripe.
	 */
	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
		if (bbio->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}
	ASSERT(i < rbio->real_stripes);

	/* For now we only support the case where sectorsize equals PAGE_SIZE */
	ASSERT(fs_info->sectorsize == PAGE_SIZE);
	ASSERT(rbio->stripe_npages == stripe_nsectors);
	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);

	/*
	 * We have already increased bio_counter when getting bbio, record it
	 * so we can free it at rbio_orig_end_io().
	 */
	rbio->generic_bio_cnt = 1;

	return rbio;
}

/* Used for both parity scrub and missing. */
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
			    u64 logical)
{
	int stripe_offset;
	int index;

	ASSERT(logical >= rbio->bbio->raid_map[0]);
	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
				rbio->stripe_len * rbio->nr_data);
	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
	index = stripe_offset >> PAGE_SHIFT;
	rbio->bio_pages[index] = page;
}

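/*
 * Illustration only (not compiled): the logical -> bio_pages index mapping
 * used by raid56_add_scrub_pages() above.  The data portion of a full stripe
 * is addressed as one contiguous range starting at raid_map[0], so the page
 * index is simply the byte offset into that range shifted by PAGE_SHIFT.
 * The numbers below are made up for the example (64K stripes, 4K pages).
 */
#if 0
static int demo_scrub_page_index(unsigned long long logical,
				 unsigned long long raid_map0,
				 unsigned int page_shift)
{
	return (int)((logical - raid_map0) >> page_shift);
}

/*
 * e.g. raid_map[0] == 1048576, logical == 1118208, PAGE_SHIFT == 12:
 * the offset is 69632 bytes -> index 17, i.e. page 1 of the second data
 * stripe when stripe_len == 64K (16 pages per stripe).
 */
#endif
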
/*
 * We only scrub parity for the horizontal stripes where we have correct data,
 * so we don't need to allocate pages for every stripe.
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int bit;
	int index;
	struct page *page;

	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
		for (i = 0; i < rbio->real_stripes; i++) {
			index = i * rbio->stripe_npages + bit;
			if (rbio->stripe_pages[index])
				continue;

			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
			if (!page)
				return -ENOMEM;
			rbio->stripe_pages[index] = page;
		}
	}
	return 0;
}

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check)
{
	struct btrfs_bio *bbio = rbio->bbio;
	void **pointers = rbio->finish_pointers;
	unsigned long *pbitmap = rbio->finish_pbitmap;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	bool has_qstripe;
	struct page *p_page = NULL;
	struct page *q_page = NULL;
	struct bio_list bio_list;
	struct bio *bio;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
		is_replace = 1;
		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
	}

	/*
	 * The higher layers (the scrubber) are unlikely to use this area of
	 * the disk again soon, so don't cache it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	if (!need_check)
		goto writeback;

	p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (!p_page)
		goto cleanup;
	SetPageUptodate(p_page);

	if (has_qstripe) {
		q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!q_page) {
			__free_page(p_page);
			goto cleanup;
		}
		SetPageUptodate(q_page);
	}

	atomic_set(&rbio->error, 0);

	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *p;
		void *parity;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap(p);
		}

		/* then add the parity stripe */
		pointers[stripe++] = kmap(p_page);

		if (has_qstripe) {
			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			pointers[stripe++] = kmap(q_page);

			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
			copy_page(pointers[nr_data], pointers[0]);
			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
		}

		/* Check scrubbing parity and repair it */
		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		parity = kmap(p);
		if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
			copy_page(parity, pointers[rbio->scrubp]);
		else
			/* Parity is right, no need to write it back */
			bitmap_clear(rbio->dbitmap, pagenr, 1);
		kunmap(p);

		for (stripe = 0; stripe < nr_data; stripe++)
			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
		kunmap(p_page);
	}

	__free_page(p_page);
	if (q_page)
		__free_page(q_page);

writeback:
	/*
	 * time to start writing. Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q. Ignore
	 * everything else.
	 */
	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list,
			       page, rbio->scrubp, pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list, page,
				       bbio->tgtdev_map[rbio->scrubp],
				       pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

submit_write:
	nr_data = bio_list_size(&bio_list);
	if (!nr_data) {
		/* Every parity is right */
		rbio_orig_end_io(rbio, BLK_STS_OK);
		return;
	}

	atomic_set(&rbio->stripes_pending, nr_data);

	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_end_io;
		bio->bi_opf = REQ_OP_WRITE;

		submit_bio(bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}

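/*
 * Illustration only (not compiled): a sketch of the check-and-repair step in
 * finish_parity_scrub() above, reduced to the RAID5 case.  Expected parity is
 * recomputed from the data blocks into a scratch buffer, compared against the
 * parity read from disk, and only mismatching pages are repaired and marked
 * for writeback (matching pages get their dbitmap bit cleared in the real
 * code).  demo_scrub_parity_page() is hypothetical, not a btrfs function.
 */
#if 0
#include <string.h>

#define DEMO_BLOCK_SIZE 4096

/* Return 1 if the on-disk parity had to be repaired, 0 if it was correct. */
static int demo_scrub_parity_page(unsigned char **data, int nr_data,
				  unsigned char *ondisk_parity,
				  unsigned char *scratch)
{
	int stripe;
	size_t i;

	/* recompute expected parity: XOR of all data blocks */
	memcpy(scratch, data[0], DEMO_BLOCK_SIZE);
	for (stripe = 1; stripe < nr_data; stripe++)
		for (i = 0; i < DEMO_BLOCK_SIZE; i++)
			scratch[i] ^= data[stripe][i];

	if (memcmp(ondisk_parity, scratch, DEMO_BLOCK_SIZE) == 0)
		return 0;	/* parity is right, no writeback needed */

	memcpy(ondisk_parity, scratch, DEMO_BLOCK_SIZE);
	return 1;		/* repaired, must be written back */
}
#endif
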
static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}

/*
 * While we're doing the parity check and repair, we could have errors
 * in reading pages off the disk. This checks for errors and if we're
 * not able to read the page it'll trigger parity reconstruction. The
 * parity scrub will be finished after we've reconstructed the failed
 * stripes
 */
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		goto cleanup;

	if (rbio->faila >= 0 || rbio->failb >= 0) {
		int dfail = 0, failp = -1;

		if (is_data_stripe(rbio, rbio->faila))
			dfail++;
		else if (is_parity_stripe(rbio->faila))
			failp = rbio->faila;

		if (is_data_stripe(rbio, rbio->failb))
			dfail++;
		else if (is_parity_stripe(rbio->failb))
			failp = rbio->failb;

		/*
		 * Because we cannot use the parity that is being scrubbed to
		 * repair data, our repair capability is reduced.  (In the
		 * RAID5 case we cannot repair anything.)
		 */
		if (dfail > rbio->bbio->max_errors - 1)
			goto cleanup;

		/*
		 * If all the data is good and only the parity is wrong, just
		 * repair the parity.
		 */
		if (dfail == 0) {
			finish_parity_scrub(rbio, 0);
			return;
		}

		/*
		 * At this point we have one corrupted data stripe and one
		 * corrupted parity stripe (RAID6 only).  If the corrupted
		 * parity is the one being scrubbed, we can luckily use the
		 * other parity to repair the data; otherwise the data stripe
		 * cannot be repaired.
		 */
		if (failp != rbio->scrubp)
			goto cleanup;

		__raid_recover_end_io(rbio);
	} else {
		finish_parity_scrub(rbio, 1);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);
}

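/*
 * Illustration only (not compiled): the decision table implemented by
 * validate_rbio_for_parity_scrub() above, written out as a hypothetical
 * helper.  dfail is the number of failed data stripes, failp the failed
 * parity stripe (or -1), scrubp the parity being scrubbed and max_errors the
 * redundancy of the profile (1 for RAID5, 2 for RAID6).
 */
#if 0
enum demo_scrub_action {
	DEMO_SCRUB_FAIL,	/* give up, end the rbio with an error */
	DEMO_SCRUB_WRITE,	/* data is fine, just rewrite the parity */
	DEMO_SCRUB_RECOVER,	/* rebuild data first, then finish the scrub */
};

static enum demo_scrub_action demo_scrub_decide(int dfail, int failp,
						int scrubp, int max_errors)
{
	/* the parity being scrubbed cannot be used to repair data */
	if (dfail > max_errors - 1)
		return DEMO_SCRUB_FAIL;

	if (dfail == 0)
		return DEMO_SCRUB_WRITE;

	/*
	 * at least one bad data stripe: recovery is only attempted when the
	 * failed parity is the one being scrubbed, so that the remaining
	 * parity can be trusted
	 */
	return failp == scrubp ? DEMO_SCRUB_RECOVER : DEMO_SCRUB_FAIL;
}
#endif
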
/*
 * end io for the read phase of the parity scrub.  All the bios here are
 * physical stripe bios we've read from the disk so we can recalculate the
 * parity of the stripe.
 *
 * This will usually kick off finish_parity_scrub once all the bios are read
 * in, but it may trigger parity reconstruction if we had any errors along
 * the way
 */
static void raid56_parity_scrub_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	/*
	 * this will normally call finish_parity_scrub to start our write
	 * but if there are any failed stripes we'll reconstruct
	 * from parity first
	 */
	validate_rbio_for_parity_scrub(rbio);
}

static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
			struct page *page;
			/*
			 * we want to find all the pages missing from
			 * the rbio and read them from the disk.  If
			 * page_in_rbio finds a page in the bio list
			 * we don't need to read it off the stripe.
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			/*
			 * the bio cache may have handed us an uptodate
			 * page.  If so, be happy and use it
			 */
			if (PageUptodate(page))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list, page,
				       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * the bbio may be freed once we submit the last bio. Make sure
	 * not to touch it after that
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid56_parity_scrub_end_io;
		bio->bi_opf = REQ_OP_READ;

		btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);

		submit_bio(bio);
	}
	/* the actual write will happen once the reads are done */
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return;

finish:
	validate_rbio_for_parity_scrub(rbio);
}

static void scrub_parity_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_parity_scrub_stripe(rbio);
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, scrub_parity_work);
}

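/*
 * Illustration only (not compiled): the page filter used when building the
 * scrub read list above.  Only horizontal stripes marked in dbitmap are
 * considered, and a page is read from disk only if it is neither supplied by
 * the bio list nor already uptodate from the stripe cache.
 * demo_scrub_need_read() is hypothetical, not a btrfs function.
 */
#if 0
static int demo_scrub_need_read(int dbit_set, int in_bio_list, int uptodate)
{
	if (!dbit_set)
		return 0;	/* nothing to scrub on this horizontal stripe */
	if (in_bio_list)
		return 0;	/* the caller already supplied this page */
	if (uptodate)
		return 0;	/* stripe cache hit */
	return 1;		/* must be read from disk */
}
#endif
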
/* The following code is used for dev replace of a missing RAID 5/6 device. */

struct btrfs_raid_bio *
raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
			  struct btrfs_bio *bbio, u64 length)
{
	struct btrfs_raid_bio *rbio;

	rbio = alloc_rbio(fs_info, bbio, length);
	if (IS_ERR(rbio))
		return NULL;

	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make this rbio similar to the other types
	 */
	ASSERT(!bio->bi_iter.bi_size);

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		BUG();
		kfree(rbio);
		return NULL;
	}

	/*
	 * When we get bbio, we have already increased bio_counter, record it
	 * so we can free it at rbio_orig_end_io()
	 */
	rbio->generic_bio_cnt = 1;

	return rbio;
}

void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, read_rebuild_work);
}
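
/*
 * Illustration only (not compiled): roughly how a caller is expected to drive
 * the missing-device helpers above.  A zero-length bio carries the completion
 * handler, the pages that should end up on the rebuilt stripe are attached
 * with raid56_add_scrub_pages(), and the rebuild is kicked off with
 * raid56_submit_missing_rbio().  This is a simplified sketch with error
 * handling and the surrounding scrub context omitted.
 */
#if 0
static void demo_rebuild_missing(struct btrfs_fs_info *fs_info,
				 struct btrfs_bio *bbio, u64 length,
				 struct bio *bio /* zero length, endio set */,
				 struct page **pages, u64 *logicals,
				 int nr_pages)
{
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
	if (!rbio)
		return;

	for (i = 0; i < nr_pages; i++)
		raid56_add_scrub_pages(rbio, pages[i], logicals[i]);

	raid56_submit_missing_rbio(rbio);
}
#endif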