// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

#define RBIO_CACHE_SIZE 1024

#define BTRFS_STRIPE_HASH_TABLE_BITS 11

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash {
	struct list_head hash_list;
	spinlock_t lock;
};

/* Used by the raid56 code to lock stripes for read/modify/write */
struct btrfs_stripe_hash_table {
	struct list_head stripe_cache;
	spinlock_t cache_lock;
	int cache_size;
	struct btrfs_stripe_hash table[];
};

enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};

struct btrfs_raid_bio {
	struct btrfs_fs_info *fs_info;
	struct btrfs_bio *bbio;

	/* while we're doing rmw on a stripe
	 * we put it into a hash table so we can
	 * lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * LRU list for the stripe cache
	 */
	struct list_head stripe_cache;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used
	 * to add more bios into the stripe
	 * in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/* also protected by the bio_list_lock, the
	 * plug list is used by the plugging code
	 * to collect partial bios while plugged.  The
	 * stripe locking code also uses it to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio
	 */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	int real_stripes;

	int stripe_npages;
	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
	enum btrfs_rbio_ops operation;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

	int scrubp;
	/*
	 * number of pages needed to represent the full
	 * stripe
	 */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list.  This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

	int generic_bio_cnt;

	refcount_t refs;

	atomic_t stripes_pending;

	atomic_t error;
	/*
	 * these are two arrays of pointers.  We allocate the
	 * rbio big enough to hold them both and setup their
	 * locations when the rbio is allocated
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/*
	 * pointers to the pages in the bio_list.  Stored
	 * here for faster lookup
	 */
	struct page **bio_pages;

	/*
	 * bitmap to record which horizontal stripe has data
	 */
	unsigned long *dbitmap;

	/* allocated with real_stripes-many pointers for finish_*() calls */
	void **finish_pointers;

	/* allocated with stripe_npages-many bits for finish_*() calls */
	unsigned long *finish_pbitmap;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void scrub_parity_work(struct btrfs_work *work);

static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
	btrfs_init_work(&rbio->work, work_func, NULL, NULL);
	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
}

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	kvfree(x);
	return 0;
}

/*
 * caching an rbio means to copy anything from the
 * bio_pages array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (!rbio->bio_pages[i])
			continue;

		copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
		SetPageUptodate(rbio->stripe_pages[i]);
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->bbio->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

/*
 * stealing an rbio means taking all the uptodate pages from the stripe
 * array in the source rbio and putting them into the destination rbio
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;
	struct page *d;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !PageUptodate(s))
			continue;

		d = dest->stripe_pages[i];
		if (d)
			__free_page(d);

		dest->stripe_pages[i] = s;
		src->stripe_pages[i] = NULL;
	}
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				refcount_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		__free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	kvfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		refcount_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
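 *
 * The destination buffer is passed implicitly as pages[src_cnt]; the callers
 * below arrange the pointer array so the page being (re)computed sits right
 * after the source pages.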
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;
	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bbio->raid_map[0] !=
	    cur->bbio->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * A parity scrub reads the full stripe from the drive, then
	 * checks and repairs the parity and writes the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
		return 0;

	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
		int fa = last->faila;
		int fb = last->failb;
		int cur_fa = cur->faila;
		int cur_fb = cur->failb;

		if (last->faila >= last->failb) {
			fa = last->failb;
			fb = last->faila;
		}

		if (cur->faila >= cur->failb) {
			cur_fa = cur->failb;
			cur_fb = cur->faila;
		}

		if (fa != cur_fa || fb != cur_fb)
			return 0;
	}
	return 1;
}

static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
				  int index)
{
	return stripe * rbio->stripe_npages + index;
}

/*
 * these are just the pages from the rbio array, not from anything
 * the FS sent down to us
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
				     int index)
{
	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
}

/*
 * helper to index into the pstripe
 */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	return rbio_stripe_page(rbio, rbio->nr_data, index);
}

/*
 * helper to index into the qstripe, returns null
 * if there is no qstripe
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;
	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash *h;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;

	h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
			continue;

		spin_lock(&cur->bio_list_lock);

		/* Can we steal this cached rbio's pages? */
		if (bio_list_empty(&cur->bio_list) &&
		    list_empty(&cur->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
		    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
			list_del_init(&cur->hash_list);
			refcount_dec(&cur->refs);

			steal_rbio(cur, rbio);
			cache_drop = cur;
			spin_unlock(&cur->bio_list_lock);

			goto lockit;
		}

		/* Can we merge into the lock owner? */
		if (rbio_can_merge(cur, rbio)) {
			merge_rbio(cur, rbio);
			spin_unlock(&cur->bio_list_lock);
			freeit = rbio;
			ret = 1;
			goto out;
		}

		/*
		 * We couldn't merge with the running rbio, see if we can merge
		 * with the pending ones.  We don't have to check for rmw_locked
		 * because there is no way they are inside finish_rmw right now
		 */
		list_for_each_entry(pending, &cur->plug_list, plug_list) {
			if (rbio_can_merge(pending, rbio)) {
				merge_rbio(pending, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}
		}

		/*
		 * No merging, put us on the tail of the plug list, our rbio
		 * will be started when the currently running rbio unlocks
		 */
		list_add_tail(&rbio->plug_list, &cur->plug_list);
		spin_unlock(&cur->bio_list_lock);
		ret = 1;
		goto out;
	}
lockit:
	refcount_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}

/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		refcount_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
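		 *
		 * Depending on the next rbio's operation we may also let it
		 * steal our cached stripe pages (steal_rbio()) before queuing
		 * its worker.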
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			refcount_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				start_async_work(next, read_rebuild_work);
			else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				steal_rbio(rbio, next);
				start_async_work(next, read_rebuild_work);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_work);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_parity_work);
			}

			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	if (!refcount_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	btrfs_put_bbio(rbio->bbio);
	kfree(rbio);
}

static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
{
	struct bio *next;

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		cur->bi_status = err;
		bio_endio(cur);
		cur = next;
	}
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *extra;

	if (rbio->generic_bio_cnt)
		btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);

	/*
	 * At this moment, rbio->bio_list is empty, however since rbio does not
	 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
	 * hash list, rbio may be merged with others so that rbio->bio_list
	 * becomes non-empty.
	 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
	 * more and we can call bio_endio() on all queued bios.
	 */
	unlock_stripe(rbio);
	extra = bio_list_get(&rbio->bio_list);
	__free_raid_bio(rbio);

	rbio_endio_bio_list(cur, err);
	if (extra)
		rbio_endio_bio_list(extra, err);
}

/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;
	blk_status_t err = bio->bi_status;
	int max_errors;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = BLK_STS_OK;

	/* OK, we have read all the stripes we need to. */
	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
		     0 : rbio->bbio->max_errors;
	if (atomic_read(&rbio->error) > max_errors)
		err = BLK_STS_IOERR;

	rbio_orig_end_io(rbio, err);
}

/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else.  This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
				 int index, int pagenr, int bio_list_only)
{
	int chunk_page;
	struct page *p = NULL;

	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

	spin_lock_irq(&rbio->bio_list_lock);
	p = rbio->bio_pages[chunk_page];
	spin_unlock_irq(&rbio->bio_list_lock);

	if (p || bio_list_only)
		return p;

	return rbio->stripe_pages[chunk_page];
}

/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
}

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *bbio,
					 u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
	void *p;

	rbio = kzalloc(sizeof(*rbio) +
		       sizeof(*rbio->stripe_pages) * num_pages +
		       sizeof(*rbio->bio_pages) * num_pages +
		       sizeof(*rbio->finish_pointers) * real_stripes +
		       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
		       sizeof(*rbio->finish_pbitmap) *
				BITS_TO_LONGS(stripe_npages),
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bbio = bbio;
	rbio->fs_info = fs_info;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->faila = -1;
	rbio->failb = -1;
	refcount_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * the stripe_pages, bio_pages, etc arrays point to the extra
	 * memory we allocated past the end of the rbio
	 */
	p = rbio + 1;
#define CONSUME_ALLOC(ptr, count)	do {				\
		ptr = p;						\
		p = (unsigned char *)p + sizeof(*(ptr)) * (count);	\
	} while (0)
	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
	CONSUME_ALLOC(rbio->bio_pages, num_pages);
	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
#undef CONSUME_ALLOC

	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
		nr_data = real_stripes - 1;
	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
		nr_data = real_stripes - 2;
	else
		BUG();

	rbio->nr_data = nr_data;
	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);

	for (; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}

/*
 * add a single page from a specific stripe into our list of bios for IO
 * this will try to merge into existing bios if possible, and returns
 * zero if all went well.
 */
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
			    struct bio_list *bio_list,
			    struct page *page,
			    int stripe_nr,
			    unsigned long page_index,
			    unsigned long bio_max_len)
{
	struct bio *last = bio_list->tail;
	int ret;
	struct bio *bio;
	struct btrfs_bio_stripe *stripe;
	u64 disk_start;

	stripe = &rbio->bbio->stripes[stripe_nr];
	disk_start = stripe->physical + (page_index << PAGE_SHIFT);

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		u64 last_end = last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && !last->bi_status &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, page, PAGE_SIZE, 0);
			if (ret == PAGE_SIZE)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
	btrfs_io_bio(bio)->device = stripe->dev;
	bio->bi_iter.bi_size = 0;
	bio_set_dev(bio, stripe->dev->bdev);
	bio->bi_iter.bi_sector = disk_start >> 9;

	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio_list_add(bio_list, bio);
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}

/*
 * helper function to walk our bio list and populate the bio_pages array with
 * the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or stripe
 * reconstruction.
 *
 * This must be called before you trust the answers from page_in_rbio
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;
	u64 start;
	unsigned long stripe_offset;
	unsigned long page_index;

	spin_lock_irq(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list) {
		struct bio_vec bvec;
		struct bvec_iter iter;
		int i = 0;

		start = bio->bi_iter.bi_sector << 9;
		stripe_offset = start - rbio->bbio->raid_map[0];
		page_index = stripe_offset >> PAGE_SHIFT;

		if (bio_flagged(bio, BIO_CLONED))
			bio->bi_iter = btrfs_io_bio(bio)->iter;

		bio_for_each_segment(bvec, bio, iter) {
			rbio->bio_pages[page_index + i] = bvec.bv_page;
			i++;
		}
	}
	spin_unlock_irq(&rbio->bio_list_lock);
}

/*
 * this is called from one of two situations.  We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_bio *bbio = rbio->bbio;
	void **pointers = rbio->finish_pointers;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	bool has_qstripe;
	struct bio_list bio_list;
	struct bio *bio;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	/* at this point we either have a full stripe,
	 * or we've read the full stripe from the drive.
	 * recalculate the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	spin_lock_irq(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

	atomic_set(&rbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 *
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon.  If they do use it again,
	 * hopefully they will send another full bio.
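	 *
	 * Partial rbios, on the other hand, are cached below via
	 * cache_rbio_pages(), on the assumption that more writes to the
	 * same stripe will follow.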
	 */
	index_rbio_pages(rbio);
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
		struct page *p;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap_local_page(p);
		}

		/* then add the parity stripe */
		p = rbio_pstripe_page(rbio, pagenr);
		SetPageUptodate(p);
		pointers[stripe++] = kmap_local_page(p);

		if (has_qstripe) {

			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			p = rbio_qstripe_page(rbio, pagenr);
			SetPageUptodate(p);
			pointers[stripe++] = kmap_local_page(p);

			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
			copy_page(pointers[nr_data], pointers[0]);
			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
		}
		for (stripe = stripe - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list,
				       page, stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	if (likely(!bbio->num_tgtdevs))
		goto write_data;

	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		if (!bbio->tgtdev_map[stripe])
			continue;

		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list, page,
					       rbio->bbio->tgtdev_map[stripe],
					       pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

write_data:
	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);

	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_end_io;
		bio->bi_opf = REQ_OP_WRITE;

		submit_bio(bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}

/*
 * helper to find the stripe number for a given bio.  Used to figure out which
 * stripe has failed.  This expects the bio to correspond to a physical disk,
 * so it looks up based on physical sector numbers.
 */
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	u64 physical = bio->bi_iter.bi_sector;
	int i;
	struct btrfs_bio_stripe *stripe;

	physical <<= 9;

	for (i = 0; i < rbio->bbio->num_stripes; i++) {
		stripe = &rbio->bbio->stripes[i];
		if (in_range(physical, stripe->physical, rbio->stripe_len) &&
		    stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
			return i;
		}
	}
	return -1;
}

/*
 * helper to find the stripe number for a given
 * bio (before mapping).  Used to figure out which stripe has
 * failed.  This looks up based on logical block numbers.
 */
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				   struct bio *bio)
{
	u64 logical = bio->bi_iter.bi_sector << 9;
	int i;

	for (i = 0; i < rbio->nr_data; i++) {
		u64 stripe_start = rbio->bbio->raid_map[i];

		if (in_range(logical, stripe_start, rbio->stripe_len))
			return i;
	}
	return -1;
}

/*
 * returns -EIO if we had too many failures
 */
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);

	/* we already know this stripe is bad, move on */
	if (rbio->faila == failed || rbio->failb == failed)
		goto out;

	if (rbio->faila == -1) {
		/* first failure on this rbio */
		rbio->faila = failed;
		atomic_inc(&rbio->error);
	} else if (rbio->failb == -1) {
		/* second failure on this rbio */
		rbio->failb = failed;
		atomic_inc(&rbio->error);
	} else {
		ret = -EIO;
	}
out:
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * helper to fail a stripe based on a physical disk
 * bio.
 */
static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	int failed = find_bio_stripe(rbio, bio);

	if (failed < 0)
		return -EIO;

	return fail_rbio_index(rbio, failed);
}

/*
 * this sets each page in the bio uptodate.  It should only be used on private
 * rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct bio *bio)
{
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));

	bio_for_each_segment_all(bvec, bio, iter_all)
		SetPageUptodate(bvec->bv_page);
}

/*
 * end io for the read phase of the rmw cycle.  All the bios here are physical
 * stripe bios we've read from the disk so we can recalculate the parity of the
 * stripe.
 *
 * This will usually kick off finish_rmw once all the bios are read in, but it
 * may trigger parity reconstruction if we had any errors along the way
 */
static void raid_rmw_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		goto cleanup;

	/*
	 * this will normally call finish_rmw to start our write
	 * but if there are any failed stripes we'll reconstruct
	 * from parity first
	 */
	validate_rbio_for_rmw(rbio);
	return;

cleanup:

	rbio_orig_end_io(rbio, BLK_STS_IOERR);
}

/*
 * the stripe must be locked by the caller.  It will
 * unlock after all the writes are done
 */
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	index_rbio_pages(rbio);

	atomic_set(&rbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
			struct page *page;
			/*
			 * we want to find all the pages missing from
			 * the rbio and read them from the disk.  If
			 * page_in_rbio finds a page in the bio list
			 * we don't need to read it off the stripe.
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			/*
			 * the bio cache may have handed us an uptodate
			 * page.  If so, be happy and use it
			 */
			if (PageUptodate(page))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list, page,
				       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid_rmw_end_io;
		bio->bi_opf = REQ_OP_READ;

		btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);

		submit_bio(bio);
	}
	/* the actual write will happen once the reads are done */
	return 0;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return -EIO;

finish:
	validate_rbio_for_rmw(rbio);
	return 0;
}

/*
 * if the upper layers pass in a full stripe, we thank them by only allocating
 * enough pages to hold the parity, and sending it all down quickly.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = alloc_rbio_parity_pages(rbio);
	if (ret) {
		__free_raid_bio(rbio);
		return ret;
	}

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		finish_rmw(rbio);
	return 0;
}

/*
 * partial stripe writes get handed over to async helpers.
 * We're really hoping to merge a few more writes into this
 * rbio before calculating new parity
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		start_async_work(rbio, rmw_work);
	return 0;
}

/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come in to create
 * a full stripe.  So we do a check here to see if we can
 * go directly to finish_rmw
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	/* head off into rmw land if we don't have a full stripe */
	if (!rbio_is_full(rbio))
		return partial_stripe_write(rbio);
	return full_stripe_write(rbio);
}

/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
	struct btrfs_work work;
};

/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, const struct list_head *a,
		    const struct list_head *b)
{
	const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						       plug_list);
	const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						       plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}

static void run_plug(struct btrfs_plug_cb *plug)
{
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	/*
	 * sort our plug list then try to merge
	 * everything we can in hopes of creating full
	 * stripes.
	 */
	list_sort(NULL, &plug->rbio_list, plug_cmp);
	while (!list_empty(&plug->rbio_list)) {
		cur = list_entry(plug->rbio_list.next,
				 struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		if (rbio_is_full(cur)) {
			int ret;

			/* we have a full stripe, send it down */
			ret = full_stripe_write(cur);
			BUG_ON(ret);
			continue;
		}
		if (last) {
			if (rbio_can_merge(last, cur)) {
				merge_rbio(last, cur);
				__free_raid_bio(cur);
				continue;

			}
			__raid56_parity_write(last);
		}
		last = cur;
	}
	if (last) {
		__raid56_parity_write(last);
	}
	kfree(plug);
}

/*
 * if the unplug comes from schedule, we have to push the
 * work off to a helper thread
 */
static void unplug_work(struct btrfs_work *work)
{
	struct btrfs_plug_cb *plug;
	plug = container_of(work, struct btrfs_plug_cb, work);
	run_plug(plug);
}

static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug;
	plug = container_of(cb, struct btrfs_plug_cb, cb);

	if (from_schedule) {
		btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
		btrfs_queue_work(plug->info->rmw_workers,
				 &plug->work);
		return;
	}
	run_plug(plug);
}

/*
 * our main entry point for writes from the rest of the FS.
 */
int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
			struct btrfs_bio *bbio, u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	struct btrfs_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;
	int ret;

	rbio = alloc_rbio(fs_info, bbio, stripe_len);
	if (IS_ERR(rbio)) {
		btrfs_put_bbio(bbio);
		return PTR_ERR(rbio);
	}
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;
	rbio->operation = BTRFS_RBIO_WRITE;

	btrfs_bio_counter_inc_noblocked(fs_info);
	rbio->generic_bio_cnt = 1;

	/*
	 * don't plug on full rbios, just get them out the door
	 * as quickly as we can
	 */
	if (rbio_is_full(rbio)) {
		ret = full_stripe_write(rbio);
		if (ret)
			btrfs_bio_counter_dec(fs_info);
		return ret;
	}

	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
	if (cb) {
		plug = container_of(cb, struct btrfs_plug_cb, cb);
		if (!plug->info) {
			plug->info = fs_info;
			INIT_LIST_HEAD(&plug->rbio_list);
		}
		list_add_tail(&rbio->plug_list, &plug->rbio_list);
		ret = 0;
	} else {
		ret = __raid56_parity_write(rbio);
		if (ret)
			btrfs_bio_counter_dec(fs_info);
	}
	return ret;
}

/*
 * all parity reconstruction happens here.  We've read in everything
 * we can find from the drives and this does the heavy lifting of
 * sorting the good from the bad.
 */
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
	int pagenr, stripe;
	void **pointers;
	void **unmap_array;
	int faila = -1, failb = -1;
	struct page *page;
	blk_status_t err;
	int i;

	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers) {
		err = BLK_STS_RESOURCE;
		goto cleanup_io;
	}

	/*
	 * Store copy of pointers that does not get reordered during
	 * reconstruction so that kunmap_local works.
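	 * (pointers[] may be permuted while rebuilding; unmap_array[] keeps
	 * the original kmap_local_page() order for the kunmap_local() loop.)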
	 */
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!unmap_array) {
		err = BLK_STS_RESOURCE;
		goto cleanup_pointers;
	}

	faila = rbio->faila;
	failb = rbio->failb;

	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
		spin_lock_irq(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock_irq(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
		/*
		 * Now we just use bitmap to mark the horizontal stripes in
		 * which we have data when doing parity scrub.
		 */
		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
		    !test_bit(pagenr, rbio->dbitmap))
			continue;

		/*
		 * Setup our array of pointers with pages from each stripe
		 *
		 * NOTE: store a duplicate array of pointers to preserve the
		 * pointer order
		 */
		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
			     rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			pointers[stripe] = kmap_local_page(page);
			unmap_array[stripe] = pointers[stripe];
		}

		/* all raid6 handling here */
		if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
			/*
			 * single failure, rebuild from parity raid5
			 * style
			 */
			if (failb < 0) {
				if (faila == rbio->nr_data) {
					/*
					 * Just the P stripe has failed, without
					 * a bad data or Q stripe.
					 * TODO, we should redo the xor here.
					 */
					err = BLK_STS_IOERR;
					goto cleanup;
				}
				/*
				 * a single failure in raid6 is rebuilt
				 * in the pstripe code below
				 */
				goto pstripe;
			}

			/* make sure our ps and qs are in order */
			if (faila > failb)
				swap(faila, failb);

			/* if the q stripe is failed, do a pstripe reconstruction
			 * from the xors.
			 * If both the q stripe and the P stripe are failed, we're
			 * here due to a crc mismatch and we can't give them the
			 * data they want
			 */
			if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
				if (rbio->bbio->raid_map[faila] ==
				    RAID5_P_STRIPE) {
					err = BLK_STS_IOERR;
					goto cleanup;
				}
				/*
				 * otherwise we have one bad data stripe and
				 * a good P stripe.  raid5!
				 */
				goto pstripe;
			}

			if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
				raid6_datap_recov(rbio->real_stripes,
						  PAGE_SIZE, faila, pointers);
			} else {
				raid6_2data_recov(rbio->real_stripes,
						  PAGE_SIZE, faila, failb,
						  pointers);
			}
		} else {
			void *p;

			/* rebuild from P stripe here (raid5 or raid6) */
			BUG_ON(failb != -1);
pstripe:
			/* Copy parity block into failed block to start with */
			copy_page(pointers[faila], pointers[rbio->nr_data]);

			/* rearrange the pointer array */
			p = pointers[faila];
			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				pointers[stripe] = pointers[stripe + 1];
			pointers[rbio->nr_data - 1] = p;

			/* xor in the rest */
			run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
		}
		/* if we're doing this rebuild as part of an rmw, go through
		 * and set all of our private rbio pages in the
		 * failed stripes as uptodate.  This way finish_rmw will
		 * know they can be trusted.  If this was a read reconstruction,
		 * other endio functions will fiddle the uptodate bits
		 */
		if (rbio->operation == BTRFS_RBIO_WRITE) {
			for (i = 0; i < rbio->stripe_npages; i++) {
				if (faila != -1) {
					page = rbio_stripe_page(rbio, faila, i);
					SetPageUptodate(page);
				}
				if (failb != -1) {
					page = rbio_stripe_page(rbio, failb, i);
					SetPageUptodate(page);
				}
			}
		}
		for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
			kunmap_local(unmap_array[stripe]);
	}

	err = BLK_STS_OK;
cleanup:
	kfree(unmap_array);
cleanup_pointers:
	kfree(pointers);

cleanup_io:
	/*
	 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
	 * valid rbio which is consistent with ondisk content, thus such a
	 * valid rbio can be cached to avoid further disk reads.
	 */
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
		/*
		 * - In case of two failures, where rbio->failb != -1:
		 *
		 *   Do not cache this rbio since the above read reconstruction
		 *   (raid6_datap_recov() or raid6_2data_recov()) may have
		 *   changed some content of stripes which are not identical to
		 *   on-disk content any more, otherwise, a later write/recover
		 *   may steal stripe_pages from this rbio and end up with
		 *   corruptions or rebuild failures.
		 *
		 * - In case of single failure, where rbio->failb == -1:
		 *
		 *   Cache this rbio iff the above read reconstruction is
		 *   executed without problems.
		 */
		if (err == BLK_STS_OK && rbio->failb < 0)
			cache_rbio_pages(rbio);
		else
			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

		rbio_orig_end_io(rbio, err);
	} else if (err == BLK_STS_OK) {
		rbio->faila = -1;
		rbio->failb = -1;

		if (rbio->operation == BTRFS_RBIO_WRITE)
			finish_rmw(rbio);
		else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
			finish_parity_scrub(rbio, 0);
		else
			BUG();
	} else {
		rbio_orig_end_io(rbio, err);
	}
}

/*
 * This is called only for stripes we've read from disk to
 * reconstruct the parity.
 */
static void raid_recover_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/*
	 * we only read stripe pages off the disk, set them
	 * up to date if there were no errors
	 */
	if (bio->bi_status)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);
	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		rbio_orig_end_io(rbio, BLK_STS_IOERR);
	else
		__raid_recover_end_io(rbio);
}

/*
 * reads everything we need off the disk to reconstruct
 * the parity.  endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);

	/*
	 * read everything that hasn't failed.  Thanks to the
	 * stripe cache, it is possible that some or all of these
	 * pages are going to be uptodate.
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		if (rbio->faila == stripe || rbio->failb == stripe) {
			atomic_inc(&rbio->error);
			continue;
		}

		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
			struct page *p;

			/*
			 * the rmw code may have already read this
			 * page in
			 */
			p = rbio_stripe_page(rbio, stripe, pagenr);
			if (PageUptodate(p))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list,
				       rbio_stripe_page(rbio, stripe, pagenr),
				       stripe, pagenr, rbio->stripe_len);
			if (ret < 0)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * we might have no bios to read just because the pages
		 * were up to date, or we might have no bios to read because
		 * the devices were gone.
		 */
		if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
			__raid_recover_end_io(rbio);
			return 0;
		} else {
			goto cleanup;
		}
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid_recover_end_io;
		bio->bi_opf = REQ_OP_READ;

		btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);

		submit_bio(bio);
	}

	return 0;

cleanup:
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
		rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return -EIO;
}

/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
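 *
 * mirror_num 2 asks for a plain rebuild from the remaining stripes, while
 * mirror_num > 2 additionally marks one more stripe as failed so retries
 * can walk the other recovery combinations (see the loop-retry comment in
 * the function body).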
/*
 * This is the main entry point for reads from the higher layers.  It is only
 * called when the normal read path had a failure, so we assume the bio they
 * send down corresponds to a failed part of the drive.
 */
int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
			  struct btrfs_bio *bbio, u64 stripe_len,
			  int mirror_num, int generic_io)
{
	struct btrfs_raid_bio *rbio;
	int ret;

	if (generic_io) {
		ASSERT(bbio->mirror_num == mirror_num);
		btrfs_io_bio(bio)->mirror_num = mirror_num;
	}

	rbio = alloc_rbio(fs_info, bbio, stripe_len);
	if (IS_ERR(rbio)) {
		if (generic_io)
			btrfs_put_bbio(bbio);
		return PTR_ERR(rbio);
	}

	rbio->operation = BTRFS_RBIO_READ_REBUILD;
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		btrfs_warn(fs_info,
"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
			   __func__, bio->bi_iter.bi_sector << 9,
			   (u64)bio->bi_iter.bi_size, bbio->map_type);
		if (generic_io)
			btrfs_put_bbio(bbio);
		kfree(rbio);
		return -EIO;
	}

	if (generic_io) {
		btrfs_bio_counter_inc_noblocked(fs_info);
		rbio->generic_bio_cnt = 1;
	} else {
		btrfs_get_bbio(bbio);
	}

	/*
	 * Loop retry:
	 * for 'mirror == 2', reconstruct from all other stripes.
	 * for 'mirror_num > 2', select a stripe to fail on every retry.
	 */
	if (mirror_num > 2) {
		/*
		 * 'mirror == 3' is to fail the p stripe and
		 * reconstruct from the q stripe.  'mirror > 3' is to
		 * fail a data stripe and reconstruct from p+q stripe.
		 */
		rbio->failb = rbio->real_stripes - (mirror_num - 1);
		ASSERT(rbio->failb > 0);
		if (rbio->failb <= rbio->faila)
			rbio->failb--;
	}

	ret = lock_stripe_add(rbio);

	/*
	 * __raid56_parity_recover will end the bio with any errors it hits.
	 * We don't want to return its error value up the stack because our
	 * caller will end up calling bio_endio with any nonzero return.
	 */
	if (ret == 0)
		__raid56_parity_recover(rbio);
	/*
	 * Otherwise our rbio has been added to the list of rbios that will be
	 * handled after the current lock owner is done.
	 */
	return 0;
}

static void rmw_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}
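/*
 * Worked example of the mirror_num mapping above (layout assumed purely for
 * illustration): on a RAID6 full stripe with four data stripes plus P and Q,
 * real_stripes = 6.  mirror_num == 2 simply rebuilds the failed data stripe
 * from the remaining stripes; mirror_num == 3 sets failb = 6 - 2 = 4, the P
 * stripe, forcing the rebuild to use Q; mirror_num == 4 sets failb = 3, a
 * data stripe, forcing a two-failure P+Q reconstruction.  The trailing
 * adjustment shifts failb down by one whenever it lands at or below faila,
 * which keeps the two recorded failures distinct as mirror_num grows.
 */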
/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Caller must have already increased bio_counter for getting @bbio.
 *
 * Note: We need to make sure all the pages that are added to the scrub/replace
 * raid bio are correct and will not be changed during the scrub/replace, i.e.
 * those pages only hold metadata or file data with checksums.
 */

struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
			       struct btrfs_bio *bbio, u64 stripe_len,
			       struct btrfs_device *scrub_dev,
			       unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(fs_info, bbio, stripe_len);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/*
	 * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
	 * to the end position, so this search can start from the first parity
	 * stripe.
	 */
	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
		if (bbio->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}
	ASSERT(i < rbio->real_stripes);

	/* We currently only support the case where sectorsize == PAGE_SIZE */
	ASSERT(fs_info->sectorsize == PAGE_SIZE);
	ASSERT(rbio->stripe_npages == stripe_nsectors);
	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);

	/*
	 * We have already increased bio_counter when getting bbio, record it
	 * so we can free it at rbio_orig_end_io().
	 */
	rbio->generic_bio_cnt = 1;

	return rbio;
}

/* Used for both parity scrub and missing. */
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
			    u64 logical)
{
	int stripe_offset;
	int index;

	ASSERT(logical >= rbio->bbio->raid_map[0]);
	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
				rbio->stripe_len * rbio->nr_data);
	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
	index = stripe_offset >> PAGE_SHIFT;
	rbio->bio_pages[index] = page;
}
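/*
 * Worked example of the index math in raid56_add_scrub_pages() (values
 * assumed purely for illustration): with 4K pages, stripe_len = 64K and
 * nr_data = 2, a page whose logical address is raid_map[0] + 68K gets
 * stripe_offset = 68K and index = 68K >> PAGE_SHIFT = 17, i.e. the second
 * page of the second data stripe, since bio_pages[] is indexed linearly
 * across the data portion of the full stripe.
 */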
/*
 * We only scrub the parity for horizontal stripes where we have correct data,
 * so we don't need to allocate pages for all the stripes.
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int bit;
	int index;
	struct page *page;

	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
		for (i = 0; i < rbio->real_stripes; i++) {
			index = i * rbio->stripe_npages + bit;
			if (rbio->stripe_pages[index])
				continue;

			page = alloc_page(GFP_NOFS);
			if (!page)
				return -ENOMEM;
			rbio->stripe_pages[index] = page;
		}
	}
	return 0;
}

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check)
{
	struct btrfs_bio *bbio = rbio->bbio;
	void **pointers = rbio->finish_pointers;
	unsigned long *pbitmap = rbio->finish_pbitmap;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	bool has_qstripe;
	struct page *p_page = NULL;
	struct page *q_page = NULL;
	struct bio_list bio_list;
	struct bio *bio;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
		is_replace = 1;
		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
	}

	/*
	 * The higher layers (scrubber) are unlikely to use this area of the
	 * disk again soon, so don't cache it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	if (!need_check)
		goto writeback;

	p_page = alloc_page(GFP_NOFS);
	if (!p_page)
		goto cleanup;
	SetPageUptodate(p_page);

	if (has_qstripe) {
		/* RAID6, allocate and map temp space for the Q stripe */
		q_page = alloc_page(GFP_NOFS);
		if (!q_page) {
			__free_page(p_page);
			goto cleanup;
		}
		SetPageUptodate(q_page);
		pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
	}

	atomic_set(&rbio->error, 0);

	/* Map the parity stripe just once */
	pointers[nr_data] = kmap_local_page(p_page);

	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *p;
		void *parity;

		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap_local_page(p);
		}

		if (has_qstripe) {
			/* RAID6, call the library function to fill in our P/Q */
			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
			copy_page(pointers[nr_data], pointers[0]);
			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
		}

		/* Check scrubbing parity and repair it */
		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		parity = kmap_local_page(p);
		if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
			copy_page(parity, pointers[rbio->scrubp]);
		else
			/* Parity is right, no need to write it back */
			bitmap_clear(rbio->dbitmap, pagenr, 1);
		kunmap_local(parity);

		for (stripe = nr_data - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	kunmap_local(pointers[nr_data]);
	__free_page(p_page);
	if (q_page) {
		kunmap_local(pointers[rbio->real_stripes - 1]);
		__free_page(q_page);
	}
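	/*
	 * When need_check was set, every page set in dbitmap has had its
	 * parity recomputed from the data pages and compared against the
	 * on-disk copy on the scrub target; bits remain set only for pages
	 * whose parity mismatched and was repaired in memory, so the
	 * writeback below only touches those pages.
	 */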
writeback:
	/*
	 * Time to start writing.  Make bios for everything from the higher
	 * layers (the bio_list in our rbio) and our p/q.  Ignore everything
	 * else.
	 */
	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list,
			       page, rbio->scrubp, pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list, page,
				       bbio->tgtdev_map[rbio->scrubp],
				       pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

submit_write:
	nr_data = bio_list_size(&bio_list);
	if (!nr_data) {
		/* Every parity is right */
		rbio_orig_end_io(rbio, BLK_STS_OK);
		return;
	}

	atomic_set(&rbio->stripes_pending, nr_data);

	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_end_io;
		bio->bi_opf = REQ_OP_WRITE;

		submit_bio(bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}

static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}

/*
 * While we're doing the parity check and repair, we could have errors in
 * reading pages off the disk.  This checks for errors, and if we're not able
 * to read a page it will trigger parity reconstruction.  The parity scrub
 * will be finished after we've reconstructed the failed stripes.
 */
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		goto cleanup;

	if (rbio->faila >= 0 || rbio->failb >= 0) {
		int dfail = 0, failp = -1;

		if (is_data_stripe(rbio, rbio->faila))
			dfail++;
		else if (is_parity_stripe(rbio->faila))
			failp = rbio->faila;

		if (is_data_stripe(rbio, rbio->failb))
			dfail++;
		else if (is_parity_stripe(rbio->failb))
			failp = rbio->failb;

		/*
		 * We cannot use the parity stripe that is being scrubbed to
		 * repair data, so our repair capability is reduced by one:
		 * RAID5 cannot repair any data stripe, and RAID6 can tolerate
		 * at most one failed data stripe.
		 */
		if (dfail > rbio->bbio->max_errors - 1)
			goto cleanup;

		/*
		 * If all the data stripes are good and only parity failed,
		 * just rebuild and rewrite the parity.
		 */
		if (dfail == 0) {
			finish_parity_scrub(rbio, 0);
			return;
		}

		/*
		 * At this point we have one failed data stripe and one failed
		 * parity stripe (RAID6).  If the failed parity is the one we
		 * are scrubbing, we can use the other parity to repair the
		 * data; otherwise the data stripe cannot be repaired.
		 */
		if (failp != rbio->scrubp)
			goto cleanup;

		__raid_recover_end_io(rbio);
	} else {
		finish_parity_scrub(rbio, 1);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);
}
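/*
 * Rough decision summary for validate_rbio_for_parity_scrub() above
 * (assuming max_errors is 1 for RAID5 and 2 for RAID6):
 *  - more failures than max_errors: end the rbio with an I/O error;
 *  - no failed stripes: recompute, verify and repair the parity
 *    (finish_parity_scrub() with need_check set);
 *  - only parity stripes failed: rewrite the parity without re-checking;
 *  - one failed data stripe whose companion failure is the parity being
 *    scrubbed (RAID6): rebuild the data from the remaining parity first,
 *    then finish the scrub;
 *  - anything else: end the rbio with an I/O error.
 */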
/*
 * This is the end io function for the read phase of the parity scrub.  All
 * the bios here are physical stripe bios we've read from the disk so we can
 * recalculate the parity of the stripe.
 *
 * This will usually kick off the parity check and writeback (via
 * validate_rbio_for_parity_scrub()) once all the bios are read in, but it may
 * trigger parity reconstruction if we had any errors along the way.
 */
static void raid56_parity_scrub_end_io(struct bio *bio)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (bio->bi_status)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	/*
	 * This will normally call finish_parity_scrub() to start our write,
	 * but if there are any failed stripes we'll reconstruct from parity
	 * first.
	 */
	validate_rbio_for_parity_scrub(rbio);
}

static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
			struct page *page;
			/*
			 * we want to find all the pages missing from
			 * the rbio and read them from the disk.  If
			 * page_in_rbio finds a page in the bio list
			 * we don't need to read it off the stripe.
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			/*
			 * the bio cache may have handed us an uptodate
			 * page.  If so, be happy and use it
			 */
			if (PageUptodate(page))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list, page,
					       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * the bbio may be freed once we submit the last bio. Make sure
	 * not to touch it after that
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid56_parity_scrub_end_io;
		bio->bi_opf = REQ_OP_READ;

		btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);

		submit_bio(bio);
	}
	/* the actual write will happen once the reads are done */
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return;

finish:
	validate_rbio_for_parity_scrub(rbio);
}

static void scrub_parity_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_parity_scrub_stripe(rbio);
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, scrub_parity_work);
}
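/*
 * Putting the scrub pieces together: raid56_parity_alloc_scrub_rbio() builds
 * the rbio and records which parity stripe is being scrubbed,
 * raid56_add_scrub_pages() attaches the caller's verified data pages,
 * raid56_parity_submit_scrub_rbio() takes the stripe lock and queues
 * scrub_parity_work(), which reads whatever is still missing and then runs
 * validate_rbio_for_parity_scrub() / finish_parity_scrub() to check and, if
 * needed, rewrite the parity (and its dev-replace target).
 */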
/*
 * The following code is used for dev replace of a missing RAID 5/6 device.
 */

struct btrfs_raid_bio *
raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
			  struct btrfs_bio *bbio, u64 length)
{
	struct btrfs_raid_bio *rbio;

	rbio = alloc_rbio(fs_info, bbio, length);
	if (IS_ERR(rbio))
		return NULL;

	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		BUG();
		kfree(rbio);
		return NULL;
	}

	/*
	 * When we get bbio, we have already increased bio_counter, record it
	 * so we can free it at rbio_orig_end_io().
	 */
	rbio->generic_bio_cnt = 1;

	return rbio;
}

void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, read_rebuild_work);
}
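/*
 * Minimal usage sketch for the missing-device rebuild path above (a sketch
 * only: it assumes the caller has already mapped the full stripe, holds a
 * bio_counter reference for @bbio, and does its own error handling; names
 * such as page_logical are placeholders):
 *
 *	rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
 *	if (!rbio)
 *		return -ENOMEM;
 *	// for each page of verified data already read from the stripe:
 *	//	raid56_add_scrub_pages(rbio, page, page_logical);
 *	raid56_submit_missing_rbio(rbio);
 */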