1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2012 Fusion-io All rights reserved. 4 * Copyright (C) 2012 Intel Corp. All rights reserved. 5 */ 6 7 #include <linux/sched.h> 8 #include <linux/bio.h> 9 #include <linux/slab.h> 10 #include <linux/blkdev.h> 11 #include <linux/raid/pq.h> 12 #include <linux/hash.h> 13 #include <linux/list_sort.h> 14 #include <linux/raid/xor.h> 15 #include <linux/mm.h> 16 #include "ctree.h" 17 #include "disk-io.h" 18 #include "volumes.h" 19 #include "raid56.h" 20 #include "async-thread.h" 21 22 /* set when additional merges to this rbio are not allowed */ 23 #define RBIO_RMW_LOCKED_BIT 1 24 25 /* 26 * set when this rbio is sitting in the hash, but it is just a cache 27 * of past RMW 28 */ 29 #define RBIO_CACHE_BIT 2 30 31 /* 32 * set when it is safe to trust the stripe_pages for caching 33 */ 34 #define RBIO_CACHE_READY_BIT 3 35 36 #define RBIO_CACHE_SIZE 1024 37 38 #define BTRFS_STRIPE_HASH_TABLE_BITS 11 39 40 /* Used by the raid56 code to lock stripes for read/modify/write */ 41 struct btrfs_stripe_hash { 42 struct list_head hash_list; 43 spinlock_t lock; 44 }; 45 46 /* Used by the raid56 code to lock stripes for read/modify/write */ 47 struct btrfs_stripe_hash_table { 48 struct list_head stripe_cache; 49 spinlock_t cache_lock; 50 int cache_size; 51 struct btrfs_stripe_hash table[]; 52 }; 53 54 enum btrfs_rbio_ops { 55 BTRFS_RBIO_WRITE, 56 BTRFS_RBIO_READ_REBUILD, 57 BTRFS_RBIO_PARITY_SCRUB, 58 BTRFS_RBIO_REBUILD_MISSING, 59 }; 60 61 struct btrfs_raid_bio { 62 struct btrfs_fs_info *fs_info; 63 struct btrfs_bio *bbio; 64 65 /* while we're doing rmw on a stripe 66 * we put it into a hash table so we can 67 * lock the stripe and merge more rbios 68 * into it. 69 */ 70 struct list_head hash_list; 71 72 /* 73 * LRU list for the stripe cache 74 */ 75 struct list_head stripe_cache; 76 77 /* 78 * for scheduling work in the helper threads 79 */ 80 struct btrfs_work work; 81 82 /* 83 * bio list and bio_list_lock are used 84 * to add more bios into the stripe 85 * in hopes of avoiding the full rmw 86 */ 87 struct bio_list bio_list; 88 spinlock_t bio_list_lock; 89 90 /* also protected by the bio_list_lock, the 91 * plug list is used by the plugging code 92 * to collect partial bios while plugged. The 93 * stripe locking code also uses it to hand off 94 * the stripe lock to the next pending IO 95 */ 96 struct list_head plug_list; 97 98 /* 99 * flags that tell us if it is safe to 100 * merge with this bio 101 */ 102 unsigned long flags; 103 104 /* size of each individual stripe on disk */ 105 int stripe_len; 106 107 /* number of data stripes (no p/q) */ 108 int nr_data; 109 110 int real_stripes; 111 112 int stripe_npages; 113 /* 114 * set if we're doing a parity rebuild 115 * for a read from higher up, which is handled 116 * differently from a parity rebuild as part of 117 * rmw 118 */ 119 enum btrfs_rbio_ops operation; 120 121 /* first bad stripe */ 122 int faila; 123 124 /* second bad stripe (for raid6 use) */ 125 int failb; 126 127 int scrubp; 128 /* 129 * number of pages needed to represent the full 130 * stripe 131 */ 132 int nr_pages; 133 134 /* 135 * size of all the bios in the bio_list. This 136 * helps us decide if the rbio maps to a full 137 * stripe or not 138 */ 139 int bio_list_bytes; 140 141 int generic_bio_cnt; 142 143 refcount_t refs; 144 145 atomic_t stripes_pending; 146 147 atomic_t error; 148 /* 149 * these are two arrays of pointers. We allocate the 150 * rbio big enough to hold them both and setup their 151 * locations when the rbio is allocated 152 */ 153 154 /* pointers to pages that we allocated for 155 * reading/writing stripes directly from the disk (including P/Q) 156 */ 157 struct page **stripe_pages; 158 159 /* 160 * pointers to the pages in the bio_list. Stored 161 * here for faster lookup 162 */ 163 struct page **bio_pages; 164 165 /* 166 * bitmap to record which horizontal stripe has data 167 */ 168 unsigned long *dbitmap; 169 170 /* allocated with real_stripes-many pointers for finish_*() calls */ 171 void **finish_pointers; 172 173 /* allocated with stripe_npages-many bits for finish_*() calls */ 174 unsigned long *finish_pbitmap; 175 }; 176 177 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 178 static noinline void finish_rmw(struct btrfs_raid_bio *rbio); 179 static void rmw_work(struct btrfs_work *work); 180 static void read_rebuild_work(struct btrfs_work *work); 181 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); 182 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); 183 static void __free_raid_bio(struct btrfs_raid_bio *rbio); 184 static void index_rbio_pages(struct btrfs_raid_bio *rbio); 185 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 186 187 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 188 int need_check); 189 static void scrub_parity_work(struct btrfs_work *work); 190 191 static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) 192 { 193 btrfs_init_work(&rbio->work, work_func, NULL, NULL); 194 btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); 195 } 196 197 /* 198 * the stripe hash table is used for locking, and to collect 199 * bios in hopes of making a full stripe 200 */ 201 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 202 { 203 struct btrfs_stripe_hash_table *table; 204 struct btrfs_stripe_hash_table *x; 205 struct btrfs_stripe_hash *cur; 206 struct btrfs_stripe_hash *h; 207 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; 208 int i; 209 210 if (info->stripe_hash_table) 211 return 0; 212 213 /* 214 * The table is large, starting with order 4 and can go as high as 215 * order 7 in case lock debugging is turned on. 216 * 217 * Try harder to allocate and fallback to vmalloc to lower the chance 218 * of a failing mount. 219 */ 220 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); 221 if (!table) 222 return -ENOMEM; 223 224 spin_lock_init(&table->cache_lock); 225 INIT_LIST_HEAD(&table->stripe_cache); 226 227 h = table->table; 228 229 for (i = 0; i < num_entries; i++) { 230 cur = h + i; 231 INIT_LIST_HEAD(&cur->hash_list); 232 spin_lock_init(&cur->lock); 233 } 234 235 x = cmpxchg(&info->stripe_hash_table, NULL, table); 236 kvfree(x); 237 return 0; 238 } 239 240 /* 241 * caching an rbio means to copy anything from the 242 * bio_pages array into the stripe_pages array. We 243 * use the page uptodate bit in the stripe cache array 244 * to indicate if it has valid data 245 * 246 * once the caching is done, we set the cache ready 247 * bit. 248 */ 249 static void cache_rbio_pages(struct btrfs_raid_bio *rbio) 250 { 251 int i; 252 char *s; 253 char *d; 254 int ret; 255 256 ret = alloc_rbio_pages(rbio); 257 if (ret) 258 return; 259 260 for (i = 0; i < rbio->nr_pages; i++) { 261 if (!rbio->bio_pages[i]) 262 continue; 263 264 s = kmap(rbio->bio_pages[i]); 265 d = kmap(rbio->stripe_pages[i]); 266 267 copy_page(d, s); 268 269 kunmap(rbio->bio_pages[i]); 270 kunmap(rbio->stripe_pages[i]); 271 SetPageUptodate(rbio->stripe_pages[i]); 272 } 273 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 274 } 275 276 /* 277 * we hash on the first logical address of the stripe 278 */ 279 static int rbio_bucket(struct btrfs_raid_bio *rbio) 280 { 281 u64 num = rbio->bbio->raid_map[0]; 282 283 /* 284 * we shift down quite a bit. We're using byte 285 * addressing, and most of the lower bits are zeros. 286 * This tends to upset hash_64, and it consistently 287 * returns just one or two different values. 288 * 289 * shifting off the lower bits fixes things. 290 */ 291 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 292 } 293 294 /* 295 * stealing an rbio means taking all the uptodate pages from the stripe 296 * array in the source rbio and putting them into the destination rbio 297 */ 298 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) 299 { 300 int i; 301 struct page *s; 302 struct page *d; 303 304 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) 305 return; 306 307 for (i = 0; i < dest->nr_pages; i++) { 308 s = src->stripe_pages[i]; 309 if (!s || !PageUptodate(s)) { 310 continue; 311 } 312 313 d = dest->stripe_pages[i]; 314 if (d) 315 __free_page(d); 316 317 dest->stripe_pages[i] = s; 318 src->stripe_pages[i] = NULL; 319 } 320 } 321 322 /* 323 * merging means we take the bio_list from the victim and 324 * splice it into the destination. The victim should 325 * be discarded afterwards. 326 * 327 * must be called with dest->rbio_list_lock held 328 */ 329 static void merge_rbio(struct btrfs_raid_bio *dest, 330 struct btrfs_raid_bio *victim) 331 { 332 bio_list_merge(&dest->bio_list, &victim->bio_list); 333 dest->bio_list_bytes += victim->bio_list_bytes; 334 dest->generic_bio_cnt += victim->generic_bio_cnt; 335 bio_list_init(&victim->bio_list); 336 } 337 338 /* 339 * used to prune items that are in the cache. The caller 340 * must hold the hash table lock. 341 */ 342 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 343 { 344 int bucket = rbio_bucket(rbio); 345 struct btrfs_stripe_hash_table *table; 346 struct btrfs_stripe_hash *h; 347 int freeit = 0; 348 349 /* 350 * check the bit again under the hash table lock. 351 */ 352 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 353 return; 354 355 table = rbio->fs_info->stripe_hash_table; 356 h = table->table + bucket; 357 358 /* hold the lock for the bucket because we may be 359 * removing it from the hash table 360 */ 361 spin_lock(&h->lock); 362 363 /* 364 * hold the lock for the bio list because we need 365 * to make sure the bio list is empty 366 */ 367 spin_lock(&rbio->bio_list_lock); 368 369 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { 370 list_del_init(&rbio->stripe_cache); 371 table->cache_size -= 1; 372 freeit = 1; 373 374 /* if the bio list isn't empty, this rbio is 375 * still involved in an IO. We take it out 376 * of the cache list, and drop the ref that 377 * was held for the list. 378 * 379 * If the bio_list was empty, we also remove 380 * the rbio from the hash_table, and drop 381 * the corresponding ref 382 */ 383 if (bio_list_empty(&rbio->bio_list)) { 384 if (!list_empty(&rbio->hash_list)) { 385 list_del_init(&rbio->hash_list); 386 refcount_dec(&rbio->refs); 387 BUG_ON(!list_empty(&rbio->plug_list)); 388 } 389 } 390 } 391 392 spin_unlock(&rbio->bio_list_lock); 393 spin_unlock(&h->lock); 394 395 if (freeit) 396 __free_raid_bio(rbio); 397 } 398 399 /* 400 * prune a given rbio from the cache 401 */ 402 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 403 { 404 struct btrfs_stripe_hash_table *table; 405 unsigned long flags; 406 407 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 408 return; 409 410 table = rbio->fs_info->stripe_hash_table; 411 412 spin_lock_irqsave(&table->cache_lock, flags); 413 __remove_rbio_from_cache(rbio); 414 spin_unlock_irqrestore(&table->cache_lock, flags); 415 } 416 417 /* 418 * remove everything in the cache 419 */ 420 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) 421 { 422 struct btrfs_stripe_hash_table *table; 423 unsigned long flags; 424 struct btrfs_raid_bio *rbio; 425 426 table = info->stripe_hash_table; 427 428 spin_lock_irqsave(&table->cache_lock, flags); 429 while (!list_empty(&table->stripe_cache)) { 430 rbio = list_entry(table->stripe_cache.next, 431 struct btrfs_raid_bio, 432 stripe_cache); 433 __remove_rbio_from_cache(rbio); 434 } 435 spin_unlock_irqrestore(&table->cache_lock, flags); 436 } 437 438 /* 439 * remove all cached entries and free the hash table 440 * used by unmount 441 */ 442 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 443 { 444 if (!info->stripe_hash_table) 445 return; 446 btrfs_clear_rbio_cache(info); 447 kvfree(info->stripe_hash_table); 448 info->stripe_hash_table = NULL; 449 } 450 451 /* 452 * insert an rbio into the stripe cache. It 453 * must have already been prepared by calling 454 * cache_rbio_pages 455 * 456 * If this rbio was already cached, it gets 457 * moved to the front of the lru. 458 * 459 * If the size of the rbio cache is too big, we 460 * prune an item. 461 */ 462 static void cache_rbio(struct btrfs_raid_bio *rbio) 463 { 464 struct btrfs_stripe_hash_table *table; 465 unsigned long flags; 466 467 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) 468 return; 469 470 table = rbio->fs_info->stripe_hash_table; 471 472 spin_lock_irqsave(&table->cache_lock, flags); 473 spin_lock(&rbio->bio_list_lock); 474 475 /* bump our ref if we were not in the list before */ 476 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) 477 refcount_inc(&rbio->refs); 478 479 if (!list_empty(&rbio->stripe_cache)){ 480 list_move(&rbio->stripe_cache, &table->stripe_cache); 481 } else { 482 list_add(&rbio->stripe_cache, &table->stripe_cache); 483 table->cache_size += 1; 484 } 485 486 spin_unlock(&rbio->bio_list_lock); 487 488 if (table->cache_size > RBIO_CACHE_SIZE) { 489 struct btrfs_raid_bio *found; 490 491 found = list_entry(table->stripe_cache.prev, 492 struct btrfs_raid_bio, 493 stripe_cache); 494 495 if (found != rbio) 496 __remove_rbio_from_cache(found); 497 } 498 499 spin_unlock_irqrestore(&table->cache_lock, flags); 500 } 501 502 /* 503 * helper function to run the xor_blocks api. It is only 504 * able to do MAX_XOR_BLOCKS at a time, so we need to 505 * loop through. 506 */ 507 static void run_xor(void **pages, int src_cnt, ssize_t len) 508 { 509 int src_off = 0; 510 int xor_src_cnt = 0; 511 void *dest = pages[src_cnt]; 512 513 while(src_cnt > 0) { 514 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); 515 xor_blocks(xor_src_cnt, len, dest, pages + src_off); 516 517 src_cnt -= xor_src_cnt; 518 src_off += xor_src_cnt; 519 } 520 } 521 522 /* 523 * Returns true if the bio list inside this rbio covers an entire stripe (no 524 * rmw required). 525 */ 526 static int rbio_is_full(struct btrfs_raid_bio *rbio) 527 { 528 unsigned long flags; 529 unsigned long size = rbio->bio_list_bytes; 530 int ret = 1; 531 532 spin_lock_irqsave(&rbio->bio_list_lock, flags); 533 if (size != rbio->nr_data * rbio->stripe_len) 534 ret = 0; 535 BUG_ON(size > rbio->nr_data * rbio->stripe_len); 536 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 537 538 return ret; 539 } 540 541 /* 542 * returns 1 if it is safe to merge two rbios together. 543 * The merging is safe if the two rbios correspond to 544 * the same stripe and if they are both going in the same 545 * direction (read vs write), and if neither one is 546 * locked for final IO 547 * 548 * The caller is responsible for locking such that 549 * rmw_locked is safe to test 550 */ 551 static int rbio_can_merge(struct btrfs_raid_bio *last, 552 struct btrfs_raid_bio *cur) 553 { 554 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || 555 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) 556 return 0; 557 558 /* 559 * we can't merge with cached rbios, since the 560 * idea is that when we merge the destination 561 * rbio is going to run our IO for us. We can 562 * steal from cached rbios though, other functions 563 * handle that. 564 */ 565 if (test_bit(RBIO_CACHE_BIT, &last->flags) || 566 test_bit(RBIO_CACHE_BIT, &cur->flags)) 567 return 0; 568 569 if (last->bbio->raid_map[0] != 570 cur->bbio->raid_map[0]) 571 return 0; 572 573 /* we can't merge with different operations */ 574 if (last->operation != cur->operation) 575 return 0; 576 /* 577 * We've need read the full stripe from the drive. 578 * check and repair the parity and write the new results. 579 * 580 * We're not allowed to add any new bios to the 581 * bio list here, anyone else that wants to 582 * change this stripe needs to do their own rmw. 583 */ 584 if (last->operation == BTRFS_RBIO_PARITY_SCRUB) 585 return 0; 586 587 if (last->operation == BTRFS_RBIO_REBUILD_MISSING) 588 return 0; 589 590 if (last->operation == BTRFS_RBIO_READ_REBUILD) { 591 int fa = last->faila; 592 int fb = last->failb; 593 int cur_fa = cur->faila; 594 int cur_fb = cur->failb; 595 596 if (last->faila >= last->failb) { 597 fa = last->failb; 598 fb = last->faila; 599 } 600 601 if (cur->faila >= cur->failb) { 602 cur_fa = cur->failb; 603 cur_fb = cur->faila; 604 } 605 606 if (fa != cur_fa || fb != cur_fb) 607 return 0; 608 } 609 return 1; 610 } 611 612 static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, 613 int index) 614 { 615 return stripe * rbio->stripe_npages + index; 616 } 617 618 /* 619 * these are just the pages from the rbio array, not from anything 620 * the FS sent down to us 621 */ 622 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, 623 int index) 624 { 625 return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; 626 } 627 628 /* 629 * helper to index into the pstripe 630 */ 631 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) 632 { 633 return rbio_stripe_page(rbio, rbio->nr_data, index); 634 } 635 636 /* 637 * helper to index into the qstripe, returns null 638 * if there is no qstripe 639 */ 640 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 641 { 642 if (rbio->nr_data + 1 == rbio->real_stripes) 643 return NULL; 644 return rbio_stripe_page(rbio, rbio->nr_data + 1, index); 645 } 646 647 /* 648 * The first stripe in the table for a logical address 649 * has the lock. rbios are added in one of three ways: 650 * 651 * 1) Nobody has the stripe locked yet. The rbio is given 652 * the lock and 0 is returned. The caller must start the IO 653 * themselves. 654 * 655 * 2) Someone has the stripe locked, but we're able to merge 656 * with the lock owner. The rbio is freed and the IO will 657 * start automatically along with the existing rbio. 1 is returned. 658 * 659 * 3) Someone has the stripe locked, but we're not able to merge. 660 * The rbio is added to the lock owner's plug list, or merged into 661 * an rbio already on the plug list. When the lock owner unlocks, 662 * the next rbio on the list is run and the IO is started automatically. 663 * 1 is returned 664 * 665 * If we return 0, the caller still owns the rbio and must continue with 666 * IO submission. If we return 1, the caller must assume the rbio has 667 * already been freed. 668 */ 669 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 670 { 671 struct btrfs_stripe_hash *h; 672 struct btrfs_raid_bio *cur; 673 struct btrfs_raid_bio *pending; 674 unsigned long flags; 675 struct btrfs_raid_bio *freeit = NULL; 676 struct btrfs_raid_bio *cache_drop = NULL; 677 int ret = 0; 678 679 h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio); 680 681 spin_lock_irqsave(&h->lock, flags); 682 list_for_each_entry(cur, &h->hash_list, hash_list) { 683 if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0]) 684 continue; 685 686 spin_lock(&cur->bio_list_lock); 687 688 /* Can we steal this cached rbio's pages? */ 689 if (bio_list_empty(&cur->bio_list) && 690 list_empty(&cur->plug_list) && 691 test_bit(RBIO_CACHE_BIT, &cur->flags) && 692 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { 693 list_del_init(&cur->hash_list); 694 refcount_dec(&cur->refs); 695 696 steal_rbio(cur, rbio); 697 cache_drop = cur; 698 spin_unlock(&cur->bio_list_lock); 699 700 goto lockit; 701 } 702 703 /* Can we merge into the lock owner? */ 704 if (rbio_can_merge(cur, rbio)) { 705 merge_rbio(cur, rbio); 706 spin_unlock(&cur->bio_list_lock); 707 freeit = rbio; 708 ret = 1; 709 goto out; 710 } 711 712 713 /* 714 * We couldn't merge with the running rbio, see if we can merge 715 * with the pending ones. We don't have to check for rmw_locked 716 * because there is no way they are inside finish_rmw right now 717 */ 718 list_for_each_entry(pending, &cur->plug_list, plug_list) { 719 if (rbio_can_merge(pending, rbio)) { 720 merge_rbio(pending, rbio); 721 spin_unlock(&cur->bio_list_lock); 722 freeit = rbio; 723 ret = 1; 724 goto out; 725 } 726 } 727 728 /* 729 * No merging, put us on the tail of the plug list, our rbio 730 * will be started with the currently running rbio unlocks 731 */ 732 list_add_tail(&rbio->plug_list, &cur->plug_list); 733 spin_unlock(&cur->bio_list_lock); 734 ret = 1; 735 goto out; 736 } 737 lockit: 738 refcount_inc(&rbio->refs); 739 list_add(&rbio->hash_list, &h->hash_list); 740 out: 741 spin_unlock_irqrestore(&h->lock, flags); 742 if (cache_drop) 743 remove_rbio_from_cache(cache_drop); 744 if (freeit) 745 __free_raid_bio(freeit); 746 return ret; 747 } 748 749 /* 750 * called as rmw or parity rebuild is completed. If the plug list has more 751 * rbios waiting for this stripe, the next one on the list will be started 752 */ 753 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 754 { 755 int bucket; 756 struct btrfs_stripe_hash *h; 757 unsigned long flags; 758 int keep_cache = 0; 759 760 bucket = rbio_bucket(rbio); 761 h = rbio->fs_info->stripe_hash_table->table + bucket; 762 763 if (list_empty(&rbio->plug_list)) 764 cache_rbio(rbio); 765 766 spin_lock_irqsave(&h->lock, flags); 767 spin_lock(&rbio->bio_list_lock); 768 769 if (!list_empty(&rbio->hash_list)) { 770 /* 771 * if we're still cached and there is no other IO 772 * to perform, just leave this rbio here for others 773 * to steal from later 774 */ 775 if (list_empty(&rbio->plug_list) && 776 test_bit(RBIO_CACHE_BIT, &rbio->flags)) { 777 keep_cache = 1; 778 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 779 BUG_ON(!bio_list_empty(&rbio->bio_list)); 780 goto done; 781 } 782 783 list_del_init(&rbio->hash_list); 784 refcount_dec(&rbio->refs); 785 786 /* 787 * we use the plug list to hold all the rbios 788 * waiting for the chance to lock this stripe. 789 * hand the lock over to one of them. 790 */ 791 if (!list_empty(&rbio->plug_list)) { 792 struct btrfs_raid_bio *next; 793 struct list_head *head = rbio->plug_list.next; 794 795 next = list_entry(head, struct btrfs_raid_bio, 796 plug_list); 797 798 list_del_init(&rbio->plug_list); 799 800 list_add(&next->hash_list, &h->hash_list); 801 refcount_inc(&next->refs); 802 spin_unlock(&rbio->bio_list_lock); 803 spin_unlock_irqrestore(&h->lock, flags); 804 805 if (next->operation == BTRFS_RBIO_READ_REBUILD) 806 start_async_work(next, read_rebuild_work); 807 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { 808 steal_rbio(rbio, next); 809 start_async_work(next, read_rebuild_work); 810 } else if (next->operation == BTRFS_RBIO_WRITE) { 811 steal_rbio(rbio, next); 812 start_async_work(next, rmw_work); 813 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { 814 steal_rbio(rbio, next); 815 start_async_work(next, scrub_parity_work); 816 } 817 818 goto done_nolock; 819 } 820 } 821 done: 822 spin_unlock(&rbio->bio_list_lock); 823 spin_unlock_irqrestore(&h->lock, flags); 824 825 done_nolock: 826 if (!keep_cache) 827 remove_rbio_from_cache(rbio); 828 } 829 830 static void __free_raid_bio(struct btrfs_raid_bio *rbio) 831 { 832 int i; 833 834 if (!refcount_dec_and_test(&rbio->refs)) 835 return; 836 837 WARN_ON(!list_empty(&rbio->stripe_cache)); 838 WARN_ON(!list_empty(&rbio->hash_list)); 839 WARN_ON(!bio_list_empty(&rbio->bio_list)); 840 841 for (i = 0; i < rbio->nr_pages; i++) { 842 if (rbio->stripe_pages[i]) { 843 __free_page(rbio->stripe_pages[i]); 844 rbio->stripe_pages[i] = NULL; 845 } 846 } 847 848 btrfs_put_bbio(rbio->bbio); 849 kfree(rbio); 850 } 851 852 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) 853 { 854 struct bio *next; 855 856 while (cur) { 857 next = cur->bi_next; 858 cur->bi_next = NULL; 859 cur->bi_status = err; 860 bio_endio(cur); 861 cur = next; 862 } 863 } 864 865 /* 866 * this frees the rbio and runs through all the bios in the 867 * bio_list and calls end_io on them 868 */ 869 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) 870 { 871 struct bio *cur = bio_list_get(&rbio->bio_list); 872 struct bio *extra; 873 874 if (rbio->generic_bio_cnt) 875 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); 876 877 /* 878 * At this moment, rbio->bio_list is empty, however since rbio does not 879 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the 880 * hash list, rbio may be merged with others so that rbio->bio_list 881 * becomes non-empty. 882 * Once unlock_stripe() is done, rbio->bio_list will not be updated any 883 * more and we can call bio_endio() on all queued bios. 884 */ 885 unlock_stripe(rbio); 886 extra = bio_list_get(&rbio->bio_list); 887 __free_raid_bio(rbio); 888 889 rbio_endio_bio_list(cur, err); 890 if (extra) 891 rbio_endio_bio_list(extra, err); 892 } 893 894 /* 895 * end io function used by finish_rmw. When we finally 896 * get here, we've written a full stripe 897 */ 898 static void raid_write_end_io(struct bio *bio) 899 { 900 struct btrfs_raid_bio *rbio = bio->bi_private; 901 blk_status_t err = bio->bi_status; 902 int max_errors; 903 904 if (err) 905 fail_bio_stripe(rbio, bio); 906 907 bio_put(bio); 908 909 if (!atomic_dec_and_test(&rbio->stripes_pending)) 910 return; 911 912 err = BLK_STS_OK; 913 914 /* OK, we have read all the stripes we need to. */ 915 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 916 0 : rbio->bbio->max_errors; 917 if (atomic_read(&rbio->error) > max_errors) 918 err = BLK_STS_IOERR; 919 920 rbio_orig_end_io(rbio, err); 921 } 922 923 /* 924 * the read/modify/write code wants to use the original bio for 925 * any pages it included, and then use the rbio for everything 926 * else. This function decides if a given index (stripe number) 927 * and page number in that stripe fall inside the original bio 928 * or the rbio. 929 * 930 * if you set bio_list_only, you'll get a NULL back for any ranges 931 * that are outside the bio_list 932 * 933 * This doesn't take any refs on anything, you get a bare page pointer 934 * and the caller must bump refs as required. 935 * 936 * You must call index_rbio_pages once before you can trust 937 * the answers from this function. 938 */ 939 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, 940 int index, int pagenr, int bio_list_only) 941 { 942 int chunk_page; 943 struct page *p = NULL; 944 945 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; 946 947 spin_lock_irq(&rbio->bio_list_lock); 948 p = rbio->bio_pages[chunk_page]; 949 spin_unlock_irq(&rbio->bio_list_lock); 950 951 if (p || bio_list_only) 952 return p; 953 954 return rbio->stripe_pages[chunk_page]; 955 } 956 957 /* 958 * number of pages we need for the entire stripe across all the 959 * drives 960 */ 961 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) 962 { 963 return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes; 964 } 965 966 /* 967 * allocation and initial setup for the btrfs_raid_bio. Not 968 * this does not allocate any pages for rbio->pages. 969 */ 970 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, 971 struct btrfs_bio *bbio, 972 u64 stripe_len) 973 { 974 struct btrfs_raid_bio *rbio; 975 int nr_data = 0; 976 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; 977 int num_pages = rbio_nr_pages(stripe_len, real_stripes); 978 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); 979 void *p; 980 981 rbio = kzalloc(sizeof(*rbio) + 982 sizeof(*rbio->stripe_pages) * num_pages + 983 sizeof(*rbio->bio_pages) * num_pages + 984 sizeof(*rbio->finish_pointers) * real_stripes + 985 sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) + 986 sizeof(*rbio->finish_pbitmap) * 987 BITS_TO_LONGS(stripe_npages), 988 GFP_NOFS); 989 if (!rbio) 990 return ERR_PTR(-ENOMEM); 991 992 bio_list_init(&rbio->bio_list); 993 INIT_LIST_HEAD(&rbio->plug_list); 994 spin_lock_init(&rbio->bio_list_lock); 995 INIT_LIST_HEAD(&rbio->stripe_cache); 996 INIT_LIST_HEAD(&rbio->hash_list); 997 rbio->bbio = bbio; 998 rbio->fs_info = fs_info; 999 rbio->stripe_len = stripe_len; 1000 rbio->nr_pages = num_pages; 1001 rbio->real_stripes = real_stripes; 1002 rbio->stripe_npages = stripe_npages; 1003 rbio->faila = -1; 1004 rbio->failb = -1; 1005 refcount_set(&rbio->refs, 1); 1006 atomic_set(&rbio->error, 0); 1007 atomic_set(&rbio->stripes_pending, 0); 1008 1009 /* 1010 * the stripe_pages, bio_pages, etc arrays point to the extra 1011 * memory we allocated past the end of the rbio 1012 */ 1013 p = rbio + 1; 1014 #define CONSUME_ALLOC(ptr, count) do { \ 1015 ptr = p; \ 1016 p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ 1017 } while (0) 1018 CONSUME_ALLOC(rbio->stripe_pages, num_pages); 1019 CONSUME_ALLOC(rbio->bio_pages, num_pages); 1020 CONSUME_ALLOC(rbio->finish_pointers, real_stripes); 1021 CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages)); 1022 CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages)); 1023 #undef CONSUME_ALLOC 1024 1025 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) 1026 nr_data = real_stripes - 1; 1027 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) 1028 nr_data = real_stripes - 2; 1029 else 1030 BUG(); 1031 1032 rbio->nr_data = nr_data; 1033 return rbio; 1034 } 1035 1036 /* allocate pages for all the stripes in the bio, including parity */ 1037 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 1038 { 1039 int i; 1040 struct page *page; 1041 1042 for (i = 0; i < rbio->nr_pages; i++) { 1043 if (rbio->stripe_pages[i]) 1044 continue; 1045 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1046 if (!page) 1047 return -ENOMEM; 1048 rbio->stripe_pages[i] = page; 1049 } 1050 return 0; 1051 } 1052 1053 /* only allocate pages for p/q stripes */ 1054 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 1055 { 1056 int i; 1057 struct page *page; 1058 1059 i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); 1060 1061 for (; i < rbio->nr_pages; i++) { 1062 if (rbio->stripe_pages[i]) 1063 continue; 1064 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1065 if (!page) 1066 return -ENOMEM; 1067 rbio->stripe_pages[i] = page; 1068 } 1069 return 0; 1070 } 1071 1072 /* 1073 * add a single page from a specific stripe into our list of bios for IO 1074 * this will try to merge into existing bios if possible, and returns 1075 * zero if all went well. 1076 */ 1077 static int rbio_add_io_page(struct btrfs_raid_bio *rbio, 1078 struct bio_list *bio_list, 1079 struct page *page, 1080 int stripe_nr, 1081 unsigned long page_index, 1082 unsigned long bio_max_len) 1083 { 1084 struct bio *last = bio_list->tail; 1085 int ret; 1086 struct bio *bio; 1087 struct btrfs_bio_stripe *stripe; 1088 u64 disk_start; 1089 1090 stripe = &rbio->bbio->stripes[stripe_nr]; 1091 disk_start = stripe->physical + (page_index << PAGE_SHIFT); 1092 1093 /* if the device is missing, just fail this stripe */ 1094 if (!stripe->dev->bdev) 1095 return fail_rbio_index(rbio, stripe_nr); 1096 1097 /* see if we can add this page onto our existing bio */ 1098 if (last) { 1099 u64 last_end = last->bi_iter.bi_sector << 9; 1100 last_end += last->bi_iter.bi_size; 1101 1102 /* 1103 * we can't merge these if they are from different 1104 * devices or if they are not contiguous 1105 */ 1106 if (last_end == disk_start && !last->bi_status && 1107 last->bi_bdev == stripe->dev->bdev) { 1108 ret = bio_add_page(last, page, PAGE_SIZE, 0); 1109 if (ret == PAGE_SIZE) 1110 return 0; 1111 } 1112 } 1113 1114 /* put a new bio on the list */ 1115 bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); 1116 btrfs_io_bio(bio)->device = stripe->dev; 1117 bio->bi_iter.bi_size = 0; 1118 bio_set_dev(bio, stripe->dev->bdev); 1119 bio->bi_iter.bi_sector = disk_start >> 9; 1120 1121 bio_add_page(bio, page, PAGE_SIZE, 0); 1122 bio_list_add(bio_list, bio); 1123 return 0; 1124 } 1125 1126 /* 1127 * while we're doing the read/modify/write cycle, we could 1128 * have errors in reading pages off the disk. This checks 1129 * for errors and if we're not able to read the page it'll 1130 * trigger parity reconstruction. The rmw will be finished 1131 * after we've reconstructed the failed stripes 1132 */ 1133 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1134 { 1135 if (rbio->faila >= 0 || rbio->failb >= 0) { 1136 BUG_ON(rbio->faila == rbio->real_stripes - 1); 1137 __raid56_parity_recover(rbio); 1138 } else { 1139 finish_rmw(rbio); 1140 } 1141 } 1142 1143 /* 1144 * helper function to walk our bio list and populate the bio_pages array with 1145 * the result. This seems expensive, but it is faster than constantly 1146 * searching through the bio list as we setup the IO in finish_rmw or stripe 1147 * reconstruction. 1148 * 1149 * This must be called before you trust the answers from page_in_rbio 1150 */ 1151 static void index_rbio_pages(struct btrfs_raid_bio *rbio) 1152 { 1153 struct bio *bio; 1154 u64 start; 1155 unsigned long stripe_offset; 1156 unsigned long page_index; 1157 1158 spin_lock_irq(&rbio->bio_list_lock); 1159 bio_list_for_each(bio, &rbio->bio_list) { 1160 struct bio_vec bvec; 1161 struct bvec_iter iter; 1162 int i = 0; 1163 1164 start = bio->bi_iter.bi_sector << 9; 1165 stripe_offset = start - rbio->bbio->raid_map[0]; 1166 page_index = stripe_offset >> PAGE_SHIFT; 1167 1168 if (bio_flagged(bio, BIO_CLONED)) 1169 bio->bi_iter = btrfs_io_bio(bio)->iter; 1170 1171 bio_for_each_segment(bvec, bio, iter) { 1172 rbio->bio_pages[page_index + i] = bvec.bv_page; 1173 i++; 1174 } 1175 } 1176 spin_unlock_irq(&rbio->bio_list_lock); 1177 } 1178 1179 /* 1180 * this is called from one of two situations. We either 1181 * have a full stripe from the higher layers, or we've read all 1182 * the missing bits off disk. 1183 * 1184 * This will calculate the parity and then send down any 1185 * changed blocks. 1186 */ 1187 static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1188 { 1189 struct btrfs_bio *bbio = rbio->bbio; 1190 void **pointers = rbio->finish_pointers; 1191 int nr_data = rbio->nr_data; 1192 int stripe; 1193 int pagenr; 1194 bool has_qstripe; 1195 struct bio_list bio_list; 1196 struct bio *bio; 1197 int ret; 1198 1199 bio_list_init(&bio_list); 1200 1201 if (rbio->real_stripes - rbio->nr_data == 1) 1202 has_qstripe = false; 1203 else if (rbio->real_stripes - rbio->nr_data == 2) 1204 has_qstripe = true; 1205 else 1206 BUG(); 1207 1208 /* at this point we either have a full stripe, 1209 * or we've read the full stripe from the drive. 1210 * recalculate the parity and write the new results. 1211 * 1212 * We're not allowed to add any new bios to the 1213 * bio list here, anyone else that wants to 1214 * change this stripe needs to do their own rmw. 1215 */ 1216 spin_lock_irq(&rbio->bio_list_lock); 1217 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1218 spin_unlock_irq(&rbio->bio_list_lock); 1219 1220 atomic_set(&rbio->error, 0); 1221 1222 /* 1223 * now that we've set rmw_locked, run through the 1224 * bio list one last time and map the page pointers 1225 * 1226 * We don't cache full rbios because we're assuming 1227 * the higher layers are unlikely to use this area of 1228 * the disk again soon. If they do use it again, 1229 * hopefully they will send another full bio. 1230 */ 1231 index_rbio_pages(rbio); 1232 if (!rbio_is_full(rbio)) 1233 cache_rbio_pages(rbio); 1234 else 1235 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1236 1237 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1238 struct page *p; 1239 /* first collect one page from each data stripe */ 1240 for (stripe = 0; stripe < nr_data; stripe++) { 1241 p = page_in_rbio(rbio, stripe, pagenr, 0); 1242 pointers[stripe] = kmap(p); 1243 } 1244 1245 /* then add the parity stripe */ 1246 p = rbio_pstripe_page(rbio, pagenr); 1247 SetPageUptodate(p); 1248 pointers[stripe++] = kmap(p); 1249 1250 if (has_qstripe) { 1251 1252 /* 1253 * raid6, add the qstripe and call the 1254 * library function to fill in our p/q 1255 */ 1256 p = rbio_qstripe_page(rbio, pagenr); 1257 SetPageUptodate(p); 1258 pointers[stripe++] = kmap(p); 1259 1260 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 1261 pointers); 1262 } else { 1263 /* raid5 */ 1264 copy_page(pointers[nr_data], pointers[0]); 1265 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); 1266 } 1267 1268 1269 for (stripe = 0; stripe < rbio->real_stripes; stripe++) 1270 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 1271 } 1272 1273 /* 1274 * time to start writing. Make bios for everything from the 1275 * higher layers (the bio_list in our rbio) and our p/q. Ignore 1276 * everything else. 1277 */ 1278 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1279 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1280 struct page *page; 1281 if (stripe < rbio->nr_data) { 1282 page = page_in_rbio(rbio, stripe, pagenr, 1); 1283 if (!page) 1284 continue; 1285 } else { 1286 page = rbio_stripe_page(rbio, stripe, pagenr); 1287 } 1288 1289 ret = rbio_add_io_page(rbio, &bio_list, 1290 page, stripe, pagenr, rbio->stripe_len); 1291 if (ret) 1292 goto cleanup; 1293 } 1294 } 1295 1296 if (likely(!bbio->num_tgtdevs)) 1297 goto write_data; 1298 1299 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1300 if (!bbio->tgtdev_map[stripe]) 1301 continue; 1302 1303 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1304 struct page *page; 1305 if (stripe < rbio->nr_data) { 1306 page = page_in_rbio(rbio, stripe, pagenr, 1); 1307 if (!page) 1308 continue; 1309 } else { 1310 page = rbio_stripe_page(rbio, stripe, pagenr); 1311 } 1312 1313 ret = rbio_add_io_page(rbio, &bio_list, page, 1314 rbio->bbio->tgtdev_map[stripe], 1315 pagenr, rbio->stripe_len); 1316 if (ret) 1317 goto cleanup; 1318 } 1319 } 1320 1321 write_data: 1322 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); 1323 BUG_ON(atomic_read(&rbio->stripes_pending) == 0); 1324 1325 while ((bio = bio_list_pop(&bio_list))) { 1326 bio->bi_private = rbio; 1327 bio->bi_end_io = raid_write_end_io; 1328 bio->bi_opf = REQ_OP_WRITE; 1329 1330 submit_bio(bio); 1331 } 1332 return; 1333 1334 cleanup: 1335 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1336 1337 while ((bio = bio_list_pop(&bio_list))) 1338 bio_put(bio); 1339 } 1340 1341 /* 1342 * helper to find the stripe number for a given bio. Used to figure out which 1343 * stripe has failed. This expects the bio to correspond to a physical disk, 1344 * so it looks up based on physical sector numbers. 1345 */ 1346 static int find_bio_stripe(struct btrfs_raid_bio *rbio, 1347 struct bio *bio) 1348 { 1349 u64 physical = bio->bi_iter.bi_sector; 1350 int i; 1351 struct btrfs_bio_stripe *stripe; 1352 1353 physical <<= 9; 1354 1355 for (i = 0; i < rbio->bbio->num_stripes; i++) { 1356 stripe = &rbio->bbio->stripes[i]; 1357 if (in_range(physical, stripe->physical, rbio->stripe_len) && 1358 stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { 1359 return i; 1360 } 1361 } 1362 return -1; 1363 } 1364 1365 /* 1366 * helper to find the stripe number for a given 1367 * bio (before mapping). Used to figure out which stripe has 1368 * failed. This looks up based on logical block numbers. 1369 */ 1370 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 1371 struct bio *bio) 1372 { 1373 u64 logical = bio->bi_iter.bi_sector << 9; 1374 int i; 1375 1376 for (i = 0; i < rbio->nr_data; i++) { 1377 u64 stripe_start = rbio->bbio->raid_map[i]; 1378 1379 if (in_range(logical, stripe_start, rbio->stripe_len)) 1380 return i; 1381 } 1382 return -1; 1383 } 1384 1385 /* 1386 * returns -EIO if we had too many failures 1387 */ 1388 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 1389 { 1390 unsigned long flags; 1391 int ret = 0; 1392 1393 spin_lock_irqsave(&rbio->bio_list_lock, flags); 1394 1395 /* we already know this stripe is bad, move on */ 1396 if (rbio->faila == failed || rbio->failb == failed) 1397 goto out; 1398 1399 if (rbio->faila == -1) { 1400 /* first failure on this rbio */ 1401 rbio->faila = failed; 1402 atomic_inc(&rbio->error); 1403 } else if (rbio->failb == -1) { 1404 /* second failure on this rbio */ 1405 rbio->failb = failed; 1406 atomic_inc(&rbio->error); 1407 } else { 1408 ret = -EIO; 1409 } 1410 out: 1411 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 1412 1413 return ret; 1414 } 1415 1416 /* 1417 * helper to fail a stripe based on a physical disk 1418 * bio. 1419 */ 1420 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 1421 struct bio *bio) 1422 { 1423 int failed = find_bio_stripe(rbio, bio); 1424 1425 if (failed < 0) 1426 return -EIO; 1427 1428 return fail_rbio_index(rbio, failed); 1429 } 1430 1431 /* 1432 * this sets each page in the bio uptodate. It should only be used on private 1433 * rbio pages, nothing that comes in from the higher layers 1434 */ 1435 static void set_bio_pages_uptodate(struct bio *bio) 1436 { 1437 struct bio_vec *bvec; 1438 struct bvec_iter_all iter_all; 1439 1440 ASSERT(!bio_flagged(bio, BIO_CLONED)); 1441 1442 bio_for_each_segment_all(bvec, bio, iter_all) 1443 SetPageUptodate(bvec->bv_page); 1444 } 1445 1446 /* 1447 * end io for the read phase of the rmw cycle. All the bios here are physical 1448 * stripe bios we've read from the disk so we can recalculate the parity of the 1449 * stripe. 1450 * 1451 * This will usually kick off finish_rmw once all the bios are read in, but it 1452 * may trigger parity reconstruction if we had any errors along the way 1453 */ 1454 static void raid_rmw_end_io(struct bio *bio) 1455 { 1456 struct btrfs_raid_bio *rbio = bio->bi_private; 1457 1458 if (bio->bi_status) 1459 fail_bio_stripe(rbio, bio); 1460 else 1461 set_bio_pages_uptodate(bio); 1462 1463 bio_put(bio); 1464 1465 if (!atomic_dec_and_test(&rbio->stripes_pending)) 1466 return; 1467 1468 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 1469 goto cleanup; 1470 1471 /* 1472 * this will normally call finish_rmw to start our write 1473 * but if there are any failed stripes we'll reconstruct 1474 * from parity first 1475 */ 1476 validate_rbio_for_rmw(rbio); 1477 return; 1478 1479 cleanup: 1480 1481 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1482 } 1483 1484 /* 1485 * the stripe must be locked by the caller. It will 1486 * unlock after all the writes are done 1487 */ 1488 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1489 { 1490 int bios_to_read = 0; 1491 struct bio_list bio_list; 1492 int ret; 1493 int pagenr; 1494 int stripe; 1495 struct bio *bio; 1496 1497 bio_list_init(&bio_list); 1498 1499 ret = alloc_rbio_pages(rbio); 1500 if (ret) 1501 goto cleanup; 1502 1503 index_rbio_pages(rbio); 1504 1505 atomic_set(&rbio->error, 0); 1506 /* 1507 * build a list of bios to read all the missing parts of this 1508 * stripe 1509 */ 1510 for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1511 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1512 struct page *page; 1513 /* 1514 * we want to find all the pages missing from 1515 * the rbio and read them from the disk. If 1516 * page_in_rbio finds a page in the bio list 1517 * we don't need to read it off the stripe. 1518 */ 1519 page = page_in_rbio(rbio, stripe, pagenr, 1); 1520 if (page) 1521 continue; 1522 1523 page = rbio_stripe_page(rbio, stripe, pagenr); 1524 /* 1525 * the bio cache may have handed us an uptodate 1526 * page. If so, be happy and use it 1527 */ 1528 if (PageUptodate(page)) 1529 continue; 1530 1531 ret = rbio_add_io_page(rbio, &bio_list, page, 1532 stripe, pagenr, rbio->stripe_len); 1533 if (ret) 1534 goto cleanup; 1535 } 1536 } 1537 1538 bios_to_read = bio_list_size(&bio_list); 1539 if (!bios_to_read) { 1540 /* 1541 * this can happen if others have merged with 1542 * us, it means there is nothing left to read. 1543 * But if there are missing devices it may not be 1544 * safe to do the full stripe write yet. 1545 */ 1546 goto finish; 1547 } 1548 1549 /* 1550 * the bbio may be freed once we submit the last bio. Make sure 1551 * not to touch it after that 1552 */ 1553 atomic_set(&rbio->stripes_pending, bios_to_read); 1554 while ((bio = bio_list_pop(&bio_list))) { 1555 bio->bi_private = rbio; 1556 bio->bi_end_io = raid_rmw_end_io; 1557 bio->bi_opf = REQ_OP_READ; 1558 1559 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 1560 1561 submit_bio(bio); 1562 } 1563 /* the actual write will happen once the reads are done */ 1564 return 0; 1565 1566 cleanup: 1567 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1568 1569 while ((bio = bio_list_pop(&bio_list))) 1570 bio_put(bio); 1571 1572 return -EIO; 1573 1574 finish: 1575 validate_rbio_for_rmw(rbio); 1576 return 0; 1577 } 1578 1579 /* 1580 * if the upper layers pass in a full stripe, we thank them by only allocating 1581 * enough pages to hold the parity, and sending it all down quickly. 1582 */ 1583 static int full_stripe_write(struct btrfs_raid_bio *rbio) 1584 { 1585 int ret; 1586 1587 ret = alloc_rbio_parity_pages(rbio); 1588 if (ret) { 1589 __free_raid_bio(rbio); 1590 return ret; 1591 } 1592 1593 ret = lock_stripe_add(rbio); 1594 if (ret == 0) 1595 finish_rmw(rbio); 1596 return 0; 1597 } 1598 1599 /* 1600 * partial stripe writes get handed over to async helpers. 1601 * We're really hoping to merge a few more writes into this 1602 * rbio before calculating new parity 1603 */ 1604 static int partial_stripe_write(struct btrfs_raid_bio *rbio) 1605 { 1606 int ret; 1607 1608 ret = lock_stripe_add(rbio); 1609 if (ret == 0) 1610 start_async_work(rbio, rmw_work); 1611 return 0; 1612 } 1613 1614 /* 1615 * sometimes while we were reading from the drive to 1616 * recalculate parity, enough new bios come into create 1617 * a full stripe. So we do a check here to see if we can 1618 * go directly to finish_rmw 1619 */ 1620 static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 1621 { 1622 /* head off into rmw land if we don't have a full stripe */ 1623 if (!rbio_is_full(rbio)) 1624 return partial_stripe_write(rbio); 1625 return full_stripe_write(rbio); 1626 } 1627 1628 /* 1629 * We use plugging call backs to collect full stripes. 1630 * Any time we get a partial stripe write while plugged 1631 * we collect it into a list. When the unplug comes down, 1632 * we sort the list by logical block number and merge 1633 * everything we can into the same rbios 1634 */ 1635 struct btrfs_plug_cb { 1636 struct blk_plug_cb cb; 1637 struct btrfs_fs_info *info; 1638 struct list_head rbio_list; 1639 struct btrfs_work work; 1640 }; 1641 1642 /* 1643 * rbios on the plug list are sorted for easier merging. 1644 */ 1645 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) 1646 { 1647 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 1648 plug_list); 1649 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 1650 plug_list); 1651 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 1652 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 1653 1654 if (a_sector < b_sector) 1655 return -1; 1656 if (a_sector > b_sector) 1657 return 1; 1658 return 0; 1659 } 1660 1661 static void run_plug(struct btrfs_plug_cb *plug) 1662 { 1663 struct btrfs_raid_bio *cur; 1664 struct btrfs_raid_bio *last = NULL; 1665 1666 /* 1667 * sort our plug list then try to merge 1668 * everything we can in hopes of creating full 1669 * stripes. 1670 */ 1671 list_sort(NULL, &plug->rbio_list, plug_cmp); 1672 while (!list_empty(&plug->rbio_list)) { 1673 cur = list_entry(plug->rbio_list.next, 1674 struct btrfs_raid_bio, plug_list); 1675 list_del_init(&cur->plug_list); 1676 1677 if (rbio_is_full(cur)) { 1678 int ret; 1679 1680 /* we have a full stripe, send it down */ 1681 ret = full_stripe_write(cur); 1682 BUG_ON(ret); 1683 continue; 1684 } 1685 if (last) { 1686 if (rbio_can_merge(last, cur)) { 1687 merge_rbio(last, cur); 1688 __free_raid_bio(cur); 1689 continue; 1690 1691 } 1692 __raid56_parity_write(last); 1693 } 1694 last = cur; 1695 } 1696 if (last) { 1697 __raid56_parity_write(last); 1698 } 1699 kfree(plug); 1700 } 1701 1702 /* 1703 * if the unplug comes from schedule, we have to push the 1704 * work off to a helper thread 1705 */ 1706 static void unplug_work(struct btrfs_work *work) 1707 { 1708 struct btrfs_plug_cb *plug; 1709 plug = container_of(work, struct btrfs_plug_cb, work); 1710 run_plug(plug); 1711 } 1712 1713 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 1714 { 1715 struct btrfs_plug_cb *plug; 1716 plug = container_of(cb, struct btrfs_plug_cb, cb); 1717 1718 if (from_schedule) { 1719 btrfs_init_work(&plug->work, unplug_work, NULL, NULL); 1720 btrfs_queue_work(plug->info->rmw_workers, 1721 &plug->work); 1722 return; 1723 } 1724 run_plug(plug); 1725 } 1726 1727 /* 1728 * our main entry point for writes from the rest of the FS. 1729 */ 1730 int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, 1731 struct btrfs_bio *bbio, u64 stripe_len) 1732 { 1733 struct btrfs_raid_bio *rbio; 1734 struct btrfs_plug_cb *plug = NULL; 1735 struct blk_plug_cb *cb; 1736 int ret; 1737 1738 rbio = alloc_rbio(fs_info, bbio, stripe_len); 1739 if (IS_ERR(rbio)) { 1740 btrfs_put_bbio(bbio); 1741 return PTR_ERR(rbio); 1742 } 1743 bio_list_add(&rbio->bio_list, bio); 1744 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1745 rbio->operation = BTRFS_RBIO_WRITE; 1746 1747 btrfs_bio_counter_inc_noblocked(fs_info); 1748 rbio->generic_bio_cnt = 1; 1749 1750 /* 1751 * don't plug on full rbios, just get them out the door 1752 * as quickly as we can 1753 */ 1754 if (rbio_is_full(rbio)) { 1755 ret = full_stripe_write(rbio); 1756 if (ret) 1757 btrfs_bio_counter_dec(fs_info); 1758 return ret; 1759 } 1760 1761 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); 1762 if (cb) { 1763 plug = container_of(cb, struct btrfs_plug_cb, cb); 1764 if (!plug->info) { 1765 plug->info = fs_info; 1766 INIT_LIST_HEAD(&plug->rbio_list); 1767 } 1768 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1769 ret = 0; 1770 } else { 1771 ret = __raid56_parity_write(rbio); 1772 if (ret) 1773 btrfs_bio_counter_dec(fs_info); 1774 } 1775 return ret; 1776 } 1777 1778 /* 1779 * all parity reconstruction happens here. We've read in everything 1780 * we can find from the drives and this does the heavy lifting of 1781 * sorting the good from the bad. 1782 */ 1783 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 1784 { 1785 int pagenr, stripe; 1786 void **pointers; 1787 int faila = -1, failb = -1; 1788 struct page *page; 1789 blk_status_t err; 1790 int i; 1791 1792 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1793 if (!pointers) { 1794 err = BLK_STS_RESOURCE; 1795 goto cleanup_io; 1796 } 1797 1798 faila = rbio->faila; 1799 failb = rbio->failb; 1800 1801 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1802 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 1803 spin_lock_irq(&rbio->bio_list_lock); 1804 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1805 spin_unlock_irq(&rbio->bio_list_lock); 1806 } 1807 1808 index_rbio_pages(rbio); 1809 1810 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1811 /* 1812 * Now we just use bitmap to mark the horizontal stripes in 1813 * which we have data when doing parity scrub. 1814 */ 1815 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 1816 !test_bit(pagenr, rbio->dbitmap)) 1817 continue; 1818 1819 /* setup our array of pointers with pages 1820 * from each stripe 1821 */ 1822 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1823 /* 1824 * if we're rebuilding a read, we have to use 1825 * pages from the bio list 1826 */ 1827 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1828 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 1829 (stripe == faila || stripe == failb)) { 1830 page = page_in_rbio(rbio, stripe, pagenr, 0); 1831 } else { 1832 page = rbio_stripe_page(rbio, stripe, pagenr); 1833 } 1834 pointers[stripe] = kmap(page); 1835 } 1836 1837 /* all raid6 handling here */ 1838 if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { 1839 /* 1840 * single failure, rebuild from parity raid5 1841 * style 1842 */ 1843 if (failb < 0) { 1844 if (faila == rbio->nr_data) { 1845 /* 1846 * Just the P stripe has failed, without 1847 * a bad data or Q stripe. 1848 * TODO, we should redo the xor here. 1849 */ 1850 err = BLK_STS_IOERR; 1851 goto cleanup; 1852 } 1853 /* 1854 * a single failure in raid6 is rebuilt 1855 * in the pstripe code below 1856 */ 1857 goto pstripe; 1858 } 1859 1860 /* make sure our ps and qs are in order */ 1861 if (faila > failb) 1862 swap(faila, failb); 1863 1864 /* if the q stripe is failed, do a pstripe reconstruction 1865 * from the xors. 1866 * If both the q stripe and the P stripe are failed, we're 1867 * here due to a crc mismatch and we can't give them the 1868 * data they want 1869 */ 1870 if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { 1871 if (rbio->bbio->raid_map[faila] == 1872 RAID5_P_STRIPE) { 1873 err = BLK_STS_IOERR; 1874 goto cleanup; 1875 } 1876 /* 1877 * otherwise we have one bad data stripe and 1878 * a good P stripe. raid5! 1879 */ 1880 goto pstripe; 1881 } 1882 1883 if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { 1884 raid6_datap_recov(rbio->real_stripes, 1885 PAGE_SIZE, faila, pointers); 1886 } else { 1887 raid6_2data_recov(rbio->real_stripes, 1888 PAGE_SIZE, faila, failb, 1889 pointers); 1890 } 1891 } else { 1892 void *p; 1893 1894 /* rebuild from P stripe here (raid5 or raid6) */ 1895 BUG_ON(failb != -1); 1896 pstripe: 1897 /* Copy parity block into failed block to start with */ 1898 copy_page(pointers[faila], pointers[rbio->nr_data]); 1899 1900 /* rearrange the pointer array */ 1901 p = pointers[faila]; 1902 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) 1903 pointers[stripe] = pointers[stripe + 1]; 1904 pointers[rbio->nr_data - 1] = p; 1905 1906 /* xor in the rest */ 1907 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE); 1908 } 1909 /* if we're doing this rebuild as part of an rmw, go through 1910 * and set all of our private rbio pages in the 1911 * failed stripes as uptodate. This way finish_rmw will 1912 * know they can be trusted. If this was a read reconstruction, 1913 * other endio functions will fiddle the uptodate bits 1914 */ 1915 if (rbio->operation == BTRFS_RBIO_WRITE) { 1916 for (i = 0; i < rbio->stripe_npages; i++) { 1917 if (faila != -1) { 1918 page = rbio_stripe_page(rbio, faila, i); 1919 SetPageUptodate(page); 1920 } 1921 if (failb != -1) { 1922 page = rbio_stripe_page(rbio, failb, i); 1923 SetPageUptodate(page); 1924 } 1925 } 1926 } 1927 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1928 /* 1929 * if we're rebuilding a read, we have to use 1930 * pages from the bio list 1931 */ 1932 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1933 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 1934 (stripe == faila || stripe == failb)) { 1935 page = page_in_rbio(rbio, stripe, pagenr, 0); 1936 } else { 1937 page = rbio_stripe_page(rbio, stripe, pagenr); 1938 } 1939 kunmap(page); 1940 } 1941 } 1942 1943 err = BLK_STS_OK; 1944 cleanup: 1945 kfree(pointers); 1946 1947 cleanup_io: 1948 /* 1949 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a 1950 * valid rbio which is consistent with ondisk content, thus such a 1951 * valid rbio can be cached to avoid further disk reads. 1952 */ 1953 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1954 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 1955 /* 1956 * - In case of two failures, where rbio->failb != -1: 1957 * 1958 * Do not cache this rbio since the above read reconstruction 1959 * (raid6_datap_recov() or raid6_2data_recov()) may have 1960 * changed some content of stripes which are not identical to 1961 * on-disk content any more, otherwise, a later write/recover 1962 * may steal stripe_pages from this rbio and end up with 1963 * corruptions or rebuild failures. 1964 * 1965 * - In case of single failure, where rbio->failb == -1: 1966 * 1967 * Cache this rbio iff the above read reconstruction is 1968 * executed without problems. 1969 */ 1970 if (err == BLK_STS_OK && rbio->failb < 0) 1971 cache_rbio_pages(rbio); 1972 else 1973 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1974 1975 rbio_orig_end_io(rbio, err); 1976 } else if (err == BLK_STS_OK) { 1977 rbio->faila = -1; 1978 rbio->failb = -1; 1979 1980 if (rbio->operation == BTRFS_RBIO_WRITE) 1981 finish_rmw(rbio); 1982 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) 1983 finish_parity_scrub(rbio, 0); 1984 else 1985 BUG(); 1986 } else { 1987 rbio_orig_end_io(rbio, err); 1988 } 1989 } 1990 1991 /* 1992 * This is called only for stripes we've read from disk to 1993 * reconstruct the parity. 1994 */ 1995 static void raid_recover_end_io(struct bio *bio) 1996 { 1997 struct btrfs_raid_bio *rbio = bio->bi_private; 1998 1999 /* 2000 * we only read stripe pages off the disk, set them 2001 * up to date if there were no errors 2002 */ 2003 if (bio->bi_status) 2004 fail_bio_stripe(rbio, bio); 2005 else 2006 set_bio_pages_uptodate(bio); 2007 bio_put(bio); 2008 2009 if (!atomic_dec_and_test(&rbio->stripes_pending)) 2010 return; 2011 2012 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 2013 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2014 else 2015 __raid_recover_end_io(rbio); 2016 } 2017 2018 /* 2019 * reads everything we need off the disk to reconstruct 2020 * the parity. endio handlers trigger final reconstruction 2021 * when the IO is done. 2022 * 2023 * This is used both for reads from the higher layers and for 2024 * parity construction required to finish a rmw cycle. 2025 */ 2026 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 2027 { 2028 int bios_to_read = 0; 2029 struct bio_list bio_list; 2030 int ret; 2031 int pagenr; 2032 int stripe; 2033 struct bio *bio; 2034 2035 bio_list_init(&bio_list); 2036 2037 ret = alloc_rbio_pages(rbio); 2038 if (ret) 2039 goto cleanup; 2040 2041 atomic_set(&rbio->error, 0); 2042 2043 /* 2044 * read everything that hasn't failed. Thanks to the 2045 * stripe cache, it is possible that some or all of these 2046 * pages are going to be uptodate. 2047 */ 2048 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 2049 if (rbio->faila == stripe || rbio->failb == stripe) { 2050 atomic_inc(&rbio->error); 2051 continue; 2052 } 2053 2054 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 2055 struct page *p; 2056 2057 /* 2058 * the rmw code may have already read this 2059 * page in 2060 */ 2061 p = rbio_stripe_page(rbio, stripe, pagenr); 2062 if (PageUptodate(p)) 2063 continue; 2064 2065 ret = rbio_add_io_page(rbio, &bio_list, 2066 rbio_stripe_page(rbio, stripe, pagenr), 2067 stripe, pagenr, rbio->stripe_len); 2068 if (ret < 0) 2069 goto cleanup; 2070 } 2071 } 2072 2073 bios_to_read = bio_list_size(&bio_list); 2074 if (!bios_to_read) { 2075 /* 2076 * we might have no bios to read just because the pages 2077 * were up to date, or we might have no bios to read because 2078 * the devices were gone. 2079 */ 2080 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { 2081 __raid_recover_end_io(rbio); 2082 return 0; 2083 } else { 2084 goto cleanup; 2085 } 2086 } 2087 2088 /* 2089 * the bbio may be freed once we submit the last bio. Make sure 2090 * not to touch it after that 2091 */ 2092 atomic_set(&rbio->stripes_pending, bios_to_read); 2093 while ((bio = bio_list_pop(&bio_list))) { 2094 bio->bi_private = rbio; 2095 bio->bi_end_io = raid_recover_end_io; 2096 bio->bi_opf = REQ_OP_READ; 2097 2098 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 2099 2100 submit_bio(bio); 2101 } 2102 2103 return 0; 2104 2105 cleanup: 2106 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2107 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) 2108 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2109 2110 while ((bio = bio_list_pop(&bio_list))) 2111 bio_put(bio); 2112 2113 return -EIO; 2114 } 2115 2116 /* 2117 * the main entry point for reads from the higher layers. This 2118 * is really only called when the normal read path had a failure, 2119 * so we assume the bio they send down corresponds to a failed part 2120 * of the drive. 2121 */ 2122 int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, 2123 struct btrfs_bio *bbio, u64 stripe_len, 2124 int mirror_num, int generic_io) 2125 { 2126 struct btrfs_raid_bio *rbio; 2127 int ret; 2128 2129 if (generic_io) { 2130 ASSERT(bbio->mirror_num == mirror_num); 2131 btrfs_io_bio(bio)->mirror_num = mirror_num; 2132 } 2133 2134 rbio = alloc_rbio(fs_info, bbio, stripe_len); 2135 if (IS_ERR(rbio)) { 2136 if (generic_io) 2137 btrfs_put_bbio(bbio); 2138 return PTR_ERR(rbio); 2139 } 2140 2141 rbio->operation = BTRFS_RBIO_READ_REBUILD; 2142 bio_list_add(&rbio->bio_list, bio); 2143 rbio->bio_list_bytes = bio->bi_iter.bi_size; 2144 2145 rbio->faila = find_logical_bio_stripe(rbio, bio); 2146 if (rbio->faila == -1) { 2147 btrfs_warn(fs_info, 2148 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)", 2149 __func__, bio->bi_iter.bi_sector << 9, 2150 (u64)bio->bi_iter.bi_size, bbio->map_type); 2151 if (generic_io) 2152 btrfs_put_bbio(bbio); 2153 kfree(rbio); 2154 return -EIO; 2155 } 2156 2157 if (generic_io) { 2158 btrfs_bio_counter_inc_noblocked(fs_info); 2159 rbio->generic_bio_cnt = 1; 2160 } else { 2161 btrfs_get_bbio(bbio); 2162 } 2163 2164 /* 2165 * Loop retry: 2166 * for 'mirror == 2', reconstruct from all other stripes. 2167 * for 'mirror_num > 2', select a stripe to fail on every retry. 2168 */ 2169 if (mirror_num > 2) { 2170 /* 2171 * 'mirror == 3' is to fail the p stripe and 2172 * reconstruct from the q stripe. 'mirror > 3' is to 2173 * fail a data stripe and reconstruct from p+q stripe. 2174 */ 2175 rbio->failb = rbio->real_stripes - (mirror_num - 1); 2176 ASSERT(rbio->failb > 0); 2177 if (rbio->failb <= rbio->faila) 2178 rbio->failb--; 2179 } 2180 2181 ret = lock_stripe_add(rbio); 2182 2183 /* 2184 * __raid56_parity_recover will end the bio with 2185 * any errors it hits. We don't want to return 2186 * its error value up the stack because our caller 2187 * will end up calling bio_endio with any nonzero 2188 * return 2189 */ 2190 if (ret == 0) 2191 __raid56_parity_recover(rbio); 2192 /* 2193 * our rbio has been added to the list of 2194 * rbios that will be handled after the 2195 * currently lock owner is done 2196 */ 2197 return 0; 2198 2199 } 2200 2201 static void rmw_work(struct btrfs_work *work) 2202 { 2203 struct btrfs_raid_bio *rbio; 2204 2205 rbio = container_of(work, struct btrfs_raid_bio, work); 2206 raid56_rmw_stripe(rbio); 2207 } 2208 2209 static void read_rebuild_work(struct btrfs_work *work) 2210 { 2211 struct btrfs_raid_bio *rbio; 2212 2213 rbio = container_of(work, struct btrfs_raid_bio, work); 2214 __raid56_parity_recover(rbio); 2215 } 2216 2217 /* 2218 * The following code is used to scrub/replace the parity stripe 2219 * 2220 * Caller must have already increased bio_counter for getting @bbio. 2221 * 2222 * Note: We need make sure all the pages that add into the scrub/replace 2223 * raid bio are correct and not be changed during the scrub/replace. That 2224 * is those pages just hold metadata or file data with checksum. 2225 */ 2226 2227 struct btrfs_raid_bio * 2228 raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, 2229 struct btrfs_bio *bbio, u64 stripe_len, 2230 struct btrfs_device *scrub_dev, 2231 unsigned long *dbitmap, int stripe_nsectors) 2232 { 2233 struct btrfs_raid_bio *rbio; 2234 int i; 2235 2236 rbio = alloc_rbio(fs_info, bbio, stripe_len); 2237 if (IS_ERR(rbio)) 2238 return NULL; 2239 bio_list_add(&rbio->bio_list, bio); 2240 /* 2241 * This is a special bio which is used to hold the completion handler 2242 * and make the scrub rbio is similar to the other types 2243 */ 2244 ASSERT(!bio->bi_iter.bi_size); 2245 rbio->operation = BTRFS_RBIO_PARITY_SCRUB; 2246 2247 /* 2248 * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted 2249 * to the end position, so this search can start from the first parity 2250 * stripe. 2251 */ 2252 for (i = rbio->nr_data; i < rbio->real_stripes; i++) { 2253 if (bbio->stripes[i].dev == scrub_dev) { 2254 rbio->scrubp = i; 2255 break; 2256 } 2257 } 2258 ASSERT(i < rbio->real_stripes); 2259 2260 /* Now we just support the sectorsize equals to page size */ 2261 ASSERT(fs_info->sectorsize == PAGE_SIZE); 2262 ASSERT(rbio->stripe_npages == stripe_nsectors); 2263 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); 2264 2265 /* 2266 * We have already increased bio_counter when getting bbio, record it 2267 * so we can free it at rbio_orig_end_io(). 2268 */ 2269 rbio->generic_bio_cnt = 1; 2270 2271 return rbio; 2272 } 2273 2274 /* Used for both parity scrub and missing. */ 2275 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, 2276 u64 logical) 2277 { 2278 int stripe_offset; 2279 int index; 2280 2281 ASSERT(logical >= rbio->bbio->raid_map[0]); 2282 ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + 2283 rbio->stripe_len * rbio->nr_data); 2284 stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); 2285 index = stripe_offset >> PAGE_SHIFT; 2286 rbio->bio_pages[index] = page; 2287 } 2288 2289 /* 2290 * We just scrub the parity that we have correct data on the same horizontal, 2291 * so we needn't allocate all pages for all the stripes. 2292 */ 2293 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) 2294 { 2295 int i; 2296 int bit; 2297 int index; 2298 struct page *page; 2299 2300 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { 2301 for (i = 0; i < rbio->real_stripes; i++) { 2302 index = i * rbio->stripe_npages + bit; 2303 if (rbio->stripe_pages[index]) 2304 continue; 2305 2306 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2307 if (!page) 2308 return -ENOMEM; 2309 rbio->stripe_pages[index] = page; 2310 } 2311 } 2312 return 0; 2313 } 2314 2315 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 2316 int need_check) 2317 { 2318 struct btrfs_bio *bbio = rbio->bbio; 2319 void **pointers = rbio->finish_pointers; 2320 unsigned long *pbitmap = rbio->finish_pbitmap; 2321 int nr_data = rbio->nr_data; 2322 int stripe; 2323 int pagenr; 2324 bool has_qstripe; 2325 struct page *p_page = NULL; 2326 struct page *q_page = NULL; 2327 struct bio_list bio_list; 2328 struct bio *bio; 2329 int is_replace = 0; 2330 int ret; 2331 2332 bio_list_init(&bio_list); 2333 2334 if (rbio->real_stripes - rbio->nr_data == 1) 2335 has_qstripe = false; 2336 else if (rbio->real_stripes - rbio->nr_data == 2) 2337 has_qstripe = true; 2338 else 2339 BUG(); 2340 2341 if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { 2342 is_replace = 1; 2343 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); 2344 } 2345 2346 /* 2347 * Because the higher layers(scrubber) are unlikely to 2348 * use this area of the disk again soon, so don't cache 2349 * it. 2350 */ 2351 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2352 2353 if (!need_check) 2354 goto writeback; 2355 2356 p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2357 if (!p_page) 2358 goto cleanup; 2359 SetPageUptodate(p_page); 2360 2361 if (has_qstripe) { 2362 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2363 if (!q_page) { 2364 __free_page(p_page); 2365 goto cleanup; 2366 } 2367 SetPageUptodate(q_page); 2368 } 2369 2370 atomic_set(&rbio->error, 0); 2371 2372 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2373 struct page *p; 2374 void *parity; 2375 /* first collect one page from each data stripe */ 2376 for (stripe = 0; stripe < nr_data; stripe++) { 2377 p = page_in_rbio(rbio, stripe, pagenr, 0); 2378 pointers[stripe] = kmap(p); 2379 } 2380 2381 /* then add the parity stripe */ 2382 pointers[stripe++] = kmap(p_page); 2383 2384 if (has_qstripe) { 2385 /* 2386 * raid6, add the qstripe and call the 2387 * library function to fill in our p/q 2388 */ 2389 pointers[stripe++] = kmap(q_page); 2390 2391 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 2392 pointers); 2393 } else { 2394 /* raid5 */ 2395 copy_page(pointers[nr_data], pointers[0]); 2396 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); 2397 } 2398 2399 /* Check scrubbing parity and repair it */ 2400 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 2401 parity = kmap(p); 2402 if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) 2403 copy_page(parity, pointers[rbio->scrubp]); 2404 else 2405 /* Parity is right, needn't writeback */ 2406 bitmap_clear(rbio->dbitmap, pagenr, 1); 2407 kunmap(p); 2408 2409 for (stripe = 0; stripe < nr_data; stripe++) 2410 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 2411 kunmap(p_page); 2412 } 2413 2414 __free_page(p_page); 2415 if (q_page) 2416 __free_page(q_page); 2417 2418 writeback: 2419 /* 2420 * time to start writing. Make bios for everything from the 2421 * higher layers (the bio_list in our rbio) and our p/q. Ignore 2422 * everything else. 2423 */ 2424 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2425 struct page *page; 2426 2427 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 2428 ret = rbio_add_io_page(rbio, &bio_list, 2429 page, rbio->scrubp, pagenr, rbio->stripe_len); 2430 if (ret) 2431 goto cleanup; 2432 } 2433 2434 if (!is_replace) 2435 goto submit_write; 2436 2437 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { 2438 struct page *page; 2439 2440 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 2441 ret = rbio_add_io_page(rbio, &bio_list, page, 2442 bbio->tgtdev_map[rbio->scrubp], 2443 pagenr, rbio->stripe_len); 2444 if (ret) 2445 goto cleanup; 2446 } 2447 2448 submit_write: 2449 nr_data = bio_list_size(&bio_list); 2450 if (!nr_data) { 2451 /* Every parity is right */ 2452 rbio_orig_end_io(rbio, BLK_STS_OK); 2453 return; 2454 } 2455 2456 atomic_set(&rbio->stripes_pending, nr_data); 2457 2458 while ((bio = bio_list_pop(&bio_list))) { 2459 bio->bi_private = rbio; 2460 bio->bi_end_io = raid_write_end_io; 2461 bio->bi_opf = REQ_OP_WRITE; 2462 2463 submit_bio(bio); 2464 } 2465 return; 2466 2467 cleanup: 2468 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2469 2470 while ((bio = bio_list_pop(&bio_list))) 2471 bio_put(bio); 2472 } 2473 2474 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) 2475 { 2476 if (stripe >= 0 && stripe < rbio->nr_data) 2477 return 1; 2478 return 0; 2479 } 2480 2481 /* 2482 * While we're doing the parity check and repair, we could have errors 2483 * in reading pages off the disk. This checks for errors and if we're 2484 * not able to read the page it'll trigger parity reconstruction. The 2485 * parity scrub will be finished after we've reconstructed the failed 2486 * stripes 2487 */ 2488 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) 2489 { 2490 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 2491 goto cleanup; 2492 2493 if (rbio->faila >= 0 || rbio->failb >= 0) { 2494 int dfail = 0, failp = -1; 2495 2496 if (is_data_stripe(rbio, rbio->faila)) 2497 dfail++; 2498 else if (is_parity_stripe(rbio->faila)) 2499 failp = rbio->faila; 2500 2501 if (is_data_stripe(rbio, rbio->failb)) 2502 dfail++; 2503 else if (is_parity_stripe(rbio->failb)) 2504 failp = rbio->failb; 2505 2506 /* 2507 * Because we can not use a scrubbing parity to repair 2508 * the data, so the capability of the repair is declined. 2509 * (In the case of RAID5, we can not repair anything) 2510 */ 2511 if (dfail > rbio->bbio->max_errors - 1) 2512 goto cleanup; 2513 2514 /* 2515 * If all data is good, only parity is correctly, just 2516 * repair the parity. 2517 */ 2518 if (dfail == 0) { 2519 finish_parity_scrub(rbio, 0); 2520 return; 2521 } 2522 2523 /* 2524 * Here means we got one corrupted data stripe and one 2525 * corrupted parity on RAID6, if the corrupted parity 2526 * is scrubbing parity, luckily, use the other one to repair 2527 * the data, or we can not repair the data stripe. 2528 */ 2529 if (failp != rbio->scrubp) 2530 goto cleanup; 2531 2532 __raid_recover_end_io(rbio); 2533 } else { 2534 finish_parity_scrub(rbio, 1); 2535 } 2536 return; 2537 2538 cleanup: 2539 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2540 } 2541 2542 /* 2543 * end io for the read phase of the rmw cycle. All the bios here are physical 2544 * stripe bios we've read from the disk so we can recalculate the parity of the 2545 * stripe. 2546 * 2547 * This will usually kick off finish_rmw once all the bios are read in, but it 2548 * may trigger parity reconstruction if we had any errors along the way 2549 */ 2550 static void raid56_parity_scrub_end_io(struct bio *bio) 2551 { 2552 struct btrfs_raid_bio *rbio = bio->bi_private; 2553 2554 if (bio->bi_status) 2555 fail_bio_stripe(rbio, bio); 2556 else 2557 set_bio_pages_uptodate(bio); 2558 2559 bio_put(bio); 2560 2561 if (!atomic_dec_and_test(&rbio->stripes_pending)) 2562 return; 2563 2564 /* 2565 * this will normally call finish_rmw to start our write 2566 * but if there are any failed stripes we'll reconstruct 2567 * from parity first 2568 */ 2569 validate_rbio_for_parity_scrub(rbio); 2570 } 2571 2572 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) 2573 { 2574 int bios_to_read = 0; 2575 struct bio_list bio_list; 2576 int ret; 2577 int pagenr; 2578 int stripe; 2579 struct bio *bio; 2580 2581 bio_list_init(&bio_list); 2582 2583 ret = alloc_rbio_essential_pages(rbio); 2584 if (ret) 2585 goto cleanup; 2586 2587 atomic_set(&rbio->error, 0); 2588 /* 2589 * build a list of bios to read all the missing parts of this 2590 * stripe 2591 */ 2592 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 2593 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2594 struct page *page; 2595 /* 2596 * we want to find all the pages missing from 2597 * the rbio and read them from the disk. If 2598 * page_in_rbio finds a page in the bio list 2599 * we don't need to read it off the stripe. 2600 */ 2601 page = page_in_rbio(rbio, stripe, pagenr, 1); 2602 if (page) 2603 continue; 2604 2605 page = rbio_stripe_page(rbio, stripe, pagenr); 2606 /* 2607 * the bio cache may have handed us an uptodate 2608 * page. If so, be happy and use it 2609 */ 2610 if (PageUptodate(page)) 2611 continue; 2612 2613 ret = rbio_add_io_page(rbio, &bio_list, page, 2614 stripe, pagenr, rbio->stripe_len); 2615 if (ret) 2616 goto cleanup; 2617 } 2618 } 2619 2620 bios_to_read = bio_list_size(&bio_list); 2621 if (!bios_to_read) { 2622 /* 2623 * this can happen if others have merged with 2624 * us, it means there is nothing left to read. 2625 * But if there are missing devices it may not be 2626 * safe to do the full stripe write yet. 2627 */ 2628 goto finish; 2629 } 2630 2631 /* 2632 * the bbio may be freed once we submit the last bio. Make sure 2633 * not to touch it after that 2634 */ 2635 atomic_set(&rbio->stripes_pending, bios_to_read); 2636 while ((bio = bio_list_pop(&bio_list))) { 2637 bio->bi_private = rbio; 2638 bio->bi_end_io = raid56_parity_scrub_end_io; 2639 bio->bi_opf = REQ_OP_READ; 2640 2641 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 2642 2643 submit_bio(bio); 2644 } 2645 /* the actual write will happen once the reads are done */ 2646 return; 2647 2648 cleanup: 2649 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2650 2651 while ((bio = bio_list_pop(&bio_list))) 2652 bio_put(bio); 2653 2654 return; 2655 2656 finish: 2657 validate_rbio_for_parity_scrub(rbio); 2658 } 2659 2660 static void scrub_parity_work(struct btrfs_work *work) 2661 { 2662 struct btrfs_raid_bio *rbio; 2663 2664 rbio = container_of(work, struct btrfs_raid_bio, work); 2665 raid56_parity_scrub_stripe(rbio); 2666 } 2667 2668 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) 2669 { 2670 if (!lock_stripe_add(rbio)) 2671 start_async_work(rbio, scrub_parity_work); 2672 } 2673 2674 /* The following code is used for dev replace of a missing RAID 5/6 device. */ 2675 2676 struct btrfs_raid_bio * 2677 raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, 2678 struct btrfs_bio *bbio, u64 length) 2679 { 2680 struct btrfs_raid_bio *rbio; 2681 2682 rbio = alloc_rbio(fs_info, bbio, length); 2683 if (IS_ERR(rbio)) 2684 return NULL; 2685 2686 rbio->operation = BTRFS_RBIO_REBUILD_MISSING; 2687 bio_list_add(&rbio->bio_list, bio); 2688 /* 2689 * This is a special bio which is used to hold the completion handler 2690 * and make the scrub rbio is similar to the other types 2691 */ 2692 ASSERT(!bio->bi_iter.bi_size); 2693 2694 rbio->faila = find_logical_bio_stripe(rbio, bio); 2695 if (rbio->faila == -1) { 2696 BUG(); 2697 kfree(rbio); 2698 return NULL; 2699 } 2700 2701 /* 2702 * When we get bbio, we have already increased bio_counter, record it 2703 * so we can free it at rbio_orig_end_io() 2704 */ 2705 rbio->generic_bio_cnt = 1; 2706 2707 return rbio; 2708 } 2709 2710 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) 2711 { 2712 if (!lock_stripe_add(rbio)) 2713 start_async_work(rbio, read_rebuild_work); 2714 } 2715