1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2012 Fusion-io All rights reserved. 4 * Copyright (C) 2012 Intel Corp. All rights reserved. 5 */ 6 7 #include <linux/sched.h> 8 #include <linux/wait.h> 9 #include <linux/bio.h> 10 #include <linux/slab.h> 11 #include <linux/buffer_head.h> 12 #include <linux/blkdev.h> 13 #include <linux/random.h> 14 #include <linux/iocontext.h> 15 #include <linux/capability.h> 16 #include <linux/ratelimit.h> 17 #include <linux/kthread.h> 18 #include <linux/raid/pq.h> 19 #include <linux/hash.h> 20 #include <linux/list_sort.h> 21 #include <linux/raid/xor.h> 22 #include <linux/mm.h> 23 #include <asm/div64.h> 24 #include "ctree.h" 25 #include "extent_map.h" 26 #include "disk-io.h" 27 #include "transaction.h" 28 #include "print-tree.h" 29 #include "volumes.h" 30 #include "raid56.h" 31 #include "async-thread.h" 32 #include "check-integrity.h" 33 #include "rcu-string.h" 34 35 /* set when additional merges to this rbio are not allowed */ 36 #define RBIO_RMW_LOCKED_BIT 1 37 38 /* 39 * set when this rbio is sitting in the hash, but it is just a cache 40 * of past RMW 41 */ 42 #define RBIO_CACHE_BIT 2 43 44 /* 45 * set when it is safe to trust the stripe_pages for caching 46 */ 47 #define RBIO_CACHE_READY_BIT 3 48 49 #define RBIO_CACHE_SIZE 1024 50 51 enum btrfs_rbio_ops { 52 BTRFS_RBIO_WRITE, 53 BTRFS_RBIO_READ_REBUILD, 54 BTRFS_RBIO_PARITY_SCRUB, 55 BTRFS_RBIO_REBUILD_MISSING, 56 }; 57 58 struct btrfs_raid_bio { 59 struct btrfs_fs_info *fs_info; 60 struct btrfs_bio *bbio; 61 62 /* while we're doing rmw on a stripe 63 * we put it into a hash table so we can 64 * lock the stripe and merge more rbios 65 * into it. 66 */ 67 struct list_head hash_list; 68 69 /* 70 * LRU list for the stripe cache 71 */ 72 struct list_head stripe_cache; 73 74 /* 75 * for scheduling work in the helper threads 76 */ 77 struct btrfs_work work; 78 79 /* 80 * bio list and bio_list_lock are used 81 * to add more bios into the stripe 82 * in hopes of avoiding the full rmw 83 */ 84 struct bio_list bio_list; 85 spinlock_t bio_list_lock; 86 87 /* also protected by the bio_list_lock, the 88 * plug list is used by the plugging code 89 * to collect partial bios while plugged. The 90 * stripe locking code also uses it to hand off 91 * the stripe lock to the next pending IO 92 */ 93 struct list_head plug_list; 94 95 /* 96 * flags that tell us if it is safe to 97 * merge with this bio 98 */ 99 unsigned long flags; 100 101 /* size of each individual stripe on disk */ 102 int stripe_len; 103 104 /* number of data stripes (no p/q) */ 105 int nr_data; 106 107 int real_stripes; 108 109 int stripe_npages; 110 /* 111 * set if we're doing a parity rebuild 112 * for a read from higher up, which is handled 113 * differently from a parity rebuild as part of 114 * rmw 115 */ 116 enum btrfs_rbio_ops operation; 117 118 /* first bad stripe */ 119 int faila; 120 121 /* second bad stripe (for raid6 use) */ 122 int failb; 123 124 int scrubp; 125 /* 126 * number of pages needed to represent the full 127 * stripe 128 */ 129 int nr_pages; 130 131 /* 132 * size of all the bios in the bio_list. This 133 * helps us decide if the rbio maps to a full 134 * stripe or not 135 */ 136 int bio_list_bytes; 137 138 int generic_bio_cnt; 139 140 refcount_t refs; 141 142 atomic_t stripes_pending; 143 144 atomic_t error; 145 /* 146 * these are two arrays of pointers. 
We allocate the 147 * rbio big enough to hold them both and setup their 148 * locations when the rbio is allocated 149 */ 150 151 /* pointers to pages that we allocated for 152 * reading/writing stripes directly from the disk (including P/Q) 153 */ 154 struct page **stripe_pages; 155 156 /* 157 * pointers to the pages in the bio_list. Stored 158 * here for faster lookup 159 */ 160 struct page **bio_pages; 161 162 /* 163 * bitmap to record which horizontal stripe has data 164 */ 165 unsigned long *dbitmap; 166 }; 167 168 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 169 static noinline void finish_rmw(struct btrfs_raid_bio *rbio); 170 static void rmw_work(struct btrfs_work *work); 171 static void read_rebuild_work(struct btrfs_work *work); 172 static void async_rmw_stripe(struct btrfs_raid_bio *rbio); 173 static void async_read_rebuild(struct btrfs_raid_bio *rbio); 174 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); 175 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); 176 static void __free_raid_bio(struct btrfs_raid_bio *rbio); 177 static void index_rbio_pages(struct btrfs_raid_bio *rbio); 178 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 179 180 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 181 int need_check); 182 static void async_scrub_parity(struct btrfs_raid_bio *rbio); 183 184 /* 185 * the stripe hash table is used for locking, and to collect 186 * bios in hopes of making a full stripe 187 */ 188 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 189 { 190 struct btrfs_stripe_hash_table *table; 191 struct btrfs_stripe_hash_table *x; 192 struct btrfs_stripe_hash *cur; 193 struct btrfs_stripe_hash *h; 194 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; 195 int i; 196 int table_size; 197 198 if (info->stripe_hash_table) 199 return 0; 200 201 /* 202 * The table is large, starting with order 4 and can go as high as 203 * order 7 in case lock debugging is turned on. 204 * 205 * Try harder to allocate and fallback to vmalloc to lower the chance 206 * of a failing mount. 207 */ 208 table_size = sizeof(*table) + sizeof(*h) * num_entries; 209 table = kvzalloc(table_size, GFP_KERNEL); 210 if (!table) 211 return -ENOMEM; 212 213 spin_lock_init(&table->cache_lock); 214 INIT_LIST_HEAD(&table->stripe_cache); 215 216 h = table->table; 217 218 for (i = 0; i < num_entries; i++) { 219 cur = h + i; 220 INIT_LIST_HEAD(&cur->hash_list); 221 spin_lock_init(&cur->lock); 222 } 223 224 x = cmpxchg(&info->stripe_hash_table, NULL, table); 225 if (x) 226 kvfree(x); 227 return 0; 228 } 229 230 /* 231 * caching an rbio means to copy anything from the 232 * bio_pages array into the stripe_pages array. We 233 * use the page uptodate bit in the stripe cache array 234 * to indicate if it has valid data 235 * 236 * once the caching is done, we set the cache ready 237 * bit. 
238 */ 239 static void cache_rbio_pages(struct btrfs_raid_bio *rbio) 240 { 241 int i; 242 char *s; 243 char *d; 244 int ret; 245 246 ret = alloc_rbio_pages(rbio); 247 if (ret) 248 return; 249 250 for (i = 0; i < rbio->nr_pages; i++) { 251 if (!rbio->bio_pages[i]) 252 continue; 253 254 s = kmap(rbio->bio_pages[i]); 255 d = kmap(rbio->stripe_pages[i]); 256 257 memcpy(d, s, PAGE_SIZE); 258 259 kunmap(rbio->bio_pages[i]); 260 kunmap(rbio->stripe_pages[i]); 261 SetPageUptodate(rbio->stripe_pages[i]); 262 } 263 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 264 } 265 266 /* 267 * we hash on the first logical address of the stripe 268 */ 269 static int rbio_bucket(struct btrfs_raid_bio *rbio) 270 { 271 u64 num = rbio->bbio->raid_map[0]; 272 273 /* 274 * we shift down quite a bit. We're using byte 275 * addressing, and most of the lower bits are zeros. 276 * This tends to upset hash_64, and it consistently 277 * returns just one or two different values. 278 * 279 * shifting off the lower bits fixes things. 280 */ 281 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 282 } 283 284 /* 285 * stealing an rbio means taking all the uptodate pages from the stripe 286 * array in the source rbio and putting them into the destination rbio 287 */ 288 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) 289 { 290 int i; 291 struct page *s; 292 struct page *d; 293 294 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) 295 return; 296 297 for (i = 0; i < dest->nr_pages; i++) { 298 s = src->stripe_pages[i]; 299 if (!s || !PageUptodate(s)) { 300 continue; 301 } 302 303 d = dest->stripe_pages[i]; 304 if (d) 305 __free_page(d); 306 307 dest->stripe_pages[i] = s; 308 src->stripe_pages[i] = NULL; 309 } 310 } 311 312 /* 313 * merging means we take the bio_list from the victim and 314 * splice it into the destination. The victim should 315 * be discarded afterwards. 316 * 317 * must be called with dest->rbio_list_lock held 318 */ 319 static void merge_rbio(struct btrfs_raid_bio *dest, 320 struct btrfs_raid_bio *victim) 321 { 322 bio_list_merge(&dest->bio_list, &victim->bio_list); 323 dest->bio_list_bytes += victim->bio_list_bytes; 324 dest->generic_bio_cnt += victim->generic_bio_cnt; 325 bio_list_init(&victim->bio_list); 326 } 327 328 /* 329 * used to prune items that are in the cache. The caller 330 * must hold the hash table lock. 331 */ 332 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 333 { 334 int bucket = rbio_bucket(rbio); 335 struct btrfs_stripe_hash_table *table; 336 struct btrfs_stripe_hash *h; 337 int freeit = 0; 338 339 /* 340 * check the bit again under the hash table lock. 341 */ 342 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 343 return; 344 345 table = rbio->fs_info->stripe_hash_table; 346 h = table->table + bucket; 347 348 /* hold the lock for the bucket because we may be 349 * removing it from the hash table 350 */ 351 spin_lock(&h->lock); 352 353 /* 354 * hold the lock for the bio list because we need 355 * to make sure the bio list is empty 356 */ 357 spin_lock(&rbio->bio_list_lock); 358 359 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { 360 list_del_init(&rbio->stripe_cache); 361 table->cache_size -= 1; 362 freeit = 1; 363 364 /* if the bio list isn't empty, this rbio is 365 * still involved in an IO. We take it out 366 * of the cache list, and drop the ref that 367 * was held for the list. 
368 * 369 * If the bio_list was empty, we also remove 370 * the rbio from the hash_table, and drop 371 * the corresponding ref 372 */ 373 if (bio_list_empty(&rbio->bio_list)) { 374 if (!list_empty(&rbio->hash_list)) { 375 list_del_init(&rbio->hash_list); 376 refcount_dec(&rbio->refs); 377 BUG_ON(!list_empty(&rbio->plug_list)); 378 } 379 } 380 } 381 382 spin_unlock(&rbio->bio_list_lock); 383 spin_unlock(&h->lock); 384 385 if (freeit) 386 __free_raid_bio(rbio); 387 } 388 389 /* 390 * prune a given rbio from the cache 391 */ 392 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 393 { 394 struct btrfs_stripe_hash_table *table; 395 unsigned long flags; 396 397 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 398 return; 399 400 table = rbio->fs_info->stripe_hash_table; 401 402 spin_lock_irqsave(&table->cache_lock, flags); 403 __remove_rbio_from_cache(rbio); 404 spin_unlock_irqrestore(&table->cache_lock, flags); 405 } 406 407 /* 408 * remove everything in the cache 409 */ 410 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) 411 { 412 struct btrfs_stripe_hash_table *table; 413 unsigned long flags; 414 struct btrfs_raid_bio *rbio; 415 416 table = info->stripe_hash_table; 417 418 spin_lock_irqsave(&table->cache_lock, flags); 419 while (!list_empty(&table->stripe_cache)) { 420 rbio = list_entry(table->stripe_cache.next, 421 struct btrfs_raid_bio, 422 stripe_cache); 423 __remove_rbio_from_cache(rbio); 424 } 425 spin_unlock_irqrestore(&table->cache_lock, flags); 426 } 427 428 /* 429 * remove all cached entries and free the hash table 430 * used by unmount 431 */ 432 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 433 { 434 if (!info->stripe_hash_table) 435 return; 436 btrfs_clear_rbio_cache(info); 437 kvfree(info->stripe_hash_table); 438 info->stripe_hash_table = NULL; 439 } 440 441 /* 442 * insert an rbio into the stripe cache. It 443 * must have already been prepared by calling 444 * cache_rbio_pages 445 * 446 * If this rbio was already cached, it gets 447 * moved to the front of the lru. 448 * 449 * If the size of the rbio cache is too big, we 450 * prune an item. 451 */ 452 static void cache_rbio(struct btrfs_raid_bio *rbio) 453 { 454 struct btrfs_stripe_hash_table *table; 455 unsigned long flags; 456 457 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) 458 return; 459 460 table = rbio->fs_info->stripe_hash_table; 461 462 spin_lock_irqsave(&table->cache_lock, flags); 463 spin_lock(&rbio->bio_list_lock); 464 465 /* bump our ref if we were not in the list before */ 466 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) 467 refcount_inc(&rbio->refs); 468 469 if (!list_empty(&rbio->stripe_cache)){ 470 list_move(&rbio->stripe_cache, &table->stripe_cache); 471 } else { 472 list_add(&rbio->stripe_cache, &table->stripe_cache); 473 table->cache_size += 1; 474 } 475 476 spin_unlock(&rbio->bio_list_lock); 477 478 if (table->cache_size > RBIO_CACHE_SIZE) { 479 struct btrfs_raid_bio *found; 480 481 found = list_entry(table->stripe_cache.prev, 482 struct btrfs_raid_bio, 483 stripe_cache); 484 485 if (found != rbio) 486 __remove_rbio_from_cache(found); 487 } 488 489 spin_unlock_irqrestore(&table->cache_lock, flags); 490 } 491 492 /* 493 * helper function to run the xor_blocks api. It is only 494 * able to do MAX_XOR_BLOCKS at a time, so we need to 495 * loop through. 
496 */ 497 static void run_xor(void **pages, int src_cnt, ssize_t len) 498 { 499 int src_off = 0; 500 int xor_src_cnt = 0; 501 void *dest = pages[src_cnt]; 502 503 while(src_cnt > 0) { 504 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); 505 xor_blocks(xor_src_cnt, len, dest, pages + src_off); 506 507 src_cnt -= xor_src_cnt; 508 src_off += xor_src_cnt; 509 } 510 } 511 512 /* 513 * returns true if the bio list inside this rbio 514 * covers an entire stripe (no rmw required). 515 * Must be called with the bio list lock held, or 516 * at a time when you know it is impossible to add 517 * new bios into the list 518 */ 519 static int __rbio_is_full(struct btrfs_raid_bio *rbio) 520 { 521 unsigned long size = rbio->bio_list_bytes; 522 int ret = 1; 523 524 if (size != rbio->nr_data * rbio->stripe_len) 525 ret = 0; 526 527 BUG_ON(size > rbio->nr_data * rbio->stripe_len); 528 return ret; 529 } 530 531 static int rbio_is_full(struct btrfs_raid_bio *rbio) 532 { 533 unsigned long flags; 534 int ret; 535 536 spin_lock_irqsave(&rbio->bio_list_lock, flags); 537 ret = __rbio_is_full(rbio); 538 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 539 return ret; 540 } 541 542 /* 543 * returns 1 if it is safe to merge two rbios together. 544 * The merging is safe if the two rbios correspond to 545 * the same stripe and if they are both going in the same 546 * direction (read vs write), and if neither one is 547 * locked for final IO 548 * 549 * The caller is responsible for locking such that 550 * rmw_locked is safe to test 551 */ 552 static int rbio_can_merge(struct btrfs_raid_bio *last, 553 struct btrfs_raid_bio *cur) 554 { 555 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || 556 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) 557 return 0; 558 559 /* 560 * we can't merge with cached rbios, since the 561 * idea is that when we merge the destination 562 * rbio is going to run our IO for us. We can 563 * steal from cached rbios though, other functions 564 * handle that. 565 */ 566 if (test_bit(RBIO_CACHE_BIT, &last->flags) || 567 test_bit(RBIO_CACHE_BIT, &cur->flags)) 568 return 0; 569 570 if (last->bbio->raid_map[0] != 571 cur->bbio->raid_map[0]) 572 return 0; 573 574 /* we can't merge with different operations */ 575 if (last->operation != cur->operation) 576 return 0; 577 /* 578 * We've need read the full stripe from the drive. 579 * check and repair the parity and write the new results. 580 * 581 * We're not allowed to add any new bios to the 582 * bio list here, anyone else that wants to 583 * change this stripe needs to do their own rmw. 
584 */ 585 if (last->operation == BTRFS_RBIO_PARITY_SCRUB) 586 return 0; 587 588 if (last->operation == BTRFS_RBIO_REBUILD_MISSING) 589 return 0; 590 591 if (last->operation == BTRFS_RBIO_READ_REBUILD) { 592 int fa = last->faila; 593 int fb = last->failb; 594 int cur_fa = cur->faila; 595 int cur_fb = cur->failb; 596 597 if (last->faila >= last->failb) { 598 fa = last->failb; 599 fb = last->faila; 600 } 601 602 if (cur->faila >= cur->failb) { 603 cur_fa = cur->failb; 604 cur_fb = cur->faila; 605 } 606 607 if (fa != cur_fa || fb != cur_fb) 608 return 0; 609 } 610 return 1; 611 } 612 613 static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, 614 int index) 615 { 616 return stripe * rbio->stripe_npages + index; 617 } 618 619 /* 620 * these are just the pages from the rbio array, not from anything 621 * the FS sent down to us 622 */ 623 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, 624 int index) 625 { 626 return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; 627 } 628 629 /* 630 * helper to index into the pstripe 631 */ 632 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) 633 { 634 return rbio_stripe_page(rbio, rbio->nr_data, index); 635 } 636 637 /* 638 * helper to index into the qstripe, returns null 639 * if there is no qstripe 640 */ 641 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) 642 { 643 if (rbio->nr_data + 1 == rbio->real_stripes) 644 return NULL; 645 return rbio_stripe_page(rbio, rbio->nr_data + 1, index); 646 } 647 648 /* 649 * The first stripe in the table for a logical address 650 * has the lock. rbios are added in one of three ways: 651 * 652 * 1) Nobody has the stripe locked yet. The rbio is given 653 * the lock and 0 is returned. The caller must start the IO 654 * themselves. 655 * 656 * 2) Someone has the stripe locked, but we're able to merge 657 * with the lock owner. The rbio is freed and the IO will 658 * start automatically along with the existing rbio. 1 is returned. 659 * 660 * 3) Someone has the stripe locked, but we're not able to merge. 661 * The rbio is added to the lock owner's plug list, or merged into 662 * an rbio already on the plug list. When the lock owner unlocks, 663 * the next rbio on the list is run and the IO is started automatically. 664 * 1 is returned 665 * 666 * If we return 0, the caller still owns the rbio and must continue with 667 * IO submission. If we return 1, the caller must assume the rbio has 668 * already been freed. 669 */ 670 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 671 { 672 int bucket = rbio_bucket(rbio); 673 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; 674 struct btrfs_raid_bio *cur; 675 struct btrfs_raid_bio *pending; 676 unsigned long flags; 677 struct btrfs_raid_bio *freeit = NULL; 678 struct btrfs_raid_bio *cache_drop = NULL; 679 int ret = 0; 680 681 spin_lock_irqsave(&h->lock, flags); 682 list_for_each_entry(cur, &h->hash_list, hash_list) { 683 if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) { 684 spin_lock(&cur->bio_list_lock); 685 686 /* can we steal this cached rbio's pages? 
*/ 687 if (bio_list_empty(&cur->bio_list) && 688 list_empty(&cur->plug_list) && 689 test_bit(RBIO_CACHE_BIT, &cur->flags) && 690 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { 691 list_del_init(&cur->hash_list); 692 refcount_dec(&cur->refs); 693 694 steal_rbio(cur, rbio); 695 cache_drop = cur; 696 spin_unlock(&cur->bio_list_lock); 697 698 goto lockit; 699 } 700 701 /* can we merge into the lock owner? */ 702 if (rbio_can_merge(cur, rbio)) { 703 merge_rbio(cur, rbio); 704 spin_unlock(&cur->bio_list_lock); 705 freeit = rbio; 706 ret = 1; 707 goto out; 708 } 709 710 711 /* 712 * we couldn't merge with the running 713 * rbio, see if we can merge with the 714 * pending ones. We don't have to 715 * check for rmw_locked because there 716 * is no way they are inside finish_rmw 717 * right now 718 */ 719 list_for_each_entry(pending, &cur->plug_list, 720 plug_list) { 721 if (rbio_can_merge(pending, rbio)) { 722 merge_rbio(pending, rbio); 723 spin_unlock(&cur->bio_list_lock); 724 freeit = rbio; 725 ret = 1; 726 goto out; 727 } 728 } 729 730 /* no merging, put us on the tail of the plug list, 731 * our rbio will be started with the currently 732 * running rbio unlocks 733 */ 734 list_add_tail(&rbio->plug_list, &cur->plug_list); 735 spin_unlock(&cur->bio_list_lock); 736 ret = 1; 737 goto out; 738 } 739 } 740 lockit: 741 refcount_inc(&rbio->refs); 742 list_add(&rbio->hash_list, &h->hash_list); 743 out: 744 spin_unlock_irqrestore(&h->lock, flags); 745 if (cache_drop) 746 remove_rbio_from_cache(cache_drop); 747 if (freeit) 748 __free_raid_bio(freeit); 749 return ret; 750 } 751 752 /* 753 * called as rmw or parity rebuild is completed. If the plug list has more 754 * rbios waiting for this stripe, the next one on the list will be started 755 */ 756 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 757 { 758 int bucket; 759 struct btrfs_stripe_hash *h; 760 unsigned long flags; 761 int keep_cache = 0; 762 763 bucket = rbio_bucket(rbio); 764 h = rbio->fs_info->stripe_hash_table->table + bucket; 765 766 if (list_empty(&rbio->plug_list)) 767 cache_rbio(rbio); 768 769 spin_lock_irqsave(&h->lock, flags); 770 spin_lock(&rbio->bio_list_lock); 771 772 if (!list_empty(&rbio->hash_list)) { 773 /* 774 * if we're still cached and there is no other IO 775 * to perform, just leave this rbio here for others 776 * to steal from later 777 */ 778 if (list_empty(&rbio->plug_list) && 779 test_bit(RBIO_CACHE_BIT, &rbio->flags)) { 780 keep_cache = 1; 781 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 782 BUG_ON(!bio_list_empty(&rbio->bio_list)); 783 goto done; 784 } 785 786 list_del_init(&rbio->hash_list); 787 refcount_dec(&rbio->refs); 788 789 /* 790 * we use the plug list to hold all the rbios 791 * waiting for the chance to lock this stripe. 792 * hand the lock over to one of them. 
793 */ 794 if (!list_empty(&rbio->plug_list)) { 795 struct btrfs_raid_bio *next; 796 struct list_head *head = rbio->plug_list.next; 797 798 next = list_entry(head, struct btrfs_raid_bio, 799 plug_list); 800 801 list_del_init(&rbio->plug_list); 802 803 list_add(&next->hash_list, &h->hash_list); 804 refcount_inc(&next->refs); 805 spin_unlock(&rbio->bio_list_lock); 806 spin_unlock_irqrestore(&h->lock, flags); 807 808 if (next->operation == BTRFS_RBIO_READ_REBUILD) 809 async_read_rebuild(next); 810 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { 811 steal_rbio(rbio, next); 812 async_read_rebuild(next); 813 } else if (next->operation == BTRFS_RBIO_WRITE) { 814 steal_rbio(rbio, next); 815 async_rmw_stripe(next); 816 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { 817 steal_rbio(rbio, next); 818 async_scrub_parity(next); 819 } 820 821 goto done_nolock; 822 } 823 } 824 done: 825 spin_unlock(&rbio->bio_list_lock); 826 spin_unlock_irqrestore(&h->lock, flags); 827 828 done_nolock: 829 if (!keep_cache) 830 remove_rbio_from_cache(rbio); 831 } 832 833 static void __free_raid_bio(struct btrfs_raid_bio *rbio) 834 { 835 int i; 836 837 if (!refcount_dec_and_test(&rbio->refs)) 838 return; 839 840 WARN_ON(!list_empty(&rbio->stripe_cache)); 841 WARN_ON(!list_empty(&rbio->hash_list)); 842 WARN_ON(!bio_list_empty(&rbio->bio_list)); 843 844 for (i = 0; i < rbio->nr_pages; i++) { 845 if (rbio->stripe_pages[i]) { 846 __free_page(rbio->stripe_pages[i]); 847 rbio->stripe_pages[i] = NULL; 848 } 849 } 850 851 btrfs_put_bbio(rbio->bbio); 852 kfree(rbio); 853 } 854 855 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) 856 { 857 struct bio *next; 858 859 while (cur) { 860 next = cur->bi_next; 861 cur->bi_next = NULL; 862 cur->bi_status = err; 863 bio_endio(cur); 864 cur = next; 865 } 866 } 867 868 /* 869 * this frees the rbio and runs through all the bios in the 870 * bio_list and calls end_io on them 871 */ 872 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) 873 { 874 struct bio *cur = bio_list_get(&rbio->bio_list); 875 struct bio *extra; 876 877 if (rbio->generic_bio_cnt) 878 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); 879 880 /* 881 * At this moment, rbio->bio_list is empty, however since rbio does not 882 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the 883 * hash list, rbio may be merged with others so that rbio->bio_list 884 * becomes non-empty. 885 * Once unlock_stripe() is done, rbio->bio_list will not be updated any 886 * more and we can call bio_endio() on all queued bios. 887 */ 888 unlock_stripe(rbio); 889 extra = bio_list_get(&rbio->bio_list); 890 __free_raid_bio(rbio); 891 892 rbio_endio_bio_list(cur, err); 893 if (extra) 894 rbio_endio_bio_list(extra, err); 895 } 896 897 /* 898 * end io function used by finish_rmw. When we finally 899 * get here, we've written a full stripe 900 */ 901 static void raid_write_end_io(struct bio *bio) 902 { 903 struct btrfs_raid_bio *rbio = bio->bi_private; 904 blk_status_t err = bio->bi_status; 905 int max_errors; 906 907 if (err) 908 fail_bio_stripe(rbio, bio); 909 910 bio_put(bio); 911 912 if (!atomic_dec_and_test(&rbio->stripes_pending)) 913 return; 914 915 err = BLK_STS_OK; 916 917 /* OK, we have read all the stripes we need to. */ 918 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 
919 0 : rbio->bbio->max_errors; 920 if (atomic_read(&rbio->error) > max_errors) 921 err = BLK_STS_IOERR; 922 923 rbio_orig_end_io(rbio, err); 924 } 925 926 /* 927 * the read/modify/write code wants to use the original bio for 928 * any pages it included, and then use the rbio for everything 929 * else. This function decides if a given index (stripe number) 930 * and page number in that stripe fall inside the original bio 931 * or the rbio. 932 * 933 * if you set bio_list_only, you'll get a NULL back for any ranges 934 * that are outside the bio_list 935 * 936 * This doesn't take any refs on anything, you get a bare page pointer 937 * and the caller must bump refs as required. 938 * 939 * You must call index_rbio_pages once before you can trust 940 * the answers from this function. 941 */ 942 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, 943 int index, int pagenr, int bio_list_only) 944 { 945 int chunk_page; 946 struct page *p = NULL; 947 948 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; 949 950 spin_lock_irq(&rbio->bio_list_lock); 951 p = rbio->bio_pages[chunk_page]; 952 spin_unlock_irq(&rbio->bio_list_lock); 953 954 if (p || bio_list_only) 955 return p; 956 957 return rbio->stripe_pages[chunk_page]; 958 } 959 960 /* 961 * number of pages we need for the entire stripe across all the 962 * drives 963 */ 964 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) 965 { 966 return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes; 967 } 968 969 /* 970 * allocation and initial setup for the btrfs_raid_bio. Not 971 * this does not allocate any pages for rbio->pages. 972 */ 973 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, 974 struct btrfs_bio *bbio, 975 u64 stripe_len) 976 { 977 struct btrfs_raid_bio *rbio; 978 int nr_data = 0; 979 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; 980 int num_pages = rbio_nr_pages(stripe_len, real_stripes); 981 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); 982 void *p; 983 984 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + 985 DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) * 986 sizeof(long), GFP_NOFS); 987 if (!rbio) 988 return ERR_PTR(-ENOMEM); 989 990 bio_list_init(&rbio->bio_list); 991 INIT_LIST_HEAD(&rbio->plug_list); 992 spin_lock_init(&rbio->bio_list_lock); 993 INIT_LIST_HEAD(&rbio->stripe_cache); 994 INIT_LIST_HEAD(&rbio->hash_list); 995 rbio->bbio = bbio; 996 rbio->fs_info = fs_info; 997 rbio->stripe_len = stripe_len; 998 rbio->nr_pages = num_pages; 999 rbio->real_stripes = real_stripes; 1000 rbio->stripe_npages = stripe_npages; 1001 rbio->faila = -1; 1002 rbio->failb = -1; 1003 refcount_set(&rbio->refs, 1); 1004 atomic_set(&rbio->error, 0); 1005 atomic_set(&rbio->stripes_pending, 0); 1006 1007 /* 1008 * the stripe_pages and bio_pages array point to the extra 1009 * memory we allocated past the end of the rbio 1010 */ 1011 p = rbio + 1; 1012 rbio->stripe_pages = p; 1013 rbio->bio_pages = p + sizeof(struct page *) * num_pages; 1014 rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; 1015 1016 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) 1017 nr_data = real_stripes - 1; 1018 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) 1019 nr_data = real_stripes - 2; 1020 else 1021 BUG(); 1022 1023 rbio->nr_data = nr_data; 1024 return rbio; 1025 } 1026 1027 /* allocate pages for all the stripes in the bio, including parity */ 1028 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 1029 { 1030 int i; 1031 struct page *page; 1032 1033 
for (i = 0; i < rbio->nr_pages; i++) { 1034 if (rbio->stripe_pages[i]) 1035 continue; 1036 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1037 if (!page) 1038 return -ENOMEM; 1039 rbio->stripe_pages[i] = page; 1040 } 1041 return 0; 1042 } 1043 1044 /* only allocate pages for p/q stripes */ 1045 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 1046 { 1047 int i; 1048 struct page *page; 1049 1050 i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); 1051 1052 for (; i < rbio->nr_pages; i++) { 1053 if (rbio->stripe_pages[i]) 1054 continue; 1055 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1056 if (!page) 1057 return -ENOMEM; 1058 rbio->stripe_pages[i] = page; 1059 } 1060 return 0; 1061 } 1062 1063 /* 1064 * add a single page from a specific stripe into our list of bios for IO 1065 * this will try to merge into existing bios if possible, and returns 1066 * zero if all went well. 1067 */ 1068 static int rbio_add_io_page(struct btrfs_raid_bio *rbio, 1069 struct bio_list *bio_list, 1070 struct page *page, 1071 int stripe_nr, 1072 unsigned long page_index, 1073 unsigned long bio_max_len) 1074 { 1075 struct bio *last = bio_list->tail; 1076 u64 last_end = 0; 1077 int ret; 1078 struct bio *bio; 1079 struct btrfs_bio_stripe *stripe; 1080 u64 disk_start; 1081 1082 stripe = &rbio->bbio->stripes[stripe_nr]; 1083 disk_start = stripe->physical + (page_index << PAGE_SHIFT); 1084 1085 /* if the device is missing, just fail this stripe */ 1086 if (!stripe->dev->bdev) 1087 return fail_rbio_index(rbio, stripe_nr); 1088 1089 /* see if we can add this page onto our existing bio */ 1090 if (last) { 1091 last_end = (u64)last->bi_iter.bi_sector << 9; 1092 last_end += last->bi_iter.bi_size; 1093 1094 /* 1095 * we can't merge these if they are from different 1096 * devices or if they are not contiguous 1097 */ 1098 if (last_end == disk_start && stripe->dev->bdev && 1099 !last->bi_status && 1100 last->bi_disk == stripe->dev->bdev->bd_disk && 1101 last->bi_partno == stripe->dev->bdev->bd_partno) { 1102 ret = bio_add_page(last, page, PAGE_SIZE, 0); 1103 if (ret == PAGE_SIZE) 1104 return 0; 1105 } 1106 } 1107 1108 /* put a new bio on the list */ 1109 bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); 1110 bio->bi_iter.bi_size = 0; 1111 bio_set_dev(bio, stripe->dev->bdev); 1112 bio->bi_iter.bi_sector = disk_start >> 9; 1113 1114 bio_add_page(bio, page, PAGE_SIZE, 0); 1115 bio_list_add(bio_list, bio); 1116 return 0; 1117 } 1118 1119 /* 1120 * while we're doing the read/modify/write cycle, we could 1121 * have errors in reading pages off the disk. This checks 1122 * for errors and if we're not able to read the page it'll 1123 * trigger parity reconstruction. The rmw will be finished 1124 * after we've reconstructed the failed stripes 1125 */ 1126 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1127 { 1128 if (rbio->faila >= 0 || rbio->failb >= 0) { 1129 BUG_ON(rbio->faila == rbio->real_stripes - 1); 1130 __raid56_parity_recover(rbio); 1131 } else { 1132 finish_rmw(rbio); 1133 } 1134 } 1135 1136 /* 1137 * helper function to walk our bio list and populate the bio_pages array with 1138 * the result. This seems expensive, but it is faster than constantly 1139 * searching through the bio list as we setup the IO in finish_rmw or stripe 1140 * reconstruction. 
1141 * 1142 * This must be called before you trust the answers from page_in_rbio 1143 */ 1144 static void index_rbio_pages(struct btrfs_raid_bio *rbio) 1145 { 1146 struct bio *bio; 1147 u64 start; 1148 unsigned long stripe_offset; 1149 unsigned long page_index; 1150 1151 spin_lock_irq(&rbio->bio_list_lock); 1152 bio_list_for_each(bio, &rbio->bio_list) { 1153 struct bio_vec bvec; 1154 struct bvec_iter iter; 1155 int i = 0; 1156 1157 start = (u64)bio->bi_iter.bi_sector << 9; 1158 stripe_offset = start - rbio->bbio->raid_map[0]; 1159 page_index = stripe_offset >> PAGE_SHIFT; 1160 1161 if (bio_flagged(bio, BIO_CLONED)) 1162 bio->bi_iter = btrfs_io_bio(bio)->iter; 1163 1164 bio_for_each_segment(bvec, bio, iter) { 1165 rbio->bio_pages[page_index + i] = bvec.bv_page; 1166 i++; 1167 } 1168 } 1169 spin_unlock_irq(&rbio->bio_list_lock); 1170 } 1171 1172 /* 1173 * this is called from one of two situations. We either 1174 * have a full stripe from the higher layers, or we've read all 1175 * the missing bits off disk. 1176 * 1177 * This will calculate the parity and then send down any 1178 * changed blocks. 1179 */ 1180 static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1181 { 1182 struct btrfs_bio *bbio = rbio->bbio; 1183 void *pointers[rbio->real_stripes]; 1184 int nr_data = rbio->nr_data; 1185 int stripe; 1186 int pagenr; 1187 int p_stripe = -1; 1188 int q_stripe = -1; 1189 struct bio_list bio_list; 1190 struct bio *bio; 1191 int ret; 1192 1193 bio_list_init(&bio_list); 1194 1195 if (rbio->real_stripes - rbio->nr_data == 1) { 1196 p_stripe = rbio->real_stripes - 1; 1197 } else if (rbio->real_stripes - rbio->nr_data == 2) { 1198 p_stripe = rbio->real_stripes - 2; 1199 q_stripe = rbio->real_stripes - 1; 1200 } else { 1201 BUG(); 1202 } 1203 1204 /* at this point we either have a full stripe, 1205 * or we've read the full stripe from the drive. 1206 * recalculate the parity and write the new results. 1207 * 1208 * We're not allowed to add any new bios to the 1209 * bio list here, anyone else that wants to 1210 * change this stripe needs to do their own rmw. 1211 */ 1212 spin_lock_irq(&rbio->bio_list_lock); 1213 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1214 spin_unlock_irq(&rbio->bio_list_lock); 1215 1216 atomic_set(&rbio->error, 0); 1217 1218 /* 1219 * now that we've set rmw_locked, run through the 1220 * bio list one last time and map the page pointers 1221 * 1222 * We don't cache full rbios because we're assuming 1223 * the higher layers are unlikely to use this area of 1224 * the disk again soon. If they do use it again, 1225 * hopefully they will send another full bio. 
1226 */ 1227 index_rbio_pages(rbio); 1228 if (!rbio_is_full(rbio)) 1229 cache_rbio_pages(rbio); 1230 else 1231 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1232 1233 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1234 struct page *p; 1235 /* first collect one page from each data stripe */ 1236 for (stripe = 0; stripe < nr_data; stripe++) { 1237 p = page_in_rbio(rbio, stripe, pagenr, 0); 1238 pointers[stripe] = kmap(p); 1239 } 1240 1241 /* then add the parity stripe */ 1242 p = rbio_pstripe_page(rbio, pagenr); 1243 SetPageUptodate(p); 1244 pointers[stripe++] = kmap(p); 1245 1246 if (q_stripe != -1) { 1247 1248 /* 1249 * raid6, add the qstripe and call the 1250 * library function to fill in our p/q 1251 */ 1252 p = rbio_qstripe_page(rbio, pagenr); 1253 SetPageUptodate(p); 1254 pointers[stripe++] = kmap(p); 1255 1256 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 1257 pointers); 1258 } else { 1259 /* raid5 */ 1260 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); 1261 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); 1262 } 1263 1264 1265 for (stripe = 0; stripe < rbio->real_stripes; stripe++) 1266 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 1267 } 1268 1269 /* 1270 * time to start writing. Make bios for everything from the 1271 * higher layers (the bio_list in our rbio) and our p/q. Ignore 1272 * everything else. 1273 */ 1274 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1275 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1276 struct page *page; 1277 if (stripe < rbio->nr_data) { 1278 page = page_in_rbio(rbio, stripe, pagenr, 1); 1279 if (!page) 1280 continue; 1281 } else { 1282 page = rbio_stripe_page(rbio, stripe, pagenr); 1283 } 1284 1285 ret = rbio_add_io_page(rbio, &bio_list, 1286 page, stripe, pagenr, rbio->stripe_len); 1287 if (ret) 1288 goto cleanup; 1289 } 1290 } 1291 1292 if (likely(!bbio->num_tgtdevs)) 1293 goto write_data; 1294 1295 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1296 if (!bbio->tgtdev_map[stripe]) 1297 continue; 1298 1299 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1300 struct page *page; 1301 if (stripe < rbio->nr_data) { 1302 page = page_in_rbio(rbio, stripe, pagenr, 1); 1303 if (!page) 1304 continue; 1305 } else { 1306 page = rbio_stripe_page(rbio, stripe, pagenr); 1307 } 1308 1309 ret = rbio_add_io_page(rbio, &bio_list, page, 1310 rbio->bbio->tgtdev_map[stripe], 1311 pagenr, rbio->stripe_len); 1312 if (ret) 1313 goto cleanup; 1314 } 1315 } 1316 1317 write_data: 1318 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); 1319 BUG_ON(atomic_read(&rbio->stripes_pending) == 0); 1320 1321 while (1) { 1322 bio = bio_list_pop(&bio_list); 1323 if (!bio) 1324 break; 1325 1326 bio->bi_private = rbio; 1327 bio->bi_end_io = raid_write_end_io; 1328 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 1329 1330 submit_bio(bio); 1331 } 1332 return; 1333 1334 cleanup: 1335 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1336 1337 while ((bio = bio_list_pop(&bio_list))) 1338 bio_put(bio); 1339 } 1340 1341 /* 1342 * helper to find the stripe number for a given bio. Used to figure out which 1343 * stripe has failed. This expects the bio to correspond to a physical disk, 1344 * so it looks up based on physical sector numbers. 
1345 */ 1346 static int find_bio_stripe(struct btrfs_raid_bio *rbio, 1347 struct bio *bio) 1348 { 1349 u64 physical = bio->bi_iter.bi_sector; 1350 u64 stripe_start; 1351 int i; 1352 struct btrfs_bio_stripe *stripe; 1353 1354 physical <<= 9; 1355 1356 for (i = 0; i < rbio->bbio->num_stripes; i++) { 1357 stripe = &rbio->bbio->stripes[i]; 1358 stripe_start = stripe->physical; 1359 if (physical >= stripe_start && 1360 physical < stripe_start + rbio->stripe_len && 1361 stripe->dev->bdev && 1362 bio->bi_disk == stripe->dev->bdev->bd_disk && 1363 bio->bi_partno == stripe->dev->bdev->bd_partno) { 1364 return i; 1365 } 1366 } 1367 return -1; 1368 } 1369 1370 /* 1371 * helper to find the stripe number for a given 1372 * bio (before mapping). Used to figure out which stripe has 1373 * failed. This looks up based on logical block numbers. 1374 */ 1375 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 1376 struct bio *bio) 1377 { 1378 u64 logical = bio->bi_iter.bi_sector; 1379 u64 stripe_start; 1380 int i; 1381 1382 logical <<= 9; 1383 1384 for (i = 0; i < rbio->nr_data; i++) { 1385 stripe_start = rbio->bbio->raid_map[i]; 1386 if (logical >= stripe_start && 1387 logical < stripe_start + rbio->stripe_len) { 1388 return i; 1389 } 1390 } 1391 return -1; 1392 } 1393 1394 /* 1395 * returns -EIO if we had too many failures 1396 */ 1397 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 1398 { 1399 unsigned long flags; 1400 int ret = 0; 1401 1402 spin_lock_irqsave(&rbio->bio_list_lock, flags); 1403 1404 /* we already know this stripe is bad, move on */ 1405 if (rbio->faila == failed || rbio->failb == failed) 1406 goto out; 1407 1408 if (rbio->faila == -1) { 1409 /* first failure on this rbio */ 1410 rbio->faila = failed; 1411 atomic_inc(&rbio->error); 1412 } else if (rbio->failb == -1) { 1413 /* second failure on this rbio */ 1414 rbio->failb = failed; 1415 atomic_inc(&rbio->error); 1416 } else { 1417 ret = -EIO; 1418 } 1419 out: 1420 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 1421 1422 return ret; 1423 } 1424 1425 /* 1426 * helper to fail a stripe based on a physical disk 1427 * bio. 1428 */ 1429 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 1430 struct bio *bio) 1431 { 1432 int failed = find_bio_stripe(rbio, bio); 1433 1434 if (failed < 0) 1435 return -EIO; 1436 1437 return fail_rbio_index(rbio, failed); 1438 } 1439 1440 /* 1441 * this sets each page in the bio uptodate. It should only be used on private 1442 * rbio pages, nothing that comes in from the higher layers 1443 */ 1444 static void set_bio_pages_uptodate(struct bio *bio) 1445 { 1446 struct bio_vec *bvec; 1447 int i; 1448 1449 ASSERT(!bio_flagged(bio, BIO_CLONED)); 1450 1451 bio_for_each_segment_all(bvec, bio, i) 1452 SetPageUptodate(bvec->bv_page); 1453 } 1454 1455 /* 1456 * end io for the read phase of the rmw cycle. All the bios here are physical 1457 * stripe bios we've read from the disk so we can recalculate the parity of the 1458 * stripe. 
1459 * 1460 * This will usually kick off finish_rmw once all the bios are read in, but it 1461 * may trigger parity reconstruction if we had any errors along the way 1462 */ 1463 static void raid_rmw_end_io(struct bio *bio) 1464 { 1465 struct btrfs_raid_bio *rbio = bio->bi_private; 1466 1467 if (bio->bi_status) 1468 fail_bio_stripe(rbio, bio); 1469 else 1470 set_bio_pages_uptodate(bio); 1471 1472 bio_put(bio); 1473 1474 if (!atomic_dec_and_test(&rbio->stripes_pending)) 1475 return; 1476 1477 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 1478 goto cleanup; 1479 1480 /* 1481 * this will normally call finish_rmw to start our write 1482 * but if there are any failed stripes we'll reconstruct 1483 * from parity first 1484 */ 1485 validate_rbio_for_rmw(rbio); 1486 return; 1487 1488 cleanup: 1489 1490 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1491 } 1492 1493 static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1494 { 1495 btrfs_init_work(&rbio->work, btrfs_rmw_helper, rmw_work, NULL, NULL); 1496 btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); 1497 } 1498 1499 static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1500 { 1501 btrfs_init_work(&rbio->work, btrfs_rmw_helper, 1502 read_rebuild_work, NULL, NULL); 1503 1504 btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); 1505 } 1506 1507 /* 1508 * the stripe must be locked by the caller. It will 1509 * unlock after all the writes are done 1510 */ 1511 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1512 { 1513 int bios_to_read = 0; 1514 struct bio_list bio_list; 1515 int ret; 1516 int pagenr; 1517 int stripe; 1518 struct bio *bio; 1519 1520 bio_list_init(&bio_list); 1521 1522 ret = alloc_rbio_pages(rbio); 1523 if (ret) 1524 goto cleanup; 1525 1526 index_rbio_pages(rbio); 1527 1528 atomic_set(&rbio->error, 0); 1529 /* 1530 * build a list of bios to read all the missing parts of this 1531 * stripe 1532 */ 1533 for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1534 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1535 struct page *page; 1536 /* 1537 * we want to find all the pages missing from 1538 * the rbio and read them from the disk. If 1539 * page_in_rbio finds a page in the bio list 1540 * we don't need to read it off the stripe. 1541 */ 1542 page = page_in_rbio(rbio, stripe, pagenr, 1); 1543 if (page) 1544 continue; 1545 1546 page = rbio_stripe_page(rbio, stripe, pagenr); 1547 /* 1548 * the bio cache may have handed us an uptodate 1549 * page. If so, be happy and use it 1550 */ 1551 if (PageUptodate(page)) 1552 continue; 1553 1554 ret = rbio_add_io_page(rbio, &bio_list, page, 1555 stripe, pagenr, rbio->stripe_len); 1556 if (ret) 1557 goto cleanup; 1558 } 1559 } 1560 1561 bios_to_read = bio_list_size(&bio_list); 1562 if (!bios_to_read) { 1563 /* 1564 * this can happen if others have merged with 1565 * us, it means there is nothing left to read. 1566 * But if there are missing devices it may not be 1567 * safe to do the full stripe write yet. 1568 */ 1569 goto finish; 1570 } 1571 1572 /* 1573 * the bbio may be freed once we submit the last bio. 
Make sure 1574 * not to touch it after that 1575 */ 1576 atomic_set(&rbio->stripes_pending, bios_to_read); 1577 while (1) { 1578 bio = bio_list_pop(&bio_list); 1579 if (!bio) 1580 break; 1581 1582 bio->bi_private = rbio; 1583 bio->bi_end_io = raid_rmw_end_io; 1584 bio_set_op_attrs(bio, REQ_OP_READ, 0); 1585 1586 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 1587 1588 submit_bio(bio); 1589 } 1590 /* the actual write will happen once the reads are done */ 1591 return 0; 1592 1593 cleanup: 1594 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1595 1596 while ((bio = bio_list_pop(&bio_list))) 1597 bio_put(bio); 1598 1599 return -EIO; 1600 1601 finish: 1602 validate_rbio_for_rmw(rbio); 1603 return 0; 1604 } 1605 1606 /* 1607 * if the upper layers pass in a full stripe, we thank them by only allocating 1608 * enough pages to hold the parity, and sending it all down quickly. 1609 */ 1610 static int full_stripe_write(struct btrfs_raid_bio *rbio) 1611 { 1612 int ret; 1613 1614 ret = alloc_rbio_parity_pages(rbio); 1615 if (ret) { 1616 __free_raid_bio(rbio); 1617 return ret; 1618 } 1619 1620 ret = lock_stripe_add(rbio); 1621 if (ret == 0) 1622 finish_rmw(rbio); 1623 return 0; 1624 } 1625 1626 /* 1627 * partial stripe writes get handed over to async helpers. 1628 * We're really hoping to merge a few more writes into this 1629 * rbio before calculating new parity 1630 */ 1631 static int partial_stripe_write(struct btrfs_raid_bio *rbio) 1632 { 1633 int ret; 1634 1635 ret = lock_stripe_add(rbio); 1636 if (ret == 0) 1637 async_rmw_stripe(rbio); 1638 return 0; 1639 } 1640 1641 /* 1642 * sometimes while we were reading from the drive to 1643 * recalculate parity, enough new bios come into create 1644 * a full stripe. So we do a check here to see if we can 1645 * go directly to finish_rmw 1646 */ 1647 static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 1648 { 1649 /* head off into rmw land if we don't have a full stripe */ 1650 if (!rbio_is_full(rbio)) 1651 return partial_stripe_write(rbio); 1652 return full_stripe_write(rbio); 1653 } 1654 1655 /* 1656 * We use plugging call backs to collect full stripes. 1657 * Any time we get a partial stripe write while plugged 1658 * we collect it into a list. When the unplug comes down, 1659 * we sort the list by logical block number and merge 1660 * everything we can into the same rbios 1661 */ 1662 struct btrfs_plug_cb { 1663 struct blk_plug_cb cb; 1664 struct btrfs_fs_info *info; 1665 struct list_head rbio_list; 1666 struct btrfs_work work; 1667 }; 1668 1669 /* 1670 * rbios on the plug list are sorted for easier merging. 1671 */ 1672 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) 1673 { 1674 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 1675 plug_list); 1676 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 1677 plug_list); 1678 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 1679 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 1680 1681 if (a_sector < b_sector) 1682 return -1; 1683 if (a_sector > b_sector) 1684 return 1; 1685 return 0; 1686 } 1687 1688 static void run_plug(struct btrfs_plug_cb *plug) 1689 { 1690 struct btrfs_raid_bio *cur; 1691 struct btrfs_raid_bio *last = NULL; 1692 1693 /* 1694 * sort our plug list then try to merge 1695 * everything we can in hopes of creating full 1696 * stripes. 
1697 */ 1698 list_sort(NULL, &plug->rbio_list, plug_cmp); 1699 while (!list_empty(&plug->rbio_list)) { 1700 cur = list_entry(plug->rbio_list.next, 1701 struct btrfs_raid_bio, plug_list); 1702 list_del_init(&cur->plug_list); 1703 1704 if (rbio_is_full(cur)) { 1705 /* we have a full stripe, send it down */ 1706 full_stripe_write(cur); 1707 continue; 1708 } 1709 if (last) { 1710 if (rbio_can_merge(last, cur)) { 1711 merge_rbio(last, cur); 1712 __free_raid_bio(cur); 1713 continue; 1714 1715 } 1716 __raid56_parity_write(last); 1717 } 1718 last = cur; 1719 } 1720 if (last) { 1721 __raid56_parity_write(last); 1722 } 1723 kfree(plug); 1724 } 1725 1726 /* 1727 * if the unplug comes from schedule, we have to push the 1728 * work off to a helper thread 1729 */ 1730 static void unplug_work(struct btrfs_work *work) 1731 { 1732 struct btrfs_plug_cb *plug; 1733 plug = container_of(work, struct btrfs_plug_cb, work); 1734 run_plug(plug); 1735 } 1736 1737 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 1738 { 1739 struct btrfs_plug_cb *plug; 1740 plug = container_of(cb, struct btrfs_plug_cb, cb); 1741 1742 if (from_schedule) { 1743 btrfs_init_work(&plug->work, btrfs_rmw_helper, 1744 unplug_work, NULL, NULL); 1745 btrfs_queue_work(plug->info->rmw_workers, 1746 &plug->work); 1747 return; 1748 } 1749 run_plug(plug); 1750 } 1751 1752 /* 1753 * our main entry point for writes from the rest of the FS. 1754 */ 1755 int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, 1756 struct btrfs_bio *bbio, u64 stripe_len) 1757 { 1758 struct btrfs_raid_bio *rbio; 1759 struct btrfs_plug_cb *plug = NULL; 1760 struct blk_plug_cb *cb; 1761 int ret; 1762 1763 rbio = alloc_rbio(fs_info, bbio, stripe_len); 1764 if (IS_ERR(rbio)) { 1765 btrfs_put_bbio(bbio); 1766 return PTR_ERR(rbio); 1767 } 1768 bio_list_add(&rbio->bio_list, bio); 1769 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1770 rbio->operation = BTRFS_RBIO_WRITE; 1771 1772 btrfs_bio_counter_inc_noblocked(fs_info); 1773 rbio->generic_bio_cnt = 1; 1774 1775 /* 1776 * don't plug on full rbios, just get them out the door 1777 * as quickly as we can 1778 */ 1779 if (rbio_is_full(rbio)) { 1780 ret = full_stripe_write(rbio); 1781 if (ret) 1782 btrfs_bio_counter_dec(fs_info); 1783 return ret; 1784 } 1785 1786 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); 1787 if (cb) { 1788 plug = container_of(cb, struct btrfs_plug_cb, cb); 1789 if (!plug->info) { 1790 plug->info = fs_info; 1791 INIT_LIST_HEAD(&plug->rbio_list); 1792 } 1793 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1794 ret = 0; 1795 } else { 1796 ret = __raid56_parity_write(rbio); 1797 if (ret) 1798 btrfs_bio_counter_dec(fs_info); 1799 } 1800 return ret; 1801 } 1802 1803 /* 1804 * all parity reconstruction happens here. We've read in everything 1805 * we can find from the drives and this does the heavy lifting of 1806 * sorting the good from the bad. 
1807 */ 1808 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 1809 { 1810 int pagenr, stripe; 1811 void **pointers; 1812 int faila = -1, failb = -1; 1813 struct page *page; 1814 blk_status_t err; 1815 int i; 1816 1817 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1818 if (!pointers) { 1819 err = BLK_STS_RESOURCE; 1820 goto cleanup_io; 1821 } 1822 1823 faila = rbio->faila; 1824 failb = rbio->failb; 1825 1826 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1827 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 1828 spin_lock_irq(&rbio->bio_list_lock); 1829 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1830 spin_unlock_irq(&rbio->bio_list_lock); 1831 } 1832 1833 index_rbio_pages(rbio); 1834 1835 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 1836 /* 1837 * Now we just use bitmap to mark the horizontal stripes in 1838 * which we have data when doing parity scrub. 1839 */ 1840 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 1841 !test_bit(pagenr, rbio->dbitmap)) 1842 continue; 1843 1844 /* setup our array of pointers with pages 1845 * from each stripe 1846 */ 1847 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1848 /* 1849 * if we're rebuilding a read, we have to use 1850 * pages from the bio list 1851 */ 1852 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1853 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 1854 (stripe == faila || stripe == failb)) { 1855 page = page_in_rbio(rbio, stripe, pagenr, 0); 1856 } else { 1857 page = rbio_stripe_page(rbio, stripe, pagenr); 1858 } 1859 pointers[stripe] = kmap(page); 1860 } 1861 1862 /* all raid6 handling here */ 1863 if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { 1864 /* 1865 * single failure, rebuild from parity raid5 1866 * style 1867 */ 1868 if (failb < 0) { 1869 if (faila == rbio->nr_data) { 1870 /* 1871 * Just the P stripe has failed, without 1872 * a bad data or Q stripe. 1873 * TODO, we should redo the xor here. 1874 */ 1875 err = BLK_STS_IOERR; 1876 goto cleanup; 1877 } 1878 /* 1879 * a single failure in raid6 is rebuilt 1880 * in the pstripe code below 1881 */ 1882 goto pstripe; 1883 } 1884 1885 /* make sure our ps and qs are in order */ 1886 if (faila > failb) { 1887 int tmp = failb; 1888 failb = faila; 1889 faila = tmp; 1890 } 1891 1892 /* if the q stripe is failed, do a pstripe reconstruction 1893 * from the xors. 1894 * If both the q stripe and the P stripe are failed, we're 1895 * here due to a crc mismatch and we can't give them the 1896 * data they want 1897 */ 1898 if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { 1899 if (rbio->bbio->raid_map[faila] == 1900 RAID5_P_STRIPE) { 1901 err = BLK_STS_IOERR; 1902 goto cleanup; 1903 } 1904 /* 1905 * otherwise we have one bad data stripe and 1906 * a good P stripe. raid5! 
1907 */ 1908 goto pstripe; 1909 } 1910 1911 if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { 1912 raid6_datap_recov(rbio->real_stripes, 1913 PAGE_SIZE, faila, pointers); 1914 } else { 1915 raid6_2data_recov(rbio->real_stripes, 1916 PAGE_SIZE, faila, failb, 1917 pointers); 1918 } 1919 } else { 1920 void *p; 1921 1922 /* rebuild from P stripe here (raid5 or raid6) */ 1923 BUG_ON(failb != -1); 1924 pstripe: 1925 /* Copy parity block into failed block to start with */ 1926 memcpy(pointers[faila], 1927 pointers[rbio->nr_data], 1928 PAGE_SIZE); 1929 1930 /* rearrange the pointer array */ 1931 p = pointers[faila]; 1932 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) 1933 pointers[stripe] = pointers[stripe + 1]; 1934 pointers[rbio->nr_data - 1] = p; 1935 1936 /* xor in the rest */ 1937 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE); 1938 } 1939 /* if we're doing this rebuild as part of an rmw, go through 1940 * and set all of our private rbio pages in the 1941 * failed stripes as uptodate. This way finish_rmw will 1942 * know they can be trusted. If this was a read reconstruction, 1943 * other endio functions will fiddle the uptodate bits 1944 */ 1945 if (rbio->operation == BTRFS_RBIO_WRITE) { 1946 for (i = 0; i < rbio->stripe_npages; i++) { 1947 if (faila != -1) { 1948 page = rbio_stripe_page(rbio, faila, i); 1949 SetPageUptodate(page); 1950 } 1951 if (failb != -1) { 1952 page = rbio_stripe_page(rbio, failb, i); 1953 SetPageUptodate(page); 1954 } 1955 } 1956 } 1957 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1958 /* 1959 * if we're rebuilding a read, we have to use 1960 * pages from the bio list 1961 */ 1962 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1963 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 1964 (stripe == faila || stripe == failb)) { 1965 page = page_in_rbio(rbio, stripe, pagenr, 0); 1966 } else { 1967 page = rbio_stripe_page(rbio, stripe, pagenr); 1968 } 1969 kunmap(page); 1970 } 1971 } 1972 1973 err = BLK_STS_OK; 1974 cleanup: 1975 kfree(pointers); 1976 1977 cleanup_io: 1978 /* 1979 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a 1980 * valid rbio which is consistent with ondisk content, thus such a 1981 * valid rbio can be cached to avoid further disk reads. 1982 */ 1983 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1984 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 1985 /* 1986 * - In case of two failures, where rbio->failb != -1: 1987 * 1988 * Do not cache this rbio since the above read reconstruction 1989 * (raid6_datap_recov() or raid6_2data_recov()) may have 1990 * changed some content of stripes which are not identical to 1991 * on-disk content any more, otherwise, a later write/recover 1992 * may steal stripe_pages from this rbio and end up with 1993 * corruptions or rebuild failures. 1994 * 1995 * - In case of single failure, where rbio->failb == -1: 1996 * 1997 * Cache this rbio iff the above read reconstruction is 1998 * excuted without problems. 
1999 */ 2000 if (err == BLK_STS_OK && rbio->failb < 0) 2001 cache_rbio_pages(rbio); 2002 else 2003 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2004 2005 rbio_orig_end_io(rbio, err); 2006 } else if (err == BLK_STS_OK) { 2007 rbio->faila = -1; 2008 rbio->failb = -1; 2009 2010 if (rbio->operation == BTRFS_RBIO_WRITE) 2011 finish_rmw(rbio); 2012 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) 2013 finish_parity_scrub(rbio, 0); 2014 else 2015 BUG(); 2016 } else { 2017 rbio_orig_end_io(rbio, err); 2018 } 2019 } 2020 2021 /* 2022 * This is called only for stripes we've read from disk to 2023 * reconstruct the parity. 2024 */ 2025 static void raid_recover_end_io(struct bio *bio) 2026 { 2027 struct btrfs_raid_bio *rbio = bio->bi_private; 2028 2029 /* 2030 * we only read stripe pages off the disk, set them 2031 * up to date if there were no errors 2032 */ 2033 if (bio->bi_status) 2034 fail_bio_stripe(rbio, bio); 2035 else 2036 set_bio_pages_uptodate(bio); 2037 bio_put(bio); 2038 2039 if (!atomic_dec_and_test(&rbio->stripes_pending)) 2040 return; 2041 2042 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 2043 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2044 else 2045 __raid_recover_end_io(rbio); 2046 } 2047 2048 /* 2049 * reads everything we need off the disk to reconstruct 2050 * the parity. endio handlers trigger final reconstruction 2051 * when the IO is done. 2052 * 2053 * This is used both for reads from the higher layers and for 2054 * parity construction required to finish a rmw cycle. 2055 */ 2056 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) 2057 { 2058 int bios_to_read = 0; 2059 struct bio_list bio_list; 2060 int ret; 2061 int pagenr; 2062 int stripe; 2063 struct bio *bio; 2064 2065 bio_list_init(&bio_list); 2066 2067 ret = alloc_rbio_pages(rbio); 2068 if (ret) 2069 goto cleanup; 2070 2071 atomic_set(&rbio->error, 0); 2072 2073 /* 2074 * read everything that hasn't failed. Thanks to the 2075 * stripe cache, it is possible that some or all of these 2076 * pages are going to be uptodate. 2077 */ 2078 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 2079 if (rbio->faila == stripe || rbio->failb == stripe) { 2080 atomic_inc(&rbio->error); 2081 continue; 2082 } 2083 2084 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { 2085 struct page *p; 2086 2087 /* 2088 * the rmw code may have already read this 2089 * page in 2090 */ 2091 p = rbio_stripe_page(rbio, stripe, pagenr); 2092 if (PageUptodate(p)) 2093 continue; 2094 2095 ret = rbio_add_io_page(rbio, &bio_list, 2096 rbio_stripe_page(rbio, stripe, pagenr), 2097 stripe, pagenr, rbio->stripe_len); 2098 if (ret < 0) 2099 goto cleanup; 2100 } 2101 } 2102 2103 bios_to_read = bio_list_size(&bio_list); 2104 if (!bios_to_read) { 2105 /* 2106 * we might have no bios to read just because the pages 2107 * were up to date, or we might have no bios to read because 2108 * the devices were gone. 2109 */ 2110 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { 2111 __raid_recover_end_io(rbio); 2112 goto out; 2113 } else { 2114 goto cleanup; 2115 } 2116 } 2117 2118 /* 2119 * the bbio may be freed once we submit the last bio. 
Make sure 2120 * not to touch it after that 2121 */ 2122 atomic_set(&rbio->stripes_pending, bios_to_read); 2123 while (1) { 2124 bio = bio_list_pop(&bio_list); 2125 if (!bio) 2126 break; 2127 2128 bio->bi_private = rbio; 2129 bio->bi_end_io = raid_recover_end_io; 2130 bio_set_op_attrs(bio, REQ_OP_READ, 0); 2131 2132 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 2133 2134 submit_bio(bio); 2135 } 2136 out: 2137 return 0; 2138 2139 cleanup: 2140 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 2141 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) 2142 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2143 2144 while ((bio = bio_list_pop(&bio_list))) 2145 bio_put(bio); 2146 2147 return -EIO; 2148 } 2149 2150 /* 2151 * the main entry point for reads from the higher layers. This 2152 * is really only called when the normal read path had a failure, 2153 * so we assume the bio they send down corresponds to a failed part 2154 * of the drive. 2155 */ 2156 int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, 2157 struct btrfs_bio *bbio, u64 stripe_len, 2158 int mirror_num, int generic_io) 2159 { 2160 struct btrfs_raid_bio *rbio; 2161 int ret; 2162 2163 if (generic_io) { 2164 ASSERT(bbio->mirror_num == mirror_num); 2165 btrfs_io_bio(bio)->mirror_num = mirror_num; 2166 } 2167 2168 rbio = alloc_rbio(fs_info, bbio, stripe_len); 2169 if (IS_ERR(rbio)) { 2170 if (generic_io) 2171 btrfs_put_bbio(bbio); 2172 return PTR_ERR(rbio); 2173 } 2174 2175 rbio->operation = BTRFS_RBIO_READ_REBUILD; 2176 bio_list_add(&rbio->bio_list, bio); 2177 rbio->bio_list_bytes = bio->bi_iter.bi_size; 2178 2179 rbio->faila = find_logical_bio_stripe(rbio, bio); 2180 if (rbio->faila == -1) { 2181 btrfs_warn(fs_info, 2182 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)", 2183 __func__, (u64)bio->bi_iter.bi_sector << 9, 2184 (u64)bio->bi_iter.bi_size, bbio->map_type); 2185 if (generic_io) 2186 btrfs_put_bbio(bbio); 2187 kfree(rbio); 2188 return -EIO; 2189 } 2190 2191 if (generic_io) { 2192 btrfs_bio_counter_inc_noblocked(fs_info); 2193 rbio->generic_bio_cnt = 1; 2194 } else { 2195 btrfs_get_bbio(bbio); 2196 } 2197 2198 /* 2199 * Loop retry: 2200 * for 'mirror == 2', reconstruct from all other stripes. 2201 * for 'mirror_num > 2', select a stripe to fail on every retry. 2202 */ 2203 if (mirror_num > 2) { 2204 /* 2205 * 'mirror == 3' is to fail the p stripe and 2206 * reconstruct from the q stripe. 'mirror > 3' is to 2207 * fail a data stripe and reconstruct from p+q stripe. 2208 */ 2209 rbio->failb = rbio->real_stripes - (mirror_num - 1); 2210 ASSERT(rbio->failb > 0); 2211 if (rbio->failb <= rbio->faila) 2212 rbio->failb--; 2213 } 2214 2215 ret = lock_stripe_add(rbio); 2216 2217 /* 2218 * __raid56_parity_recover will end the bio with 2219 * any errors it hits. 
We don't want to return 2220 * its error value up the stack because our caller 2221 * will end up calling bio_endio with any nonzero 2222 * return value 2223 */ 2224 if (ret == 0) 2225 __raid56_parity_recover(rbio); 2226 /* 2227 * our rbio has been added to the list of 2228 * rbios that will be handled after the 2229 * current lock owner is done 2230 */ 2231 return 0; 2232 2233 } 2234 2235 static void rmw_work(struct btrfs_work *work) 2236 { 2237 struct btrfs_raid_bio *rbio; 2238 2239 rbio = container_of(work, struct btrfs_raid_bio, work); 2240 raid56_rmw_stripe(rbio); 2241 } 2242 2243 static void read_rebuild_work(struct btrfs_work *work) 2244 { 2245 struct btrfs_raid_bio *rbio; 2246 2247 rbio = container_of(work, struct btrfs_raid_bio, work); 2248 __raid56_parity_recover(rbio); 2249 } 2250 2251 /* 2252 * The following code is used to scrub/replace the parity stripe 2253 * 2254 * Caller must have already increased bio_counter for getting @bbio. 2255 * 2256 * Note: We need to make sure that all the pages added to the scrub/replace 2257 * raid bio are correct and will not be changed during the scrub/replace, i.e. 2258 * those pages hold only metadata or file data protected by a checksum. 2259 */ 2260 2261 struct btrfs_raid_bio * 2262 raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, 2263 struct btrfs_bio *bbio, u64 stripe_len, 2264 struct btrfs_device *scrub_dev, 2265 unsigned long *dbitmap, int stripe_nsectors) 2266 { 2267 struct btrfs_raid_bio *rbio; 2268 int i; 2269 2270 rbio = alloc_rbio(fs_info, bbio, stripe_len); 2271 if (IS_ERR(rbio)) 2272 return NULL; 2273 bio_list_add(&rbio->bio_list, bio); 2274 /* 2275 * This is a special bio which is used to hold the completion handler 2276 * and make the scrub rbio similar to the other rbio types 2277 */ 2278 ASSERT(!bio->bi_iter.bi_size); 2279 rbio->operation = BTRFS_RBIO_PARITY_SCRUB; 2280 2281 /* 2282 * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted 2283 * to the end position, so this search can start from the first parity 2284 * stripe. 2285 */ 2286 for (i = rbio->nr_data; i < rbio->real_stripes; i++) { 2287 if (bbio->stripes[i].dev == scrub_dev) { 2288 rbio->scrubp = i; 2289 break; 2290 } 2291 } 2292 ASSERT(i < rbio->real_stripes); 2293 2294 /* For now we only support the case where sectorsize equals page size */ 2295 ASSERT(fs_info->sectorsize == PAGE_SIZE); 2296 ASSERT(rbio->stripe_npages == stripe_nsectors); 2297 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); 2298 2299 /* 2300 * We have already increased bio_counter when getting bbio, record it 2301 * so we can free it at rbio_orig_end_io(). 2302 */ 2303 rbio->generic_bio_cnt = 1; 2304 2305 return rbio; 2306 } 2307 2308 /* Used for both parity scrub and missing. */ 2309 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, 2310 u64 logical) 2311 { 2312 int stripe_offset; 2313 int index; 2314 2315 ASSERT(logical >= rbio->bbio->raid_map[0]); 2316 ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + 2317 rbio->stripe_len * rbio->nr_data); 2318 stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); 2319 index = stripe_offset >> PAGE_SHIFT; 2320 rbio->bio_pages[index] = page; 2321 } 2322 2323 /* 2324 * We only scrub the parity of horizontal stripes for which we have correct 2325 * data, so we don't need to allocate every page of every stripe.
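* For example, with a 64K stripe_len and 4K pages (stripe_npages == 16; values assumed here only for illustration) and just bit 5 set in dbitmap, alloc_rbio_essential_pages() below allocates a single page per stripe, at stripe_pages[i * 16 + 5]; the remaining slots stay NULL unless an earlier IO already populated them.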
2326 */ 2327 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) 2328 { 2329 int i; 2330 int bit; 2331 int index; 2332 struct page *page; 2333 2334 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { 2335 for (i = 0; i < rbio->real_stripes; i++) { 2336 index = i * rbio->stripe_npages + bit; 2337 if (rbio->stripe_pages[index]) 2338 continue; 2339 2340 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2341 if (!page) 2342 return -ENOMEM; 2343 rbio->stripe_pages[index] = page; 2344 } 2345 } 2346 return 0; 2347 } 2348 2349 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 2350 int need_check) 2351 { 2352 struct btrfs_bio *bbio = rbio->bbio; 2353 void *pointers[rbio->real_stripes]; 2354 DECLARE_BITMAP(pbitmap, rbio->stripe_npages); 2355 int nr_data = rbio->nr_data; 2356 int stripe; 2357 int pagenr; 2358 int p_stripe = -1; 2359 int q_stripe = -1; 2360 struct page *p_page = NULL; 2361 struct page *q_page = NULL; 2362 struct bio_list bio_list; 2363 struct bio *bio; 2364 int is_replace = 0; 2365 int ret; 2366 2367 bio_list_init(&bio_list); 2368 2369 if (rbio->real_stripes - rbio->nr_data == 1) { 2370 p_stripe = rbio->real_stripes - 1; 2371 } else if (rbio->real_stripes - rbio->nr_data == 2) { 2372 p_stripe = rbio->real_stripes - 2; 2373 q_stripe = rbio->real_stripes - 1; 2374 } else { 2375 BUG(); 2376 } 2377 2378 if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { 2379 is_replace = 1; 2380 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); 2381 } 2382 2383 /* 2384 * The higher layers (scrubber) are unlikely to 2385 * use this area of the disk again soon, so don't cache 2386 * it. 2387 */ 2388 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2389 2390 if (!need_check) 2391 goto writeback; 2392 2393 p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2394 if (!p_page) 2395 goto cleanup; 2396 SetPageUptodate(p_page); 2397 2398 if (q_stripe != -1) { 2399 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2400 if (!q_page) { 2401 __free_page(p_page); 2402 goto cleanup; 2403 } 2404 SetPageUptodate(q_page); 2405 } 2406 2407 atomic_set(&rbio->error, 0); 2408 2409 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2410 struct page *p; 2411 void *parity; 2412 /* first collect one page from each data stripe */ 2413 for (stripe = 0; stripe < nr_data; stripe++) { 2414 p = page_in_rbio(rbio, stripe, pagenr, 0); 2415 pointers[stripe] = kmap(p); 2416 } 2417 2418 /* then add the parity stripe */ 2419 pointers[stripe++] = kmap(p_page); 2420 2421 if (q_stripe != -1) { 2422 2423 /* 2424 * raid6, add the qstripe and call the 2425 * library function to fill in our p/q 2426 */ 2427 pointers[stripe++] = kmap(q_page); 2428 2429 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, 2430 pointers); 2431 } else { 2432 /* raid5 */ 2433 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); 2434 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); 2435 } 2436 2437 /* Check scrubbing parity and repair it */ 2438 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 2439 parity = kmap(p); 2440 if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) 2441 memcpy(parity, pointers[rbio->scrubp], PAGE_SIZE); 2442 else 2443 /* Parity is right, no need to write it back */ 2444 bitmap_clear(rbio->dbitmap, pagenr, 1); 2445 kunmap(p); 2446 2447 for (stripe = 0; stripe < rbio->real_stripes; stripe++) 2448 kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); 2449 } 2450 2451 __free_page(p_page); 2452 if (q_page) 2453 __free_page(q_page); 2454 2455 writeback: 2456 /* 2457 * time to start writing.
Make bios for everything from the 2458 * higher layers (the bio_list in our rbio) and our p/q. Ignore 2459 * everything else. 2460 */ 2461 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2462 struct page *page; 2463 2464 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 2465 ret = rbio_add_io_page(rbio, &bio_list, 2466 page, rbio->scrubp, pagenr, rbio->stripe_len); 2467 if (ret) 2468 goto cleanup; 2469 } 2470 2471 if (!is_replace) 2472 goto submit_write; 2473 2474 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { 2475 struct page *page; 2476 2477 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); 2478 ret = rbio_add_io_page(rbio, &bio_list, page, 2479 bbio->tgtdev_map[rbio->scrubp], 2480 pagenr, rbio->stripe_len); 2481 if (ret) 2482 goto cleanup; 2483 } 2484 2485 submit_write: 2486 nr_data = bio_list_size(&bio_list); 2487 if (!nr_data) { 2488 /* Every parity is right */ 2489 rbio_orig_end_io(rbio, BLK_STS_OK); 2490 return; 2491 } 2492 2493 atomic_set(&rbio->stripes_pending, nr_data); 2494 2495 while (1) { 2496 bio = bio_list_pop(&bio_list); 2497 if (!bio) 2498 break; 2499 2500 bio->bi_private = rbio; 2501 bio->bi_end_io = raid_write_end_io; 2502 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 2503 2504 submit_bio(bio); 2505 } 2506 return; 2507 2508 cleanup: 2509 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2510 2511 while ((bio = bio_list_pop(&bio_list))) 2512 bio_put(bio); 2513 } 2514 2515 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) 2516 { 2517 if (stripe >= 0 && stripe < rbio->nr_data) 2518 return 1; 2519 return 0; 2520 } 2521 2522 /* 2523 * While we're doing the parity check and repair, we could have errors 2524 * in reading pages off the disk. This checks for errors and if we're 2525 * not able to read the page it'll trigger parity reconstruction. The 2526 * parity scrub will be finished after we've reconstructed the failed 2527 * stripes. 2528 */ 2529 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) 2530 { 2531 if (atomic_read(&rbio->error) > rbio->bbio->max_errors) 2532 goto cleanup; 2533 2534 if (rbio->faila >= 0 || rbio->failb >= 0) { 2535 int dfail = 0, failp = -1; 2536 2537 if (is_data_stripe(rbio, rbio->faila)) 2538 dfail++; 2539 else if (is_parity_stripe(rbio->faila)) 2540 failp = rbio->faila; 2541 2542 if (is_data_stripe(rbio, rbio->failb)) 2543 dfail++; 2544 else if (is_parity_stripe(rbio->failb)) 2545 failp = rbio->failb; 2546 2547 /* 2548 * We cannot use the parity that is being scrubbed to repair 2549 * the data, so our repair capability is reduced by one. 2550 * (In the case of RAID5, we cannot repair anything.) 2551 */ 2552 if (dfail > rbio->bbio->max_errors - 1) 2553 goto cleanup; 2554 2555 /* 2556 * If all the data is good, then only the parity is bad; just 2557 * repair the parity. 2558 */ 2559 if (dfail == 0) { 2560 finish_parity_scrub(rbio, 0); 2561 return; 2562 } 2563 2564 /* 2565 * At this point we have one corrupted data stripe and one 2566 * corrupted parity on RAID6. If the corrupted parity is the 2567 * one being scrubbed, we can luckily use the other parity to 2568 * repair the data; otherwise we cannot repair the data stripe. 2569 */ 2570 if (failp != rbio->scrubp) 2571 goto cleanup; 2572 2573 __raid_recover_end_io(rbio); 2574 } else { 2575 finish_parity_scrub(rbio, 1); 2576 } 2577 return; 2578 2579 cleanup: 2580 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2581 } 2582 2583 /* 2584 * end io for the read phase of the rmw cycle.
All the bios here are physical 2585 * stripe bios we've read from the disk so we can recalculate the parity of the 2586 * stripe. 2587 * 2588 * This will usually kick off finish_rmw once all the bios are read in, but it 2589 * may trigger parity reconstruction if we had any errors along the way 2590 */ 2591 static void raid56_parity_scrub_end_io(struct bio *bio) 2592 { 2593 struct btrfs_raid_bio *rbio = bio->bi_private; 2594 2595 if (bio->bi_status) 2596 fail_bio_stripe(rbio, bio); 2597 else 2598 set_bio_pages_uptodate(bio); 2599 2600 bio_put(bio); 2601 2602 if (!atomic_dec_and_test(&rbio->stripes_pending)) 2603 return; 2604 2605 /* 2606 * this will normally call finish_rmw to start our write 2607 * but if there are any failed stripes we'll reconstruct 2608 * from parity first 2609 */ 2610 validate_rbio_for_parity_scrub(rbio); 2611 } 2612 2613 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) 2614 { 2615 int bios_to_read = 0; 2616 struct bio_list bio_list; 2617 int ret; 2618 int pagenr; 2619 int stripe; 2620 struct bio *bio; 2621 2622 bio_list_init(&bio_list); 2623 2624 ret = alloc_rbio_essential_pages(rbio); 2625 if (ret) 2626 goto cleanup; 2627 2628 atomic_set(&rbio->error, 0); 2629 /* 2630 * build a list of bios to read all the missing parts of this 2631 * stripe 2632 */ 2633 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 2634 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { 2635 struct page *page; 2636 /* 2637 * we want to find all the pages missing from 2638 * the rbio and read them from the disk. If 2639 * page_in_rbio finds a page in the bio list 2640 * we don't need to read it off the stripe. 2641 */ 2642 page = page_in_rbio(rbio, stripe, pagenr, 1); 2643 if (page) 2644 continue; 2645 2646 page = rbio_stripe_page(rbio, stripe, pagenr); 2647 /* 2648 * the bio cache may have handed us an uptodate 2649 * page. If so, be happy and use it 2650 */ 2651 if (PageUptodate(page)) 2652 continue; 2653 2654 ret = rbio_add_io_page(rbio, &bio_list, page, 2655 stripe, pagenr, rbio->stripe_len); 2656 if (ret) 2657 goto cleanup; 2658 } 2659 } 2660 2661 bios_to_read = bio_list_size(&bio_list); 2662 if (!bios_to_read) { 2663 /* 2664 * this can happen if others have merged with 2665 * us, it means there is nothing left to read. 2666 * But if there are missing devices it may not be 2667 * safe to do the full stripe write yet. 2668 */ 2669 goto finish; 2670 } 2671 2672 /* 2673 * the bbio may be freed once we submit the last bio. 
Make sure 2674 * not to touch it after that 2675 */ 2676 atomic_set(&rbio->stripes_pending, bios_to_read); 2677 while (1) { 2678 bio = bio_list_pop(&bio_list); 2679 if (!bio) 2680 break; 2681 2682 bio->bi_private = rbio; 2683 bio->bi_end_io = raid56_parity_scrub_end_io; 2684 bio_set_op_attrs(bio, REQ_OP_READ, 0); 2685 2686 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); 2687 2688 submit_bio(bio); 2689 } 2690 /* the actual write will happen once the reads are done */ 2691 return; 2692 2693 cleanup: 2694 rbio_orig_end_io(rbio, BLK_STS_IOERR); 2695 2696 while ((bio = bio_list_pop(&bio_list))) 2697 bio_put(bio); 2698 2699 return; 2700 2701 finish: 2702 validate_rbio_for_parity_scrub(rbio); 2703 } 2704 2705 static void scrub_parity_work(struct btrfs_work *work) 2706 { 2707 struct btrfs_raid_bio *rbio; 2708 2709 rbio = container_of(work, struct btrfs_raid_bio, work); 2710 raid56_parity_scrub_stripe(rbio); 2711 } 2712 2713 static void async_scrub_parity(struct btrfs_raid_bio *rbio) 2714 { 2715 btrfs_init_work(&rbio->work, btrfs_rmw_helper, 2716 scrub_parity_work, NULL, NULL); 2717 2718 btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); 2719 } 2720 2721 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) 2722 { 2723 if (!lock_stripe_add(rbio)) 2724 async_scrub_parity(rbio); 2725 } 2726 2727 /* The following code is used for dev replace of a missing RAID 5/6 device. */ 2728 2729 struct btrfs_raid_bio * 2730 raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, 2731 struct btrfs_bio *bbio, u64 length) 2732 { 2733 struct btrfs_raid_bio *rbio; 2734 2735 rbio = alloc_rbio(fs_info, bbio, length); 2736 if (IS_ERR(rbio)) 2737 return NULL; 2738 2739 rbio->operation = BTRFS_RBIO_REBUILD_MISSING; 2740 bio_list_add(&rbio->bio_list, bio); 2741 /* 2742 * This is a special bio which is used to hold the completion handler 2743 * and make this rbio similar to the other rbio types 2744 */ 2745 ASSERT(!bio->bi_iter.bi_size); 2746 2747 rbio->faila = find_logical_bio_stripe(rbio, bio); 2748 if (rbio->faila == -1) { 2749 BUG(); 2750 kfree(rbio); 2751 return NULL; 2752 } 2753 2754 /* 2755 * We have already increased bio_counter when getting the bbio; record 2756 * it so we can free it at rbio_orig_end_io() 2757 */ 2758 rbio->generic_bio_cnt = 1; 2759 2760 return rbio; 2761 } 2762 2763 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) 2764 { 2765 if (!lock_stripe_add(rbio)) 2766 async_read_rebuild(rbio); 2767 } 2768
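/*
 * Illustrative sketch, not compiled here: a caller in the scrub/replace
 * code is expected to drive the missing-device entry points above roughly
 * like this (the names pages[], npages and logical, plus the error
 * handling, are assumptions of the example; bio must be the zero-length
 * bio that only carries the completion handler, and the caller must
 * already hold a bio_counter reference for @bbio):
 *
 *	rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
 *	if (!rbio)
 *		return -ENOMEM;
 *	for (i = 0; i < npages; i++)
 *		raid56_add_scrub_pages(rbio, pages[i], logical + i * PAGE_SIZE);
 *	raid56_submit_missing_rbio(rbio);
 *
 * raid56_add_scrub_pages() is shared with the parity scrub path, which is
 * why the missing-device rebuild feeds its pages through bio_pages as well.
 */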