1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2012 Fusion-io All rights reserved. 4 * Copyright (C) 2012 Intel Corp. All rights reserved. 5 */ 6 7 #include <linux/sched.h> 8 #include <linux/bio.h> 9 #include <linux/slab.h> 10 #include <linux/blkdev.h> 11 #include <linux/raid/pq.h> 12 #include <linux/hash.h> 13 #include <linux/list_sort.h> 14 #include <linux/raid/xor.h> 15 #include <linux/mm.h> 16 #include "misc.h" 17 #include "ctree.h" 18 #include "disk-io.h" 19 #include "volumes.h" 20 #include "raid56.h" 21 #include "async-thread.h" 22 23 /* set when additional merges to this rbio are not allowed */ 24 #define RBIO_RMW_LOCKED_BIT 1 25 26 /* 27 * set when this rbio is sitting in the hash, but it is just a cache 28 * of past RMW 29 */ 30 #define RBIO_CACHE_BIT 2 31 32 /* 33 * set when it is safe to trust the stripe_pages for caching 34 */ 35 #define RBIO_CACHE_READY_BIT 3 36 37 #define RBIO_CACHE_SIZE 1024 38 39 #define BTRFS_STRIPE_HASH_TABLE_BITS 11 40 41 /* Used by the raid56 code to lock stripes for read/modify/write */ 42 struct btrfs_stripe_hash { 43 struct list_head hash_list; 44 spinlock_t lock; 45 }; 46 47 /* Used by the raid56 code to lock stripes for read/modify/write */ 48 struct btrfs_stripe_hash_table { 49 struct list_head stripe_cache; 50 spinlock_t cache_lock; 51 int cache_size; 52 struct btrfs_stripe_hash table[]; 53 }; 54 55 /* 56 * A bvec like structure to present a sector inside a page. 57 * 58 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. 59 */ 60 struct sector_ptr { 61 struct page *page; 62 unsigned int pgoff:24; 63 unsigned int uptodate:8; 64 }; 65 66 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); 67 static noinline void finish_rmw(struct btrfs_raid_bio *rbio); 68 static void rmw_work(struct work_struct *work); 69 static void read_rebuild_work(struct work_struct *work); 70 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); 71 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); 72 static void __free_raid_bio(struct btrfs_raid_bio *rbio); 73 static void index_rbio_pages(struct btrfs_raid_bio *rbio); 74 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 75 76 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, 77 int need_check); 78 static void scrub_parity_work(struct work_struct *work); 79 80 static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) 81 { 82 INIT_WORK(&rbio->work, work_func); 83 queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); 84 } 85 86 /* 87 * the stripe hash table is used for locking, and to collect 88 * bios in hopes of making a full stripe 89 */ 90 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 91 { 92 struct btrfs_stripe_hash_table *table; 93 struct btrfs_stripe_hash_table *x; 94 struct btrfs_stripe_hash *cur; 95 struct btrfs_stripe_hash *h; 96 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; 97 int i; 98 99 if (info->stripe_hash_table) 100 return 0; 101 102 /* 103 * The table is large, starting with order 4 and can go as high as 104 * order 7 in case lock debugging is turned on. 105 * 106 * Try harder to allocate and fallback to vmalloc to lower the chance 107 * of a failing mount. 
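	 *
	 * Illustrative size math (assuming a 64-bit build without lock
	 * debugging): each bucket is a list_head plus a spinlock_t, roughly
	 * 24 bytes, so the 2048 buckets plus the small header come to about
	 * 48KiB, i.e. the order 4 mentioned above.  Debug spinlocks and
	 * lockdep inflate each bucket several times over, which is where the
	 * higher order comes from.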
108 */ 109 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); 110 if (!table) 111 return -ENOMEM; 112 113 spin_lock_init(&table->cache_lock); 114 INIT_LIST_HEAD(&table->stripe_cache); 115 116 h = table->table; 117 118 for (i = 0; i < num_entries; i++) { 119 cur = h + i; 120 INIT_LIST_HEAD(&cur->hash_list); 121 spin_lock_init(&cur->lock); 122 } 123 124 x = cmpxchg(&info->stripe_hash_table, NULL, table); 125 kvfree(x); 126 return 0; 127 } 128 129 /* 130 * caching an rbio means to copy anything from the 131 * bio_sectors array into the stripe_pages array. We 132 * use the page uptodate bit in the stripe cache array 133 * to indicate if it has valid data 134 * 135 * once the caching is done, we set the cache ready 136 * bit. 137 */ 138 static void cache_rbio_pages(struct btrfs_raid_bio *rbio) 139 { 140 int i; 141 int ret; 142 143 ret = alloc_rbio_pages(rbio); 144 if (ret) 145 return; 146 147 for (i = 0; i < rbio->nr_sectors; i++) { 148 /* Some range not covered by bio (partial write), skip it */ 149 if (!rbio->bio_sectors[i].page) 150 continue; 151 152 ASSERT(rbio->stripe_sectors[i].page); 153 memcpy_page(rbio->stripe_sectors[i].page, 154 rbio->stripe_sectors[i].pgoff, 155 rbio->bio_sectors[i].page, 156 rbio->bio_sectors[i].pgoff, 157 rbio->bioc->fs_info->sectorsize); 158 rbio->stripe_sectors[i].uptodate = 1; 159 } 160 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 161 } 162 163 /* 164 * we hash on the first logical address of the stripe 165 */ 166 static int rbio_bucket(struct btrfs_raid_bio *rbio) 167 { 168 u64 num = rbio->bioc->raid_map[0]; 169 170 /* 171 * we shift down quite a bit. We're using byte 172 * addressing, and most of the lower bits are zeros. 173 * This tends to upset hash_64, and it consistently 174 * returns just one or two different values. 175 * 176 * shifting off the lower bits fixes things. 177 */ 178 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 179 } 180 181 static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, 182 unsigned int page_nr) 183 { 184 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 185 const u32 sectors_per_page = PAGE_SIZE / sectorsize; 186 int i; 187 188 ASSERT(page_nr < rbio->nr_pages); 189 190 for (i = sectors_per_page * page_nr; 191 i < sectors_per_page * page_nr + sectors_per_page; 192 i++) { 193 if (!rbio->stripe_sectors[i].uptodate) 194 return false; 195 } 196 return true; 197 } 198 199 /* 200 * Update the stripe_sectors[] array to use correct page and pgoff 201 * 202 * Should be called every time any page pointer in stripes_pages[] got modified. 203 */ 204 static void index_stripe_sectors(struct btrfs_raid_bio *rbio) 205 { 206 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 207 u32 offset; 208 int i; 209 210 for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { 211 int page_index = offset >> PAGE_SHIFT; 212 213 ASSERT(page_index < rbio->nr_pages); 214 rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; 215 rbio->stripe_sectors[i].pgoff = offset_in_page(offset); 216 } 217 } 218 219 static void steal_rbio_page(struct btrfs_raid_bio *src, 220 struct btrfs_raid_bio *dest, int page_nr) 221 { 222 const u32 sectorsize = src->bioc->fs_info->sectorsize; 223 const u32 sectors_per_page = PAGE_SIZE / sectorsize; 224 int i; 225 226 if (dest->stripe_pages[page_nr]) 227 __free_page(dest->stripe_pages[page_nr]); 228 dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; 229 src->stripe_pages[page_nr] = NULL; 230 231 /* Also update the sector->uptodate bits. 
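	 *
	 * Illustrative arithmetic (assuming 4K sectors): with 4K pages,
	 * sectors_per_page is 1 and only sector page_nr is marked; with 64K
	 * pages, sectors_per_page is 16, so stealing page_nr 2 marks sectors
	 * 32..47 of the destination uptodate.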
*/ 232 for (i = sectors_per_page * page_nr; 233 i < sectors_per_page * page_nr + sectors_per_page; i++) 234 dest->stripe_sectors[i].uptodate = true; 235 } 236 237 /* 238 * Stealing an rbio means taking all the uptodate pages from the stripe array 239 * in the source rbio and putting them into the destination rbio. 240 * 241 * This will also update the involved stripe_sectors[] which are referring to 242 * the old pages. 243 */ 244 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) 245 { 246 int i; 247 struct page *s; 248 249 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) 250 return; 251 252 for (i = 0; i < dest->nr_pages; i++) { 253 s = src->stripe_pages[i]; 254 if (!s || !full_page_sectors_uptodate(src, i)) 255 continue; 256 257 steal_rbio_page(src, dest, i); 258 } 259 index_stripe_sectors(dest); 260 index_stripe_sectors(src); 261 } 262 263 /* 264 * merging means we take the bio_list from the victim and 265 * splice it into the destination. The victim should 266 * be discarded afterwards. 267 * 268 * must be called with dest->rbio_list_lock held 269 */ 270 static void merge_rbio(struct btrfs_raid_bio *dest, 271 struct btrfs_raid_bio *victim) 272 { 273 bio_list_merge(&dest->bio_list, &victim->bio_list); 274 dest->bio_list_bytes += victim->bio_list_bytes; 275 /* Also inherit the bitmaps from @victim. */ 276 bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, 277 dest->stripe_nsectors); 278 bio_list_init(&victim->bio_list); 279 } 280 281 /* 282 * used to prune items that are in the cache. The caller 283 * must hold the hash table lock. 284 */ 285 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 286 { 287 int bucket = rbio_bucket(rbio); 288 struct btrfs_stripe_hash_table *table; 289 struct btrfs_stripe_hash *h; 290 int freeit = 0; 291 292 /* 293 * check the bit again under the hash table lock. 294 */ 295 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 296 return; 297 298 table = rbio->bioc->fs_info->stripe_hash_table; 299 h = table->table + bucket; 300 301 /* hold the lock for the bucket because we may be 302 * removing it from the hash table 303 */ 304 spin_lock(&h->lock); 305 306 /* 307 * hold the lock for the bio list because we need 308 * to make sure the bio list is empty 309 */ 310 spin_lock(&rbio->bio_list_lock); 311 312 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { 313 list_del_init(&rbio->stripe_cache); 314 table->cache_size -= 1; 315 freeit = 1; 316 317 /* if the bio list isn't empty, this rbio is 318 * still involved in an IO. We take it out 319 * of the cache list, and drop the ref that 320 * was held for the list. 
321 * 322 * If the bio_list was empty, we also remove 323 * the rbio from the hash_table, and drop 324 * the corresponding ref 325 */ 326 if (bio_list_empty(&rbio->bio_list)) { 327 if (!list_empty(&rbio->hash_list)) { 328 list_del_init(&rbio->hash_list); 329 refcount_dec(&rbio->refs); 330 BUG_ON(!list_empty(&rbio->plug_list)); 331 } 332 } 333 } 334 335 spin_unlock(&rbio->bio_list_lock); 336 spin_unlock(&h->lock); 337 338 if (freeit) 339 __free_raid_bio(rbio); 340 } 341 342 /* 343 * prune a given rbio from the cache 344 */ 345 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 346 { 347 struct btrfs_stripe_hash_table *table; 348 unsigned long flags; 349 350 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 351 return; 352 353 table = rbio->bioc->fs_info->stripe_hash_table; 354 355 spin_lock_irqsave(&table->cache_lock, flags); 356 __remove_rbio_from_cache(rbio); 357 spin_unlock_irqrestore(&table->cache_lock, flags); 358 } 359 360 /* 361 * remove everything in the cache 362 */ 363 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) 364 { 365 struct btrfs_stripe_hash_table *table; 366 unsigned long flags; 367 struct btrfs_raid_bio *rbio; 368 369 table = info->stripe_hash_table; 370 371 spin_lock_irqsave(&table->cache_lock, flags); 372 while (!list_empty(&table->stripe_cache)) { 373 rbio = list_entry(table->stripe_cache.next, 374 struct btrfs_raid_bio, 375 stripe_cache); 376 __remove_rbio_from_cache(rbio); 377 } 378 spin_unlock_irqrestore(&table->cache_lock, flags); 379 } 380 381 /* 382 * remove all cached entries and free the hash table 383 * used by unmount 384 */ 385 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 386 { 387 if (!info->stripe_hash_table) 388 return; 389 btrfs_clear_rbio_cache(info); 390 kvfree(info->stripe_hash_table); 391 info->stripe_hash_table = NULL; 392 } 393 394 /* 395 * insert an rbio into the stripe cache. It 396 * must have already been prepared by calling 397 * cache_rbio_pages 398 * 399 * If this rbio was already cached, it gets 400 * moved to the front of the lru. 401 * 402 * If the size of the rbio cache is too big, we 403 * prune an item. 404 */ 405 static void cache_rbio(struct btrfs_raid_bio *rbio) 406 { 407 struct btrfs_stripe_hash_table *table; 408 unsigned long flags; 409 410 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) 411 return; 412 413 table = rbio->bioc->fs_info->stripe_hash_table; 414 415 spin_lock_irqsave(&table->cache_lock, flags); 416 spin_lock(&rbio->bio_list_lock); 417 418 /* bump our ref if we were not in the list before */ 419 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) 420 refcount_inc(&rbio->refs); 421 422 if (!list_empty(&rbio->stripe_cache)){ 423 list_move(&rbio->stripe_cache, &table->stripe_cache); 424 } else { 425 list_add(&rbio->stripe_cache, &table->stripe_cache); 426 table->cache_size += 1; 427 } 428 429 spin_unlock(&rbio->bio_list_lock); 430 431 if (table->cache_size > RBIO_CACHE_SIZE) { 432 struct btrfs_raid_bio *found; 433 434 found = list_entry(table->stripe_cache.prev, 435 struct btrfs_raid_bio, 436 stripe_cache); 437 438 if (found != rbio) 439 __remove_rbio_from_cache(found); 440 } 441 442 spin_unlock_irqrestore(&table->cache_lock, flags); 443 } 444 445 /* 446 * helper function to run the xor_blocks api. It is only 447 * able to do MAX_XOR_BLOCKS at a time, so we need to 448 * loop through. 
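 *
 * Illustrative call (assuming MAX_XOR_BLOCKS is 4 and that the destination
 * was zeroed or pre-seeded, as finish_rmw() does with a memcpy of the first
 * source; d0..d5 and parity are hypothetical sector buffers):
 *
 *	void *ptrs[7] = { d0, d1, d2, d3, d4, d5, parity };
 *
 *	run_xor(ptrs, 6, sectorsize);
 *
 * The first xor_blocks() call folds d0..d3 into parity, the second folds
 * d4..d5.  Note the destination is always pages[src_cnt], so the caller
 * passes src_cnt + 1 pointers.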
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * Returns true if the bio list inside this rbio covers an entire stripe (no
 * rmw required).
 */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
		ret = 0;
	BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO.
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test.
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us. We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;
	/*
	 * Parity scrub and missing-device rebuild read the full stripe from
	 * the drives themselves, then check/repair the parity or rebuild the
	 * missing data and write the new results.
	 *
	 * We're not allowed to add any new bios to their
	 * bio list; anyone else that wants to
	 * change this stripe needs to do their own rmw.
525 */ 526 if (last->operation == BTRFS_RBIO_PARITY_SCRUB) 527 return 0; 528 529 if (last->operation == BTRFS_RBIO_REBUILD_MISSING) 530 return 0; 531 532 if (last->operation == BTRFS_RBIO_READ_REBUILD) { 533 int fa = last->faila; 534 int fb = last->failb; 535 int cur_fa = cur->faila; 536 int cur_fb = cur->failb; 537 538 if (last->faila >= last->failb) { 539 fa = last->failb; 540 fb = last->faila; 541 } 542 543 if (cur->faila >= cur->failb) { 544 cur_fa = cur->failb; 545 cur_fb = cur->faila; 546 } 547 548 if (fa != cur_fa || fb != cur_fb) 549 return 0; 550 } 551 return 1; 552 } 553 554 static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, 555 unsigned int stripe_nr, 556 unsigned int sector_nr) 557 { 558 ASSERT(stripe_nr < rbio->real_stripes); 559 ASSERT(sector_nr < rbio->stripe_nsectors); 560 561 return stripe_nr * rbio->stripe_nsectors + sector_nr; 562 } 563 564 /* Return a sector from rbio->stripe_sectors, not from the bio list */ 565 static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, 566 unsigned int stripe_nr, 567 unsigned int sector_nr) 568 { 569 return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, 570 sector_nr)]; 571 } 572 573 /* Grab a sector inside P stripe */ 574 static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, 575 unsigned int sector_nr) 576 { 577 return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); 578 } 579 580 /* Grab a sector inside Q stripe, return NULL if not RAID6 */ 581 static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, 582 unsigned int sector_nr) 583 { 584 if (rbio->nr_data + 1 == rbio->real_stripes) 585 return NULL; 586 return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); 587 } 588 589 /* 590 * The first stripe in the table for a logical address 591 * has the lock. rbios are added in one of three ways: 592 * 593 * 1) Nobody has the stripe locked yet. The rbio is given 594 * the lock and 0 is returned. The caller must start the IO 595 * themselves. 596 * 597 * 2) Someone has the stripe locked, but we're able to merge 598 * with the lock owner. The rbio is freed and the IO will 599 * start automatically along with the existing rbio. 1 is returned. 600 * 601 * 3) Someone has the stripe locked, but we're not able to merge. 602 * The rbio is added to the lock owner's plug list, or merged into 603 * an rbio already on the plug list. When the lock owner unlocks, 604 * the next rbio on the list is run and the IO is started automatically. 605 * 1 is returned 606 * 607 * If we return 0, the caller still owns the rbio and must continue with 608 * IO submission. If we return 1, the caller must assume the rbio has 609 * already been freed. 610 */ 611 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 612 { 613 struct btrfs_stripe_hash *h; 614 struct btrfs_raid_bio *cur; 615 struct btrfs_raid_bio *pending; 616 unsigned long flags; 617 struct btrfs_raid_bio *freeit = NULL; 618 struct btrfs_raid_bio *cache_drop = NULL; 619 int ret = 0; 620 621 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); 622 623 spin_lock_irqsave(&h->lock, flags); 624 list_for_each_entry(cur, &h->hash_list, hash_list) { 625 if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0]) 626 continue; 627 628 spin_lock(&cur->bio_list_lock); 629 630 /* Can we steal this cached rbio's pages? 
*/ 631 if (bio_list_empty(&cur->bio_list) && 632 list_empty(&cur->plug_list) && 633 test_bit(RBIO_CACHE_BIT, &cur->flags) && 634 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { 635 list_del_init(&cur->hash_list); 636 refcount_dec(&cur->refs); 637 638 steal_rbio(cur, rbio); 639 cache_drop = cur; 640 spin_unlock(&cur->bio_list_lock); 641 642 goto lockit; 643 } 644 645 /* Can we merge into the lock owner? */ 646 if (rbio_can_merge(cur, rbio)) { 647 merge_rbio(cur, rbio); 648 spin_unlock(&cur->bio_list_lock); 649 freeit = rbio; 650 ret = 1; 651 goto out; 652 } 653 654 655 /* 656 * We couldn't merge with the running rbio, see if we can merge 657 * with the pending ones. We don't have to check for rmw_locked 658 * because there is no way they are inside finish_rmw right now 659 */ 660 list_for_each_entry(pending, &cur->plug_list, plug_list) { 661 if (rbio_can_merge(pending, rbio)) { 662 merge_rbio(pending, rbio); 663 spin_unlock(&cur->bio_list_lock); 664 freeit = rbio; 665 ret = 1; 666 goto out; 667 } 668 } 669 670 /* 671 * No merging, put us on the tail of the plug list, our rbio 672 * will be started with the currently running rbio unlocks 673 */ 674 list_add_tail(&rbio->plug_list, &cur->plug_list); 675 spin_unlock(&cur->bio_list_lock); 676 ret = 1; 677 goto out; 678 } 679 lockit: 680 refcount_inc(&rbio->refs); 681 list_add(&rbio->hash_list, &h->hash_list); 682 out: 683 spin_unlock_irqrestore(&h->lock, flags); 684 if (cache_drop) 685 remove_rbio_from_cache(cache_drop); 686 if (freeit) 687 __free_raid_bio(freeit); 688 return ret; 689 } 690 691 /* 692 * called as rmw or parity rebuild is completed. If the plug list has more 693 * rbios waiting for this stripe, the next one on the list will be started 694 */ 695 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 696 { 697 int bucket; 698 struct btrfs_stripe_hash *h; 699 unsigned long flags; 700 int keep_cache = 0; 701 702 bucket = rbio_bucket(rbio); 703 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket; 704 705 if (list_empty(&rbio->plug_list)) 706 cache_rbio(rbio); 707 708 spin_lock_irqsave(&h->lock, flags); 709 spin_lock(&rbio->bio_list_lock); 710 711 if (!list_empty(&rbio->hash_list)) { 712 /* 713 * if we're still cached and there is no other IO 714 * to perform, just leave this rbio here for others 715 * to steal from later 716 */ 717 if (list_empty(&rbio->plug_list) && 718 test_bit(RBIO_CACHE_BIT, &rbio->flags)) { 719 keep_cache = 1; 720 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 721 BUG_ON(!bio_list_empty(&rbio->bio_list)); 722 goto done; 723 } 724 725 list_del_init(&rbio->hash_list); 726 refcount_dec(&rbio->refs); 727 728 /* 729 * we use the plug list to hold all the rbios 730 * waiting for the chance to lock this stripe. 731 * hand the lock over to one of them. 
732 */ 733 if (!list_empty(&rbio->plug_list)) { 734 struct btrfs_raid_bio *next; 735 struct list_head *head = rbio->plug_list.next; 736 737 next = list_entry(head, struct btrfs_raid_bio, 738 plug_list); 739 740 list_del_init(&rbio->plug_list); 741 742 list_add(&next->hash_list, &h->hash_list); 743 refcount_inc(&next->refs); 744 spin_unlock(&rbio->bio_list_lock); 745 spin_unlock_irqrestore(&h->lock, flags); 746 747 if (next->operation == BTRFS_RBIO_READ_REBUILD) 748 start_async_work(next, read_rebuild_work); 749 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { 750 steal_rbio(rbio, next); 751 start_async_work(next, read_rebuild_work); 752 } else if (next->operation == BTRFS_RBIO_WRITE) { 753 steal_rbio(rbio, next); 754 start_async_work(next, rmw_work); 755 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { 756 steal_rbio(rbio, next); 757 start_async_work(next, scrub_parity_work); 758 } 759 760 goto done_nolock; 761 } 762 } 763 done: 764 spin_unlock(&rbio->bio_list_lock); 765 spin_unlock_irqrestore(&h->lock, flags); 766 767 done_nolock: 768 if (!keep_cache) 769 remove_rbio_from_cache(rbio); 770 } 771 772 static void __free_raid_bio(struct btrfs_raid_bio *rbio) 773 { 774 int i; 775 776 if (!refcount_dec_and_test(&rbio->refs)) 777 return; 778 779 WARN_ON(!list_empty(&rbio->stripe_cache)); 780 WARN_ON(!list_empty(&rbio->hash_list)); 781 WARN_ON(!bio_list_empty(&rbio->bio_list)); 782 783 for (i = 0; i < rbio->nr_pages; i++) { 784 if (rbio->stripe_pages[i]) { 785 __free_page(rbio->stripe_pages[i]); 786 rbio->stripe_pages[i] = NULL; 787 } 788 } 789 790 btrfs_put_bioc(rbio->bioc); 791 kfree(rbio); 792 } 793 794 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) 795 { 796 struct bio *next; 797 798 while (cur) { 799 next = cur->bi_next; 800 cur->bi_next = NULL; 801 cur->bi_status = err; 802 bio_endio(cur); 803 cur = next; 804 } 805 } 806 807 /* 808 * this frees the rbio and runs through all the bios in the 809 * bio_list and calls end_io on them 810 */ 811 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) 812 { 813 struct bio *cur = bio_list_get(&rbio->bio_list); 814 struct bio *extra; 815 816 /* 817 * Clear the data bitmap, as the rbio may be cached for later usage. 818 * do this before before unlock_stripe() so there will be no new bio 819 * for this bio. 820 */ 821 bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); 822 823 /* 824 * At this moment, rbio->bio_list is empty, however since rbio does not 825 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the 826 * hash list, rbio may be merged with others so that rbio->bio_list 827 * becomes non-empty. 828 * Once unlock_stripe() is done, rbio->bio_list will not be updated any 829 * more and we can call bio_endio() on all queued bios. 830 */ 831 unlock_stripe(rbio); 832 extra = bio_list_get(&rbio->bio_list); 833 __free_raid_bio(rbio); 834 835 rbio_endio_bio_list(cur, err); 836 if (extra) 837 rbio_endio_bio_list(extra, err); 838 } 839 840 /* 841 * end io function used by finish_rmw. When we finally 842 * get here, we've written a full stripe 843 */ 844 static void raid_write_end_io(struct bio *bio) 845 { 846 struct btrfs_raid_bio *rbio = bio->bi_private; 847 blk_status_t err = bio->bi_status; 848 int max_errors; 849 850 if (err) 851 fail_bio_stripe(rbio, bio); 852 853 bio_put(bio); 854 855 if (!atomic_dec_and_test(&rbio->stripes_pending)) 856 return; 857 858 err = BLK_STS_OK; 859 860 /* OK, we have read all the stripes we need to. 
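	 *
	 * For a normal write, bioc->max_errors is the redundancy of the
	 * profile (typically 1 for RAID5, 2 for RAID6), so the full stripe
	 * write still succeeds if no more than that many member writes
	 * failed.  Parity scrub tolerates no write errors at all, hence the
	 * 0 below.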
*/ 861 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? 862 0 : rbio->bioc->max_errors; 863 if (atomic_read(&rbio->error) > max_errors) 864 err = BLK_STS_IOERR; 865 866 rbio_orig_end_io(rbio, err); 867 } 868 869 /** 870 * Get a sector pointer specified by its @stripe_nr and @sector_nr 871 * 872 * @rbio: The raid bio 873 * @stripe_nr: Stripe number, valid range [0, real_stripe) 874 * @sector_nr: Sector number inside the stripe, 875 * valid range [0, stripe_nsectors) 876 * @bio_list_only: Whether to use sectors inside the bio list only. 877 * 878 * The read/modify/write code wants to reuse the original bio page as much 879 * as possible, and only use stripe_sectors as fallback. 880 */ 881 static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, 882 int stripe_nr, int sector_nr, 883 bool bio_list_only) 884 { 885 struct sector_ptr *sector; 886 int index; 887 888 ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); 889 ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); 890 891 index = stripe_nr * rbio->stripe_nsectors + sector_nr; 892 ASSERT(index >= 0 && index < rbio->nr_sectors); 893 894 spin_lock_irq(&rbio->bio_list_lock); 895 sector = &rbio->bio_sectors[index]; 896 if (sector->page || bio_list_only) { 897 /* Don't return sector without a valid page pointer */ 898 if (!sector->page) 899 sector = NULL; 900 spin_unlock_irq(&rbio->bio_list_lock); 901 return sector; 902 } 903 spin_unlock_irq(&rbio->bio_list_lock); 904 905 return &rbio->stripe_sectors[index]; 906 } 907 908 /* 909 * allocation and initial setup for the btrfs_raid_bio. Not 910 * this does not allocate any pages for rbio->pages. 911 */ 912 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, 913 struct btrfs_io_context *bioc) 914 { 915 const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; 916 const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; 917 const unsigned int num_pages = stripe_npages * real_stripes; 918 const unsigned int stripe_nsectors = 919 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; 920 const unsigned int num_sectors = stripe_nsectors * real_stripes; 921 struct btrfs_raid_bio *rbio; 922 void *p; 923 924 /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ 925 ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); 926 /* 927 * Our current stripe len should be fixed to 64k thus stripe_nsectors 928 * (at most 16) should be no larger than BITS_PER_LONG. 929 */ 930 ASSERT(stripe_nsectors <= BITS_PER_LONG); 931 932 rbio = kzalloc(sizeof(*rbio) + 933 sizeof(*rbio->stripe_pages) * num_pages + 934 sizeof(*rbio->bio_sectors) * num_sectors + 935 sizeof(*rbio->stripe_sectors) * num_sectors + 936 sizeof(*rbio->finish_pointers) * real_stripes, 937 GFP_NOFS); 938 if (!rbio) 939 return ERR_PTR(-ENOMEM); 940 941 bio_list_init(&rbio->bio_list); 942 INIT_LIST_HEAD(&rbio->plug_list); 943 spin_lock_init(&rbio->bio_list_lock); 944 INIT_LIST_HEAD(&rbio->stripe_cache); 945 INIT_LIST_HEAD(&rbio->hash_list); 946 btrfs_get_bioc(bioc); 947 rbio->bioc = bioc; 948 rbio->nr_pages = num_pages; 949 rbio->nr_sectors = num_sectors; 950 rbio->real_stripes = real_stripes; 951 rbio->stripe_npages = stripe_npages; 952 rbio->stripe_nsectors = stripe_nsectors; 953 rbio->faila = -1; 954 rbio->failb = -1; 955 refcount_set(&rbio->refs, 1); 956 atomic_set(&rbio->error, 0); 957 atomic_set(&rbio->stripes_pending, 0); 958 959 /* 960 * The stripe_pages, bio_sectors, etc arrays point to the extra memory 961 * we allocated past the end of the rbio. 
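	 *
	 * Rough layout of the single allocation (illustrative, not to scale):
	 *
	 *	[struct btrfs_raid_bio]
	 *	[stripe_pages:    num_pages    * sizeof(struct page *)]
	 *	[bio_sectors:     num_sectors  * sizeof(struct sector_ptr)]
	 *	[stripe_sectors:  num_sectors  * sizeof(struct sector_ptr)]
	 *	[finish_pointers: real_stripes * sizeof(void *)]
	 *
	 * CONSUME_ALLOC() below just walks a cursor across these regions in
	 * the same order they were sized in the kzalloc() above.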
962 */ 963 p = rbio + 1; 964 #define CONSUME_ALLOC(ptr, count) do { \ 965 ptr = p; \ 966 p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ 967 } while (0) 968 CONSUME_ALLOC(rbio->stripe_pages, num_pages); 969 CONSUME_ALLOC(rbio->bio_sectors, num_sectors); 970 CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); 971 CONSUME_ALLOC(rbio->finish_pointers, real_stripes); 972 #undef CONSUME_ALLOC 973 974 ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); 975 rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); 976 977 return rbio; 978 } 979 980 /* allocate pages for all the stripes in the bio, including parity */ 981 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 982 { 983 int ret; 984 985 ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); 986 if (ret < 0) 987 return ret; 988 /* Mapping all sectors */ 989 index_stripe_sectors(rbio); 990 return 0; 991 } 992 993 /* only allocate pages for p/q stripes */ 994 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 995 { 996 const int data_pages = rbio->nr_data * rbio->stripe_npages; 997 int ret; 998 999 ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, 1000 rbio->stripe_pages + data_pages); 1001 if (ret < 0) 1002 return ret; 1003 1004 index_stripe_sectors(rbio); 1005 return 0; 1006 } 1007 1008 /* 1009 * Add a single sector @sector into our list of bios for IO. 1010 * 1011 * Return 0 if everything went well. 1012 * Return <0 for error. 1013 */ 1014 static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, 1015 struct bio_list *bio_list, 1016 struct sector_ptr *sector, 1017 unsigned int stripe_nr, 1018 unsigned int sector_nr, 1019 enum req_op op) 1020 { 1021 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1022 struct bio *last = bio_list->tail; 1023 int ret; 1024 struct bio *bio; 1025 struct btrfs_io_stripe *stripe; 1026 u64 disk_start; 1027 1028 /* 1029 * Note: here stripe_nr has taken device replace into consideration, 1030 * thus it can be larger than rbio->real_stripe. 1031 * So here we check against bioc->num_stripes, not rbio->real_stripes. 1032 */ 1033 ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); 1034 ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); 1035 ASSERT(sector->page); 1036 1037 stripe = &rbio->bioc->stripes[stripe_nr]; 1038 disk_start = stripe->physical + sector_nr * sectorsize; 1039 1040 /* if the device is missing, just fail this stripe */ 1041 if (!stripe->dev->bdev) 1042 return fail_rbio_index(rbio, stripe_nr); 1043 1044 /* see if we can add this page onto our existing bio */ 1045 if (last) { 1046 u64 last_end = last->bi_iter.bi_sector << 9; 1047 last_end += last->bi_iter.bi_size; 1048 1049 /* 1050 * we can't merge these if they are from different 1051 * devices or if they are not contiguous 1052 */ 1053 if (last_end == disk_start && !last->bi_status && 1054 last->bi_bdev == stripe->dev->bdev) { 1055 ret = bio_add_page(last, sector->page, sectorsize, 1056 sector->pgoff); 1057 if (ret == sectorsize) 1058 return 0; 1059 } 1060 } 1061 1062 /* put a new bio on the list */ 1063 bio = bio_alloc(stripe->dev->bdev, 1064 max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), 1065 op, GFP_NOFS); 1066 bio->bi_iter.bi_sector = disk_start >> 9; 1067 bio->bi_private = rbio; 1068 1069 bio_add_page(bio, sector->page, sectorsize, sector->pgoff); 1070 bio_list_add(bio_list, bio); 1071 return 0; 1072 } 1073 1074 /* 1075 * while we're doing the read/modify/write cycle, we could 1076 * have errors in reading pages off the disk. 
This checks 1077 * for errors and if we're not able to read the page it'll 1078 * trigger parity reconstruction. The rmw will be finished 1079 * after we've reconstructed the failed stripes 1080 */ 1081 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1082 { 1083 if (rbio->faila >= 0 || rbio->failb >= 0) { 1084 BUG_ON(rbio->faila == rbio->real_stripes - 1); 1085 __raid56_parity_recover(rbio); 1086 } else { 1087 finish_rmw(rbio); 1088 } 1089 } 1090 1091 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) 1092 { 1093 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1094 struct bio_vec bvec; 1095 struct bvec_iter iter; 1096 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 1097 rbio->bioc->raid_map[0]; 1098 1099 bio_for_each_segment(bvec, bio, iter) { 1100 u32 bvec_offset; 1101 1102 for (bvec_offset = 0; bvec_offset < bvec.bv_len; 1103 bvec_offset += sectorsize, offset += sectorsize) { 1104 int index = offset / sectorsize; 1105 struct sector_ptr *sector = &rbio->bio_sectors[index]; 1106 1107 sector->page = bvec.bv_page; 1108 sector->pgoff = bvec.bv_offset + bvec_offset; 1109 ASSERT(sector->pgoff < PAGE_SIZE); 1110 } 1111 } 1112 } 1113 1114 /* 1115 * helper function to walk our bio list and populate the bio_pages array with 1116 * the result. This seems expensive, but it is faster than constantly 1117 * searching through the bio list as we setup the IO in finish_rmw or stripe 1118 * reconstruction. 1119 * 1120 * This must be called before you trust the answers from page_in_rbio 1121 */ 1122 static void index_rbio_pages(struct btrfs_raid_bio *rbio) 1123 { 1124 struct bio *bio; 1125 1126 spin_lock_irq(&rbio->bio_list_lock); 1127 bio_list_for_each(bio, &rbio->bio_list) 1128 index_one_bio(rbio, bio); 1129 1130 spin_unlock_irq(&rbio->bio_list_lock); 1131 } 1132 1133 static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, 1134 struct raid56_bio_trace_info *trace_info) 1135 { 1136 const struct btrfs_io_context *bioc = rbio->bioc; 1137 int i; 1138 1139 ASSERT(bioc); 1140 1141 /* We rely on bio->bi_bdev to find the stripe number. */ 1142 if (!bio->bi_bdev) 1143 goto not_found; 1144 1145 for (i = 0; i < bioc->num_stripes; i++) { 1146 if (bio->bi_bdev != bioc->stripes[i].dev->bdev) 1147 continue; 1148 trace_info->stripe_nr = i; 1149 trace_info->devid = bioc->stripes[i].dev->devid; 1150 trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 1151 bioc->stripes[i].physical; 1152 return; 1153 } 1154 1155 not_found: 1156 trace_info->devid = -1; 1157 trace_info->offset = -1; 1158 trace_info->stripe_nr = -1; 1159 } 1160 1161 /* 1162 * this is called from one of two situations. We either 1163 * have a full stripe from the higher layers, or we've read all 1164 * the missing bits off disk. 1165 * 1166 * This will calculate the parity and then send down any 1167 * changed blocks. 1168 */ 1169 static noinline void finish_rmw(struct btrfs_raid_bio *rbio) 1170 { 1171 struct btrfs_io_context *bioc = rbio->bioc; 1172 const u32 sectorsize = bioc->fs_info->sectorsize; 1173 void **pointers = rbio->finish_pointers; 1174 int nr_data = rbio->nr_data; 1175 /* The total sector number inside the full stripe. */ 1176 int total_sector_nr; 1177 int stripe; 1178 /* Sector number inside a stripe. 
*/ 1179 int sectornr; 1180 bool has_qstripe; 1181 struct bio_list bio_list; 1182 struct bio *bio; 1183 int ret; 1184 1185 bio_list_init(&bio_list); 1186 1187 if (rbio->real_stripes - rbio->nr_data == 1) 1188 has_qstripe = false; 1189 else if (rbio->real_stripes - rbio->nr_data == 2) 1190 has_qstripe = true; 1191 else 1192 BUG(); 1193 1194 /* We should have at least one data sector. */ 1195 ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); 1196 1197 /* at this point we either have a full stripe, 1198 * or we've read the full stripe from the drive. 1199 * recalculate the parity and write the new results. 1200 * 1201 * We're not allowed to add any new bios to the 1202 * bio list here, anyone else that wants to 1203 * change this stripe needs to do their own rmw. 1204 */ 1205 spin_lock_irq(&rbio->bio_list_lock); 1206 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1207 spin_unlock_irq(&rbio->bio_list_lock); 1208 1209 atomic_set(&rbio->error, 0); 1210 1211 /* 1212 * now that we've set rmw_locked, run through the 1213 * bio list one last time and map the page pointers 1214 * 1215 * We don't cache full rbios because we're assuming 1216 * the higher layers are unlikely to use this area of 1217 * the disk again soon. If they do use it again, 1218 * hopefully they will send another full bio. 1219 */ 1220 index_rbio_pages(rbio); 1221 if (!rbio_is_full(rbio)) 1222 cache_rbio_pages(rbio); 1223 else 1224 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1225 1226 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 1227 struct sector_ptr *sector; 1228 1229 /* First collect one sector from each data stripe */ 1230 for (stripe = 0; stripe < nr_data; stripe++) { 1231 sector = sector_in_rbio(rbio, stripe, sectornr, 0); 1232 pointers[stripe] = kmap_local_page(sector->page) + 1233 sector->pgoff; 1234 } 1235 1236 /* Then add the parity stripe */ 1237 sector = rbio_pstripe_sector(rbio, sectornr); 1238 sector->uptodate = 1; 1239 pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; 1240 1241 if (has_qstripe) { 1242 /* 1243 * RAID6, add the qstripe and call the library function 1244 * to fill in our p/q 1245 */ 1246 sector = rbio_qstripe_sector(rbio, sectornr); 1247 sector->uptodate = 1; 1248 pointers[stripe++] = kmap_local_page(sector->page) + 1249 sector->pgoff; 1250 1251 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, 1252 pointers); 1253 } else { 1254 /* raid5 */ 1255 memcpy(pointers[nr_data], pointers[0], sectorsize); 1256 run_xor(pointers + 1, nr_data - 1, sectorsize); 1257 } 1258 for (stripe = stripe - 1; stripe >= 0; stripe--) 1259 kunmap_local(pointers[stripe]); 1260 } 1261 1262 /* 1263 * Start writing. Make bios for everything from the higher layers (the 1264 * bio_list in our rbio) and our P/Q. Ignore everything else. 1265 */ 1266 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 1267 total_sector_nr++) { 1268 struct sector_ptr *sector; 1269 1270 stripe = total_sector_nr / rbio->stripe_nsectors; 1271 sectornr = total_sector_nr % rbio->stripe_nsectors; 1272 1273 /* This vertical stripe has no data, skip it. 
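		 *
		 * A "vertical stripe" is one sector position across all the
		 * stripes: dbitmap bit N covers sector N of every data stripe
		 * plus the matching P/Q sectors.  Illustrative numbers
		 * (assuming 4K sectors and the fixed 64K stripe length, i.e.
		 * 16 sectors per stripe): bit 3 set means byte range
		 * [12K, 16K) of each data stripe, and its parity, must be
		 * written.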
*/ 1274 if (!test_bit(sectornr, &rbio->dbitmap)) 1275 continue; 1276 1277 if (stripe < rbio->nr_data) { 1278 sector = sector_in_rbio(rbio, stripe, sectornr, 1); 1279 if (!sector) 1280 continue; 1281 } else { 1282 sector = rbio_stripe_sector(rbio, stripe, sectornr); 1283 } 1284 1285 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, 1286 sectornr, REQ_OP_WRITE); 1287 if (ret) 1288 goto cleanup; 1289 } 1290 1291 if (likely(!bioc->num_tgtdevs)) 1292 goto write_data; 1293 1294 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 1295 total_sector_nr++) { 1296 struct sector_ptr *sector; 1297 1298 stripe = total_sector_nr / rbio->stripe_nsectors; 1299 sectornr = total_sector_nr % rbio->stripe_nsectors; 1300 1301 if (!bioc->tgtdev_map[stripe]) { 1302 /* 1303 * We can skip the whole stripe completely, note 1304 * total_sector_nr will be increased by one anyway. 1305 */ 1306 ASSERT(sectornr == 0); 1307 total_sector_nr += rbio->stripe_nsectors - 1; 1308 continue; 1309 } 1310 1311 /* This vertical stripe has no data, skip it. */ 1312 if (!test_bit(sectornr, &rbio->dbitmap)) 1313 continue; 1314 1315 if (stripe < rbio->nr_data) { 1316 sector = sector_in_rbio(rbio, stripe, sectornr, 1); 1317 if (!sector) 1318 continue; 1319 } else { 1320 sector = rbio_stripe_sector(rbio, stripe, sectornr); 1321 } 1322 1323 ret = rbio_add_io_sector(rbio, &bio_list, sector, 1324 rbio->bioc->tgtdev_map[stripe], 1325 sectornr, REQ_OP_WRITE); 1326 if (ret) 1327 goto cleanup; 1328 } 1329 1330 write_data: 1331 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); 1332 BUG_ON(atomic_read(&rbio->stripes_pending) == 0); 1333 1334 while ((bio = bio_list_pop(&bio_list))) { 1335 bio->bi_end_io = raid_write_end_io; 1336 1337 if (trace_raid56_write_stripe_enabled()) { 1338 struct raid56_bio_trace_info trace_info = { 0 }; 1339 1340 bio_get_trace_info(rbio, bio, &trace_info); 1341 trace_raid56_write_stripe(rbio, bio, &trace_info); 1342 } 1343 submit_bio(bio); 1344 } 1345 return; 1346 1347 cleanup: 1348 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1349 1350 while ((bio = bio_list_pop(&bio_list))) 1351 bio_put(bio); 1352 } 1353 1354 /* 1355 * helper to find the stripe number for a given bio. Used to figure out which 1356 * stripe has failed. This expects the bio to correspond to a physical disk, 1357 * so it looks up based on physical sector numbers. 1358 */ 1359 static int find_bio_stripe(struct btrfs_raid_bio *rbio, 1360 struct bio *bio) 1361 { 1362 u64 physical = bio->bi_iter.bi_sector; 1363 int i; 1364 struct btrfs_io_stripe *stripe; 1365 1366 physical <<= 9; 1367 1368 for (i = 0; i < rbio->bioc->num_stripes; i++) { 1369 stripe = &rbio->bioc->stripes[i]; 1370 if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && 1371 stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { 1372 return i; 1373 } 1374 } 1375 return -1; 1376 } 1377 1378 /* 1379 * helper to find the stripe number for a given 1380 * bio (before mapping). Used to figure out which stripe has 1381 * failed. This looks up based on logical block numbers. 
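 *
 * Illustrative lookup (hypothetical numbers): raid_map[i] holds the logical
 * start of data stripe i, while the parity slots hold RAID5_P_STRIPE /
 * RAID6_Q_STRIPE.  For a two-data-stripe RAID5 full stripe starting at
 * logical X, raid_map is { X, X + 64K, RAID5_P_STRIPE }; a bio whose logical
 * start is X + 80K falls inside [X + 64K, X + 128K) and so maps to stripe 1.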
1382 */ 1383 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 1384 struct bio *bio) 1385 { 1386 u64 logical = bio->bi_iter.bi_sector << 9; 1387 int i; 1388 1389 for (i = 0; i < rbio->nr_data; i++) { 1390 u64 stripe_start = rbio->bioc->raid_map[i]; 1391 1392 if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) 1393 return i; 1394 } 1395 return -1; 1396 } 1397 1398 /* 1399 * returns -EIO if we had too many failures 1400 */ 1401 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 1402 { 1403 unsigned long flags; 1404 int ret = 0; 1405 1406 spin_lock_irqsave(&rbio->bio_list_lock, flags); 1407 1408 /* we already know this stripe is bad, move on */ 1409 if (rbio->faila == failed || rbio->failb == failed) 1410 goto out; 1411 1412 if (rbio->faila == -1) { 1413 /* first failure on this rbio */ 1414 rbio->faila = failed; 1415 atomic_inc(&rbio->error); 1416 } else if (rbio->failb == -1) { 1417 /* second failure on this rbio */ 1418 rbio->failb = failed; 1419 atomic_inc(&rbio->error); 1420 } else { 1421 ret = -EIO; 1422 } 1423 out: 1424 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 1425 1426 return ret; 1427 } 1428 1429 /* 1430 * helper to fail a stripe based on a physical disk 1431 * bio. 1432 */ 1433 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 1434 struct bio *bio) 1435 { 1436 int failed = find_bio_stripe(rbio, bio); 1437 1438 if (failed < 0) 1439 return -EIO; 1440 1441 return fail_rbio_index(rbio, failed); 1442 } 1443 1444 /* 1445 * For subpage case, we can no longer set page Uptodate directly for 1446 * stripe_pages[], thus we need to locate the sector. 1447 */ 1448 static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, 1449 struct page *page, 1450 unsigned int pgoff) 1451 { 1452 int i; 1453 1454 for (i = 0; i < rbio->nr_sectors; i++) { 1455 struct sector_ptr *sector = &rbio->stripe_sectors[i]; 1456 1457 if (sector->page == page && sector->pgoff == pgoff) 1458 return sector; 1459 } 1460 return NULL; 1461 } 1462 1463 /* 1464 * this sets each page in the bio uptodate. It should only be used on private 1465 * rbio pages, nothing that comes in from the higher layers 1466 */ 1467 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) 1468 { 1469 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1470 struct bio_vec *bvec; 1471 struct bvec_iter_all iter_all; 1472 1473 ASSERT(!bio_flagged(bio, BIO_CLONED)); 1474 1475 bio_for_each_segment_all(bvec, bio, iter_all) { 1476 struct sector_ptr *sector; 1477 int pgoff; 1478 1479 for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; 1480 pgoff += sectorsize) { 1481 sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); 1482 ASSERT(sector); 1483 if (sector) 1484 sector->uptodate = 1; 1485 } 1486 } 1487 } 1488 1489 static void raid56_bio_end_io(struct bio *bio) 1490 { 1491 struct btrfs_raid_bio *rbio = bio->bi_private; 1492 1493 if (bio->bi_status) 1494 fail_bio_stripe(rbio, bio); 1495 else 1496 set_bio_pages_uptodate(rbio, bio); 1497 1498 bio_put(bio); 1499 1500 if (atomic_dec_and_test(&rbio->stripes_pending)) 1501 queue_work(rbio->bioc->fs_info->endio_raid56_workers, 1502 &rbio->end_io_work); 1503 } 1504 1505 /* 1506 * End io handler for the read phase of the RMW cycle. All the bios here are 1507 * physical stripe bios we've read from the disk so we can recalculate the 1508 * parity of the stripe. 
1509 * 1510 * This will usually kick off finish_rmw once all the bios are read in, but it 1511 * may trigger parity reconstruction if we had any errors along the way 1512 */ 1513 static void raid56_rmw_end_io_work(struct work_struct *work) 1514 { 1515 struct btrfs_raid_bio *rbio = 1516 container_of(work, struct btrfs_raid_bio, end_io_work); 1517 1518 if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { 1519 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1520 return; 1521 } 1522 1523 /* 1524 * This will normally call finish_rmw to start our write but if there 1525 * are any failed stripes we'll reconstruct from parity first. 1526 */ 1527 validate_rbio_for_rmw(rbio); 1528 } 1529 1530 /* 1531 * the stripe must be locked by the caller. It will 1532 * unlock after all the writes are done 1533 */ 1534 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) 1535 { 1536 int bios_to_read = 0; 1537 struct bio_list bio_list; 1538 const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; 1539 int ret; 1540 int total_sector_nr; 1541 struct bio *bio; 1542 1543 bio_list_init(&bio_list); 1544 1545 ret = alloc_rbio_pages(rbio); 1546 if (ret) 1547 goto cleanup; 1548 1549 index_rbio_pages(rbio); 1550 1551 atomic_set(&rbio->error, 0); 1552 /* Build a list of bios to read all the missing data sectors. */ 1553 for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; 1554 total_sector_nr++) { 1555 struct sector_ptr *sector; 1556 int stripe = total_sector_nr / rbio->stripe_nsectors; 1557 int sectornr = total_sector_nr % rbio->stripe_nsectors; 1558 1559 /* 1560 * We want to find all the sectors missing from the rbio and 1561 * read them from the disk. If sector_in_rbio() finds a page 1562 * in the bio list we don't need to read it off the stripe. 1563 */ 1564 sector = sector_in_rbio(rbio, stripe, sectornr, 1); 1565 if (sector) 1566 continue; 1567 1568 sector = rbio_stripe_sector(rbio, stripe, sectornr); 1569 /* 1570 * The bio cache may have handed us an uptodate page. If so, 1571 * use it. 1572 */ 1573 if (sector->uptodate) 1574 continue; 1575 1576 ret = rbio_add_io_sector(rbio, &bio_list, sector, 1577 stripe, sectornr, REQ_OP_READ); 1578 if (ret) 1579 goto cleanup; 1580 } 1581 1582 bios_to_read = bio_list_size(&bio_list); 1583 if (!bios_to_read) { 1584 /* 1585 * this can happen if others have merged with 1586 * us, it means there is nothing left to read. 1587 * But if there are missing devices it may not be 1588 * safe to do the full stripe write yet. 1589 */ 1590 goto finish; 1591 } 1592 1593 /* 1594 * The bioc may be freed once we submit the last bio. Make sure not to 1595 * touch it after that. 
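	 *
	 * Setting stripes_pending to the full count before the first
	 * submit_bio() matters for the same reason: an early bio may complete
	 * while we are still submitting the rest, and its end_io must not see
	 * the counter hit zero (and schedule end_io_work) prematurely.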
1596 */ 1597 atomic_set(&rbio->stripes_pending, bios_to_read); 1598 INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); 1599 while ((bio = bio_list_pop(&bio_list))) { 1600 bio->bi_end_io = raid56_bio_end_io; 1601 1602 if (trace_raid56_read_partial_enabled()) { 1603 struct raid56_bio_trace_info trace_info = { 0 }; 1604 1605 bio_get_trace_info(rbio, bio, &trace_info); 1606 trace_raid56_read_partial(rbio, bio, &trace_info); 1607 } 1608 submit_bio(bio); 1609 } 1610 /* the actual write will happen once the reads are done */ 1611 return 0; 1612 1613 cleanup: 1614 rbio_orig_end_io(rbio, BLK_STS_IOERR); 1615 1616 while ((bio = bio_list_pop(&bio_list))) 1617 bio_put(bio); 1618 1619 return -EIO; 1620 1621 finish: 1622 validate_rbio_for_rmw(rbio); 1623 return 0; 1624 } 1625 1626 /* 1627 * if the upper layers pass in a full stripe, we thank them by only allocating 1628 * enough pages to hold the parity, and sending it all down quickly. 1629 */ 1630 static int full_stripe_write(struct btrfs_raid_bio *rbio) 1631 { 1632 int ret; 1633 1634 ret = alloc_rbio_parity_pages(rbio); 1635 if (ret) 1636 return ret; 1637 1638 ret = lock_stripe_add(rbio); 1639 if (ret == 0) 1640 finish_rmw(rbio); 1641 return 0; 1642 } 1643 1644 /* 1645 * partial stripe writes get handed over to async helpers. 1646 * We're really hoping to merge a few more writes into this 1647 * rbio before calculating new parity 1648 */ 1649 static int partial_stripe_write(struct btrfs_raid_bio *rbio) 1650 { 1651 int ret; 1652 1653 ret = lock_stripe_add(rbio); 1654 if (ret == 0) 1655 start_async_work(rbio, rmw_work); 1656 return 0; 1657 } 1658 1659 /* 1660 * sometimes while we were reading from the drive to 1661 * recalculate parity, enough new bios come into create 1662 * a full stripe. So we do a check here to see if we can 1663 * go directly to finish_rmw 1664 */ 1665 static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 1666 { 1667 /* head off into rmw land if we don't have a full stripe */ 1668 if (!rbio_is_full(rbio)) 1669 return partial_stripe_write(rbio); 1670 return full_stripe_write(rbio); 1671 } 1672 1673 /* 1674 * We use plugging call backs to collect full stripes. 1675 * Any time we get a partial stripe write while plugged 1676 * we collect it into a list. When the unplug comes down, 1677 * we sort the list by logical block number and merge 1678 * everything we can into the same rbios 1679 */ 1680 struct btrfs_plug_cb { 1681 struct blk_plug_cb cb; 1682 struct btrfs_fs_info *info; 1683 struct list_head rbio_list; 1684 struct work_struct work; 1685 }; 1686 1687 /* 1688 * rbios on the plug list are sorted for easier merging. 1689 */ 1690 static int plug_cmp(void *priv, const struct list_head *a, 1691 const struct list_head *b) 1692 { 1693 const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 1694 plug_list); 1695 const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 1696 plug_list); 1697 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 1698 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 1699 1700 if (a_sector < b_sector) 1701 return -1; 1702 if (a_sector > b_sector) 1703 return 1; 1704 return 0; 1705 } 1706 1707 static void run_plug(struct btrfs_plug_cb *plug) 1708 { 1709 struct btrfs_raid_bio *cur; 1710 struct btrfs_raid_bio *last = NULL; 1711 1712 /* 1713 * sort our plug list then try to merge 1714 * everything we can in hopes of creating full 1715 * stripes. 
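	 *
	 * plug_cmp() orders the rbios by the logical sector of their first
	 * bio, so writes that landed in the same full stripe end up adjacent
	 * and rbio_can_merge()/merge_rbio() can combine them.  A merged rbio
	 * that covers every data sector takes the full_stripe_write() path
	 * and skips the read half of the RMW cycle entirely.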
1716 */ 1717 list_sort(NULL, &plug->rbio_list, plug_cmp); 1718 while (!list_empty(&plug->rbio_list)) { 1719 cur = list_entry(plug->rbio_list.next, 1720 struct btrfs_raid_bio, plug_list); 1721 list_del_init(&cur->plug_list); 1722 1723 if (rbio_is_full(cur)) { 1724 int ret; 1725 1726 /* we have a full stripe, send it down */ 1727 ret = full_stripe_write(cur); 1728 BUG_ON(ret); 1729 continue; 1730 } 1731 if (last) { 1732 if (rbio_can_merge(last, cur)) { 1733 merge_rbio(last, cur); 1734 __free_raid_bio(cur); 1735 continue; 1736 1737 } 1738 __raid56_parity_write(last); 1739 } 1740 last = cur; 1741 } 1742 if (last) { 1743 __raid56_parity_write(last); 1744 } 1745 kfree(plug); 1746 } 1747 1748 /* 1749 * if the unplug comes from schedule, we have to push the 1750 * work off to a helper thread 1751 */ 1752 static void unplug_work(struct work_struct *work) 1753 { 1754 struct btrfs_plug_cb *plug; 1755 plug = container_of(work, struct btrfs_plug_cb, work); 1756 run_plug(plug); 1757 } 1758 1759 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 1760 { 1761 struct btrfs_plug_cb *plug; 1762 plug = container_of(cb, struct btrfs_plug_cb, cb); 1763 1764 if (from_schedule) { 1765 INIT_WORK(&plug->work, unplug_work); 1766 queue_work(plug->info->rmw_workers, &plug->work); 1767 return; 1768 } 1769 run_plug(plug); 1770 } 1771 1772 /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ 1773 static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) 1774 { 1775 const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1776 const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; 1777 const u64 full_stripe_start = rbio->bioc->raid_map[0]; 1778 const u32 orig_len = orig_bio->bi_iter.bi_size; 1779 const u32 sectorsize = fs_info->sectorsize; 1780 u64 cur_logical; 1781 1782 ASSERT(orig_logical >= full_stripe_start && 1783 orig_logical + orig_len <= full_stripe_start + 1784 rbio->nr_data * BTRFS_STRIPE_LEN); 1785 1786 bio_list_add(&rbio->bio_list, orig_bio); 1787 rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; 1788 1789 /* Update the dbitmap. */ 1790 for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; 1791 cur_logical += sectorsize) { 1792 int bit = ((u32)(cur_logical - full_stripe_start) >> 1793 fs_info->sectorsize_bits) % rbio->stripe_nsectors; 1794 1795 set_bit(bit, &rbio->dbitmap); 1796 } 1797 } 1798 1799 /* 1800 * our main entry point for writes from the rest of the FS. 
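 *
 * The incoming bio is attached with rbio_add_bio() (defined above), which
 * also marks the dirty vertical stripes in dbitmap.  Worked example
 * (assuming 4K sectors, so stripe_nsectors is 16): a 4K write at
 * full_stripe_start + 68K gives (68K >> sectorsize_bits) == 17 and
 * 17 % 16 == 1, i.e. bit 1, the second sector position of the full stripe.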
1801 */ 1802 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) 1803 { 1804 struct btrfs_fs_info *fs_info = bioc->fs_info; 1805 struct btrfs_raid_bio *rbio; 1806 struct btrfs_plug_cb *plug = NULL; 1807 struct blk_plug_cb *cb; 1808 int ret = 0; 1809 1810 rbio = alloc_rbio(fs_info, bioc); 1811 if (IS_ERR(rbio)) { 1812 ret = PTR_ERR(rbio); 1813 goto fail; 1814 } 1815 rbio->operation = BTRFS_RBIO_WRITE; 1816 rbio_add_bio(rbio, bio); 1817 1818 /* 1819 * don't plug on full rbios, just get them out the door 1820 * as quickly as we can 1821 */ 1822 if (rbio_is_full(rbio)) { 1823 ret = full_stripe_write(rbio); 1824 if (ret) { 1825 __free_raid_bio(rbio); 1826 goto fail; 1827 } 1828 return; 1829 } 1830 1831 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); 1832 if (cb) { 1833 plug = container_of(cb, struct btrfs_plug_cb, cb); 1834 if (!plug->info) { 1835 plug->info = fs_info; 1836 INIT_LIST_HEAD(&plug->rbio_list); 1837 } 1838 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1839 } else { 1840 ret = __raid56_parity_write(rbio); 1841 if (ret) { 1842 __free_raid_bio(rbio); 1843 goto fail; 1844 } 1845 } 1846 1847 return; 1848 1849 fail: 1850 bio->bi_status = errno_to_blk_status(ret); 1851 bio_endio(bio); 1852 } 1853 1854 /* 1855 * all parity reconstruction happens here. We've read in everything 1856 * we can find from the drives and this does the heavy lifting of 1857 * sorting the good from the bad. 1858 */ 1859 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) 1860 { 1861 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1862 int sectornr, stripe; 1863 void **pointers; 1864 void **unmap_array; 1865 int faila = -1, failb = -1; 1866 blk_status_t err; 1867 int i; 1868 1869 /* 1870 * This array stores the pointer for each sector, thus it has the extra 1871 * pgoff value added from each sector 1872 */ 1873 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1874 if (!pointers) { 1875 err = BLK_STS_RESOURCE; 1876 goto cleanup_io; 1877 } 1878 1879 /* 1880 * Store copy of pointers that does not get reordered during 1881 * reconstruction so that kunmap_local works. 1882 */ 1883 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1884 if (!unmap_array) { 1885 err = BLK_STS_RESOURCE; 1886 goto cleanup_pointers; 1887 } 1888 1889 faila = rbio->faila; 1890 failb = rbio->failb; 1891 1892 if (rbio->operation == BTRFS_RBIO_READ_REBUILD || 1893 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { 1894 spin_lock_irq(&rbio->bio_list_lock); 1895 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1896 spin_unlock_irq(&rbio->bio_list_lock); 1897 } 1898 1899 index_rbio_pages(rbio); 1900 1901 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 1902 struct sector_ptr *sector; 1903 1904 /* 1905 * Now we just use bitmap to mark the horizontal stripes in 1906 * which we have data when doing parity scrub. 
1907 */ 1908 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 1909 !test_bit(sectornr, &rbio->dbitmap)) 1910 continue; 1911 1912 /* 1913 * Setup our array of pointers with sectors from each stripe 1914 * 1915 * NOTE: store a duplicate array of pointers to preserve the 1916 * pointer order 1917 */ 1918 for (stripe = 0; stripe < rbio->real_stripes; stripe++) { 1919 /* 1920 * If we're rebuilding a read, we have to use 1921 * pages from the bio list 1922 */ 1923 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || 1924 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && 1925 (stripe == faila || stripe == failb)) { 1926 sector = sector_in_rbio(rbio, stripe, sectornr, 0); 1927 } else { 1928 sector = rbio_stripe_sector(rbio, stripe, sectornr); 1929 } 1930 ASSERT(sector->page); 1931 pointers[stripe] = kmap_local_page(sector->page) + 1932 sector->pgoff; 1933 unmap_array[stripe] = pointers[stripe]; 1934 } 1935 1936 /* All raid6 handling here */ 1937 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { 1938 /* Single failure, rebuild from parity raid5 style */ 1939 if (failb < 0) { 1940 if (faila == rbio->nr_data) { 1941 /* 1942 * Just the P stripe has failed, without 1943 * a bad data or Q stripe. 1944 * TODO, we should redo the xor here. 1945 */ 1946 err = BLK_STS_IOERR; 1947 goto cleanup; 1948 } 1949 /* 1950 * a single failure in raid6 is rebuilt 1951 * in the pstripe code below 1952 */ 1953 goto pstripe; 1954 } 1955 1956 /* make sure our ps and qs are in order */ 1957 if (faila > failb) 1958 swap(faila, failb); 1959 1960 /* if the q stripe is failed, do a pstripe reconstruction 1961 * from the xors. 1962 * If both the q stripe and the P stripe are failed, we're 1963 * here due to a crc mismatch and we can't give them the 1964 * data they want 1965 */ 1966 if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { 1967 if (rbio->bioc->raid_map[faila] == 1968 RAID5_P_STRIPE) { 1969 err = BLK_STS_IOERR; 1970 goto cleanup; 1971 } 1972 /* 1973 * otherwise we have one bad data stripe and 1974 * a good P stripe. raid5! 1975 */ 1976 goto pstripe; 1977 } 1978 1979 if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { 1980 raid6_datap_recov(rbio->real_stripes, 1981 sectorsize, faila, pointers); 1982 } else { 1983 raid6_2data_recov(rbio->real_stripes, 1984 sectorsize, faila, failb, 1985 pointers); 1986 } 1987 } else { 1988 void *p; 1989 1990 /* rebuild from P stripe here (raid5 or raid6) */ 1991 BUG_ON(failb != -1); 1992 pstripe: 1993 /* Copy parity block into failed block to start with */ 1994 memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); 1995 1996 /* rearrange the pointer array */ 1997 p = pointers[faila]; 1998 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) 1999 pointers[stripe] = pointers[stripe + 1]; 2000 pointers[rbio->nr_data - 1] = p; 2001 2002 /* xor in the rest */ 2003 run_xor(pointers, rbio->nr_data - 1, sectorsize); 2004 } 2005 /* if we're doing this rebuild as part of an rmw, go through 2006 * and set all of our private rbio pages in the 2007 * failed stripes as uptodate. This way finish_rmw will 2008 * know they can be trusted. 
		 * If this was a read reconstruction, other endio functions
		 * will fiddle the uptodate bits.
		 */
		if (rbio->operation == BTRFS_RBIO_WRITE) {
			for (i = 0; i < rbio->stripe_nsectors; i++) {
				if (faila != -1) {
					sector = rbio_stripe_sector(rbio, faila, i);
					sector->uptodate = 1;
				}
				if (failb != -1) {
					sector = rbio_stripe_sector(rbio, failb, i);
					sector->uptodate = 1;
				}
			}
		}
		for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
			kunmap_local(unmap_array[stripe]);
	}

	err = BLK_STS_OK;
cleanup:
	kfree(unmap_array);
cleanup_pointers:
	kfree(pointers);

cleanup_io:
	/*
	 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
	 * valid rbio which is consistent with on-disk content, thus such a
	 * valid rbio can be cached to avoid further disk reads.
	 */
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
		/*
		 * - In case of two failures, where rbio->failb != -1:
		 *
		 *   Do not cache this rbio since the above read reconstruction
		 *   (raid6_datap_recov() or raid6_2data_recov()) may have
		 *   changed some content of stripes which are not identical to
		 *   on-disk content any more, otherwise, a later write/recover
		 *   may steal stripe_pages from this rbio and end up with
		 *   corruptions or rebuild failures.
		 *
		 * - In case of single failure, where rbio->failb == -1:
		 *
		 *   Cache this rbio iff the above read reconstruction is
		 *   executed without problems.
		 */
		if (err == BLK_STS_OK && rbio->failb < 0)
			cache_rbio_pages(rbio);
		else
			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

		rbio_orig_end_io(rbio, err);
	} else if (err == BLK_STS_OK) {
		rbio->faila = -1;
		rbio->failb = -1;

		if (rbio->operation == BTRFS_RBIO_WRITE)
			finish_rmw(rbio);
		else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
			finish_parity_scrub(rbio, 0);
		else
			BUG();
	} else {
		rbio_orig_end_io(rbio, err);
	}
}

/*
 * This is called only for stripes we've read from disk to reconstruct the
 * parity.
 */
static void raid_recover_end_io_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio =
		container_of(work, struct btrfs_raid_bio, end_io_work);

	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
		rbio_orig_end_io(rbio, BLK_STS_IOERR);
	else
		__raid_recover_end_io(rbio);
}

/*
 * Reads everything we need off the disk to reconstruct the parity. The endio
 * handlers trigger the final reconstruction when the IO is done.
 *
 * This is used both for reads from the higher layers and for the parity
 * construction required to finish an rmw cycle.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int total_sector_nr;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);

	/*
	 * Read everything that hasn't failed. However, this time we will not
	 * trust any cached sector, as it may hold stale data in ranges that
	 * the higher layer is not reading.
	 * So here we always re-read everything in the recovery path.
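	 * Stripes recorded in ->faila / ->failb are skipped entirely below and
	 * are only counted towards ->error.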
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		struct sector_ptr *sector;

		if (rbio->faila == stripe || rbio->failb == stripe) {
			atomic_inc(&rbio->error);
			/* Skip the current stripe. */
			ASSERT(sectornr == 0);
			total_sector_nr += rbio->stripe_nsectors - 1;
			continue;
		}
		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret < 0)
			goto cleanup;
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * We might have no bios to read just because the pages were
		 * up to date, or we might have no bios to read because the
		 * devices were gone.
		 */
		if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
			__raid_recover_end_io(rbio);
			return 0;
		} else {
			goto cleanup;
		}
	}

	/*
	 * The bioc may be freed once we submit the last bio. Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid56_bio_end_io;

		if (trace_raid56_scrub_read_recover_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}

	return 0;

cleanup:
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
		rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return -EIO;
}

/*
 * This is the main entry point for reads from the higher layers. It is really
 * only called when the normal read path had a failure, so we assume the bio
 * they send down corresponds to a failed part of the drive.
 */
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
			   int mirror_num)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio)) {
		bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
		goto out_end_bio;
	}

	rbio->operation = BTRFS_RBIO_READ_REBUILD;
	rbio_add_bio(rbio, bio);

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		btrfs_warn(fs_info,
"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
			   __func__, bio->bi_iter.bi_sector << 9,
			   (u64)bio->bi_iter.bi_size, bioc->map_type);
		__free_raid_bio(rbio);
		bio->bi_status = BLK_STS_IOERR;
		goto out_end_bio;
	}

	/*
	 * Loop retry:
	 * for 'mirror_num == 2', reconstruct from all other stripes.
	 * for 'mirror_num > 2', select a stripe to fail on every retry.
	 */
	if (mirror_num > 2) {
		/*
		 * 'mirror_num == 3' is to fail the P stripe and reconstruct
		 * from the Q stripe. 'mirror_num > 3' is to fail a data stripe
		 * and reconstruct from the P and Q stripes.
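		 * For example, with 4 data stripes plus P and Q
		 * (real_stripes == 6): mirror_num == 3 gives failb == 4, the
		 * P stripe, so we rebuild from Q; mirror_num == 4 gives
		 * failb == 3, a data stripe (shifted down by one below if it
		 * collides with the stripe that already failed).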
		 */
		rbio->failb = rbio->real_stripes - (mirror_num - 1);
		ASSERT(rbio->failb > 0);
		if (rbio->failb <= rbio->faila)
			rbio->failb--;
	}

	/*
	 * If lock_stripe_add() added our rbio to the list of rbios that will
	 * be handled after the current lock owner is done, there is nothing
	 * left for us to do here.
	 */
	if (lock_stripe_add(rbio))
		return;

	__raid56_parity_recover(rbio);
	return;

out_end_bio:
	bio_endio(bio);
}

static void rmw_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}

/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Caller must have already increased bio_counter for getting @bioc.
 *
 * Note: We must make sure all the pages added to the scrub/replace raid bio
 * are correct and will not change during the scrub/replace, i.e. those pages
 * hold nothing but metadata or file data protected by checksums.
 */

struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/*
	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
	 * to the end position, so this search can start from the first parity
	 * stripe.
	 */
	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
		if (bioc->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}
	ASSERT(i < rbio->real_stripes);

	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
	return rbio;
}

/* Used for both parity scrub and missing. */
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
			    unsigned int pgoff, u64 logical)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int stripe_offset;
	int index;

	ASSERT(logical >= rbio->bioc->raid_map[0]);
	ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
	       BTRFS_STRIPE_LEN * rbio->nr_data);
	stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
	index = stripe_offset / sectorsize;
	rbio->bio_sectors[index].page = page;
	rbio->bio_sectors[index].pgoff = pgoff;
}

/*
 * We only scrub the parity for those horizontal stripes where we have correct
 * data, so we don't need to allocate pages for all the stripes.
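 * Note that stripe pages are indexed by byte offset (one page may cover
 * several sectors when sectorsize < PAGE_SIZE), which is why the loop below
 * derives the page index from total_sector_nr * sectorsize.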
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int total_sector_nr;

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct page *page;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;

		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;
		if (rbio->stripe_pages[index])
			continue;
		page = alloc_page(GFP_NOFS);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[index] = page;
	}
	index_stripe_sectors(rbio);
	return 0;
}

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	const u32 sectorsize = bioc->fs_info->sectorsize;
	void **pointers = rbio->finish_pointers;
	unsigned long *pbitmap = &rbio->finish_pbitmap;
	int nr_data = rbio->nr_data;
	int stripe;
	int sectornr;
	bool has_qstripe;
	struct sector_ptr p_sector = { 0 };
	struct sector_ptr q_sector = { 0 };
	struct bio_list bio_list;
	struct bio *bio;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
		is_replace = 1;
		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
	}

	/*
	 * The higher layers (the scrubber) are unlikely to use this area of
	 * the disk again soon, so don't cache it.
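	 * Clearing RBIO_CACHE_READY_BIT below makes sure nobody treats these
	 * stripe pages as valid cached data.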
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	if (!need_check)
		goto writeback;

	p_sector.page = alloc_page(GFP_NOFS);
	if (!p_sector.page)
		goto cleanup;
	p_sector.pgoff = 0;
	p_sector.uptodate = 1;

	if (has_qstripe) {
		/* RAID6, allocate and map temp space for the Q stripe */
		q_sector.page = alloc_page(GFP_NOFS);
		if (!q_sector.page) {
			__free_page(p_sector.page);
			p_sector.page = NULL;
			goto cleanup;
		}
		q_sector.pgoff = 0;
		q_sector.uptodate = 1;
		pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
	}

	atomic_set(&rbio->error, 0);

	/* Map the parity stripe just once */
	pointers[nr_data] = kmap_local_page(p_sector.page);

	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;
		void *parity;

		/* First collect one sector from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
			pointers[stripe] = kmap_local_page(sector->page) +
					   sector->pgoff;
		}

		if (has_qstripe) {
			/* RAID6, call the library function to fill in our P/Q */
			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], sectorsize);
			run_xor(pointers + 1, nr_data - 1, sectorsize);
		}

		/* Check scrubbing parity and repair it */
		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		parity = kmap_local_page(sector->page) + sector->pgoff;
		if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
			memcpy(parity, pointers[rbio->scrubp], sectorsize);
		else
			/* Parity is right, needn't writeback */
			bitmap_clear(&rbio->dbitmap, sectornr, 1);
		kunmap_local(parity);

		for (stripe = nr_data - 1; stripe >= 0; stripe--)
			kunmap_local(pointers[stripe]);
	}

	kunmap_local(pointers[nr_data]);
	__free_page(p_sector.page);
	p_sector.page = NULL;
	if (q_sector.page) {
		kunmap_local(pointers[rbio->real_stripes - 1]);
		__free_page(q_sector.page);
		q_sector.page = NULL;
	}

writeback:
	/*
	 * Time to start writing. We write back the repaired sectors of the
	 * parity stripe we are scrubbing, and for dev-replace also the parity
	 * sectors recorded in pbitmap to the replace target. Everything else
	 * is left alone.
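	 * Parity sectors that already matched the recomputed parity were
	 * cleared from ->dbitmap above, so no write is queued for them on the
	 * original device.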
	 */
	for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 bioc->tgtdev_map[rbio->scrubp],
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

submit_write:
	nr_data = bio_list_size(&bio_list);
	if (!nr_data) {
		/* Every parity is right */
		rbio_orig_end_io(rbio, BLK_STS_OK);
		return;
	}

	atomic_set(&rbio->stripes_pending, nr_data);

	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid_write_end_io;

		if (trace_raid56_scrub_write_stripe_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_scrub_write_stripe(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}

static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}

/*
 * While we're doing the parity check and repair, we could have errors
 * in reading pages off the disk. This checks for errors and if we're
 * not able to read the page it'll trigger parity reconstruction. The
 * parity scrub will be finished after we've reconstructed the failed
 * stripes.
 */
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
		goto cleanup;

	if (rbio->faila >= 0 || rbio->failb >= 0) {
		int dfail = 0, failp = -1;

		if (is_data_stripe(rbio, rbio->faila))
			dfail++;
		else if (is_parity_stripe(rbio->faila))
			failp = rbio->faila;

		if (is_data_stripe(rbio, rbio->failb))
			dfail++;
		else if (is_parity_stripe(rbio->failb))
			failp = rbio->failb;

		/*
		 * Because we cannot use the parity stripe that is being
		 * scrubbed to repair data, our repair capability is reduced
		 * by one (in the case of RAID5 we cannot repair anything).
		 */
		if (dfail > rbio->bioc->max_errors - 1)
			goto cleanup;

		/*
		 * If all the data is good and only the parity failed, just
		 * repair (rewrite) the parity.
		 */
		if (dfail == 0) {
			finish_parity_scrub(rbio, 0);
			return;
		}

		/*
		 * Here we have one corrupted data stripe and one corrupted
		 * parity on RAID6. If the corrupted parity is the one being
		 * scrubbed, we can luckily use the other parity to repair the
		 * data; otherwise we cannot repair the data stripe.
		 */
		if (failp != rbio->scrubp)
			goto cleanup;

		__raid_recover_end_io(rbio);
	} else {
		finish_parity_scrub(rbio, 1);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);
}

/*
 * This is the end io work for the read phase of the parity scrub cycle.
 * All the bios here are physical stripe bios we've read from the disk so we
 * can recalculate the parity of the stripe.
 *
 * This will usually kick off finish_parity_scrub once all the bios are read
 * in, but it may trigger parity reconstruction if we had any errors along the
 * way.
 */
static void raid56_parity_scrub_end_io_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio =
		container_of(work, struct btrfs_raid_bio, end_io_work);

	/*
	 * This will normally call finish_parity_scrub to start our write, but
	 * if there are any failed stripes we'll reconstruct from parity first.
	 */
	validate_rbio_for_parity_scrub(rbio);
}

static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int total_sector_nr;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);
	/* Build a list of bios to read all the missing parts. */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		struct sector_ptr *sector;

		/* No data in the vertical stripe, no need to read. */
		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;

		/*
		 * We want to find all the sectors missing from the rbio and
		 * read them from the disk. If sector_in_rbio() finds a sector
		 * in the bio list we don't need to read it off the stripe.
		 */
		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
		if (sector)
			continue;

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		/*
		 * The bio cache may have handed us an uptodate sector. If so,
		 * use it.
		 */
		if (sector->uptodate)
			continue;

		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret)
			goto cleanup;
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * This can happen if others have merged with us, meaning
		 * there is nothing left to read. But if there are missing
		 * devices it may not be safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * The bioc may be freed once we submit the last bio. Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid56_bio_end_io;

		if (trace_raid56_scrub_read_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_scrub_read(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
	/* The actual write will happen once the reads are done. */
	return;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return;

finish:
	validate_rbio_for_parity_scrub(rbio);
}

static void scrub_parity_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_parity_scrub_stripe(rbio);
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, scrub_parity_work);
}

/* The following code is used for dev replace of a missing RAID 5/6 device. */

struct btrfs_raid_bio *
raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio))
		return NULL;

	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		btrfs_warn_rl(fs_info,
	"can not determine the failed stripe number for full stripe %llu",
			      bioc->raid_map[0]);
		__free_raid_bio(rbio);
		return NULL;
	}

	return rbio;
}

void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		start_async_work(rbio, read_rebuild_work);
}