1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2012 Fusion-io All rights reserved. 4 * Copyright (C) 2012 Intel Corp. All rights reserved. 5 */ 6 7 #include <linux/sched.h> 8 #include <linux/bio.h> 9 #include <linux/slab.h> 10 #include <linux/blkdev.h> 11 #include <linux/raid/pq.h> 12 #include <linux/hash.h> 13 #include <linux/list_sort.h> 14 #include <linux/raid/xor.h> 15 #include <linux/mm.h> 16 #include "messages.h" 17 #include "misc.h" 18 #include "ctree.h" 19 #include "disk-io.h" 20 #include "volumes.h" 21 #include "raid56.h" 22 #include "async-thread.h" 23 #include "file-item.h" 24 #include "btrfs_inode.h" 25 26 /* set when additional merges to this rbio are not allowed */ 27 #define RBIO_RMW_LOCKED_BIT 1 28 29 /* 30 * set when this rbio is sitting in the hash, but it is just a cache 31 * of past RMW 32 */ 33 #define RBIO_CACHE_BIT 2 34 35 /* 36 * set when it is safe to trust the stripe_pages for caching 37 */ 38 #define RBIO_CACHE_READY_BIT 3 39 40 #define RBIO_CACHE_SIZE 1024 41 42 #define BTRFS_STRIPE_HASH_TABLE_BITS 11 43 44 /* Used by the raid56 code to lock stripes for read/modify/write */ 45 struct btrfs_stripe_hash { 46 struct list_head hash_list; 47 spinlock_t lock; 48 }; 49 50 /* Used by the raid56 code to lock stripes for read/modify/write */ 51 struct btrfs_stripe_hash_table { 52 struct list_head stripe_cache; 53 spinlock_t cache_lock; 54 int cache_size; 55 struct btrfs_stripe_hash table[]; 56 }; 57 58 /* 59 * A bvec like structure to present a sector inside a page. 60 * 61 * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. 62 */ 63 struct sector_ptr { 64 struct page *page; 65 unsigned int pgoff:24; 66 unsigned int uptodate:8; 67 }; 68 69 static void rmw_rbio_work(struct work_struct *work); 70 static void rmw_rbio_work_locked(struct work_struct *work); 71 static void index_rbio_pages(struct btrfs_raid_bio *rbio); 72 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 73 74 static int finish_parity_scrub(struct btrfs_raid_bio *rbio); 75 static void scrub_rbio_work_locked(struct work_struct *work); 76 77 static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) 78 { 79 bitmap_free(rbio->error_bitmap); 80 kfree(rbio->stripe_pages); 81 kfree(rbio->bio_sectors); 82 kfree(rbio->stripe_sectors); 83 kfree(rbio->finish_pointers); 84 } 85 86 static void free_raid_bio(struct btrfs_raid_bio *rbio) 87 { 88 int i; 89 90 if (!refcount_dec_and_test(&rbio->refs)) 91 return; 92 93 WARN_ON(!list_empty(&rbio->stripe_cache)); 94 WARN_ON(!list_empty(&rbio->hash_list)); 95 WARN_ON(!bio_list_empty(&rbio->bio_list)); 96 97 for (i = 0; i < rbio->nr_pages; i++) { 98 if (rbio->stripe_pages[i]) { 99 __free_page(rbio->stripe_pages[i]); 100 rbio->stripe_pages[i] = NULL; 101 } 102 } 103 104 btrfs_put_bioc(rbio->bioc); 105 free_raid_bio_pointers(rbio); 106 kfree(rbio); 107 } 108 109 static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) 110 { 111 INIT_WORK(&rbio->work, work_func); 112 queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); 113 } 114 115 /* 116 * the stripe hash table is used for locking, and to collect 117 * bios in hopes of making a full stripe 118 */ 119 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) 120 { 121 struct btrfs_stripe_hash_table *table; 122 struct btrfs_stripe_hash_table *x; 123 struct btrfs_stripe_hash *cur; 124 struct btrfs_stripe_hash *h; 125 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; 126 int i; 127 128 if (info->stripe_hash_table) 129 return 0; 130 131 /* 132 * The table is 
large, starting with order 4 and can go as high as 133 * order 7 in case lock debugging is turned on. 134 * 135 * Try harder to allocate and fallback to vmalloc to lower the chance 136 * of a failing mount. 137 */ 138 table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); 139 if (!table) 140 return -ENOMEM; 141 142 spin_lock_init(&table->cache_lock); 143 INIT_LIST_HEAD(&table->stripe_cache); 144 145 h = table->table; 146 147 for (i = 0; i < num_entries; i++) { 148 cur = h + i; 149 INIT_LIST_HEAD(&cur->hash_list); 150 spin_lock_init(&cur->lock); 151 } 152 153 x = cmpxchg(&info->stripe_hash_table, NULL, table); 154 kvfree(x); 155 return 0; 156 } 157 158 /* 159 * caching an rbio means to copy anything from the 160 * bio_sectors array into the stripe_pages array. We 161 * use the page uptodate bit in the stripe cache array 162 * to indicate if it has valid data 163 * 164 * once the caching is done, we set the cache ready 165 * bit. 166 */ 167 static void cache_rbio_pages(struct btrfs_raid_bio *rbio) 168 { 169 int i; 170 int ret; 171 172 ret = alloc_rbio_pages(rbio); 173 if (ret) 174 return; 175 176 for (i = 0; i < rbio->nr_sectors; i++) { 177 /* Some range not covered by bio (partial write), skip it */ 178 if (!rbio->bio_sectors[i].page) { 179 /* 180 * Even if the sector is not covered by bio, if it is 181 * a data sector it should still be uptodate as it is 182 * read from disk. 183 */ 184 if (i < rbio->nr_data * rbio->stripe_nsectors) 185 ASSERT(rbio->stripe_sectors[i].uptodate); 186 continue; 187 } 188 189 ASSERT(rbio->stripe_sectors[i].page); 190 memcpy_page(rbio->stripe_sectors[i].page, 191 rbio->stripe_sectors[i].pgoff, 192 rbio->bio_sectors[i].page, 193 rbio->bio_sectors[i].pgoff, 194 rbio->bioc->fs_info->sectorsize); 195 rbio->stripe_sectors[i].uptodate = 1; 196 } 197 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 198 } 199 200 /* 201 * we hash on the first logical address of the stripe 202 */ 203 static int rbio_bucket(struct btrfs_raid_bio *rbio) 204 { 205 u64 num = rbio->bioc->full_stripe_logical; 206 207 /* 208 * we shift down quite a bit. We're using byte 209 * addressing, and most of the lower bits are zeros. 210 * This tends to upset hash_64, and it consistently 211 * returns just one or two different values. 212 * 213 * shifting off the lower bits fixes things. 214 */ 215 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); 216 } 217 218 static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, 219 unsigned int page_nr) 220 { 221 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 222 const u32 sectors_per_page = PAGE_SIZE / sectorsize; 223 int i; 224 225 ASSERT(page_nr < rbio->nr_pages); 226 227 for (i = sectors_per_page * page_nr; 228 i < sectors_per_page * page_nr + sectors_per_page; 229 i++) { 230 if (!rbio->stripe_sectors[i].uptodate) 231 return false; 232 } 233 return true; 234 } 235 236 /* 237 * Update the stripe_sectors[] array to use correct page and pgoff 238 * 239 * Should be called every time any page pointer in stripes_pages[] got modified. 
240 */ 241 static void index_stripe_sectors(struct btrfs_raid_bio *rbio) 242 { 243 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 244 u32 offset; 245 int i; 246 247 for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { 248 int page_index = offset >> PAGE_SHIFT; 249 250 ASSERT(page_index < rbio->nr_pages); 251 rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; 252 rbio->stripe_sectors[i].pgoff = offset_in_page(offset); 253 } 254 } 255 256 static void steal_rbio_page(struct btrfs_raid_bio *src, 257 struct btrfs_raid_bio *dest, int page_nr) 258 { 259 const u32 sectorsize = src->bioc->fs_info->sectorsize; 260 const u32 sectors_per_page = PAGE_SIZE / sectorsize; 261 int i; 262 263 if (dest->stripe_pages[page_nr]) 264 __free_page(dest->stripe_pages[page_nr]); 265 dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; 266 src->stripe_pages[page_nr] = NULL; 267 268 /* Also update the sector->uptodate bits. */ 269 for (i = sectors_per_page * page_nr; 270 i < sectors_per_page * page_nr + sectors_per_page; i++) 271 dest->stripe_sectors[i].uptodate = true; 272 } 273 274 static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) 275 { 276 const int sector_nr = (page_nr << PAGE_SHIFT) >> 277 rbio->bioc->fs_info->sectorsize_bits; 278 279 /* 280 * We have ensured PAGE_SIZE is aligned with sectorsize, thus 281 * we won't have a page which is half data half parity. 282 * 283 * Thus if the first sector of the page belongs to data stripes, then 284 * the full page belongs to data stripes. 285 */ 286 return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); 287 } 288 289 /* 290 * Stealing an rbio means taking all the uptodate pages from the stripe array 291 * in the source rbio and putting them into the destination rbio. 292 * 293 * This will also update the involved stripe_sectors[] which are referring to 294 * the old pages. 295 */ 296 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) 297 { 298 int i; 299 300 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) 301 return; 302 303 for (i = 0; i < dest->nr_pages; i++) { 304 struct page *p = src->stripe_pages[i]; 305 306 /* 307 * We don't need to steal P/Q pages as they will always be 308 * regenerated for RMW or full write anyway. 309 */ 310 if (!is_data_stripe_page(src, i)) 311 continue; 312 313 /* 314 * If @src already has RBIO_CACHE_READY_BIT, it should have 315 * all data stripe pages present and uptodate. 316 */ 317 ASSERT(p); 318 ASSERT(full_page_sectors_uptodate(src, i)); 319 steal_rbio_page(src, dest, i); 320 } 321 index_stripe_sectors(dest); 322 index_stripe_sectors(src); 323 } 324 325 /* 326 * merging means we take the bio_list from the victim and 327 * splice it into the destination. The victim should 328 * be discarded afterwards. 329 * 330 * must be called with dest->rbio_list_lock held 331 */ 332 static void merge_rbio(struct btrfs_raid_bio *dest, 333 struct btrfs_raid_bio *victim) 334 { 335 bio_list_merge(&dest->bio_list, &victim->bio_list); 336 dest->bio_list_bytes += victim->bio_list_bytes; 337 /* Also inherit the bitmaps from @victim. */ 338 bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, 339 dest->stripe_nsectors); 340 bio_list_init(&victim->bio_list); 341 } 342 343 /* 344 * used to prune items that are in the cache. The caller 345 * must hold the hash table lock. 
346 */ 347 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 348 { 349 int bucket = rbio_bucket(rbio); 350 struct btrfs_stripe_hash_table *table; 351 struct btrfs_stripe_hash *h; 352 int freeit = 0; 353 354 /* 355 * check the bit again under the hash table lock. 356 */ 357 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 358 return; 359 360 table = rbio->bioc->fs_info->stripe_hash_table; 361 h = table->table + bucket; 362 363 /* hold the lock for the bucket because we may be 364 * removing it from the hash table 365 */ 366 spin_lock(&h->lock); 367 368 /* 369 * hold the lock for the bio list because we need 370 * to make sure the bio list is empty 371 */ 372 spin_lock(&rbio->bio_list_lock); 373 374 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { 375 list_del_init(&rbio->stripe_cache); 376 table->cache_size -= 1; 377 freeit = 1; 378 379 /* if the bio list isn't empty, this rbio is 380 * still involved in an IO. We take it out 381 * of the cache list, and drop the ref that 382 * was held for the list. 383 * 384 * If the bio_list was empty, we also remove 385 * the rbio from the hash_table, and drop 386 * the corresponding ref 387 */ 388 if (bio_list_empty(&rbio->bio_list)) { 389 if (!list_empty(&rbio->hash_list)) { 390 list_del_init(&rbio->hash_list); 391 refcount_dec(&rbio->refs); 392 BUG_ON(!list_empty(&rbio->plug_list)); 393 } 394 } 395 } 396 397 spin_unlock(&rbio->bio_list_lock); 398 spin_unlock(&h->lock); 399 400 if (freeit) 401 free_raid_bio(rbio); 402 } 403 404 /* 405 * prune a given rbio from the cache 406 */ 407 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) 408 { 409 struct btrfs_stripe_hash_table *table; 410 411 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) 412 return; 413 414 table = rbio->bioc->fs_info->stripe_hash_table; 415 416 spin_lock(&table->cache_lock); 417 __remove_rbio_from_cache(rbio); 418 spin_unlock(&table->cache_lock); 419 } 420 421 /* 422 * remove everything in the cache 423 */ 424 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) 425 { 426 struct btrfs_stripe_hash_table *table; 427 struct btrfs_raid_bio *rbio; 428 429 table = info->stripe_hash_table; 430 431 spin_lock(&table->cache_lock); 432 while (!list_empty(&table->stripe_cache)) { 433 rbio = list_entry(table->stripe_cache.next, 434 struct btrfs_raid_bio, 435 stripe_cache); 436 __remove_rbio_from_cache(rbio); 437 } 438 spin_unlock(&table->cache_lock); 439 } 440 441 /* 442 * remove all cached entries and free the hash table 443 * used by unmount 444 */ 445 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) 446 { 447 if (!info->stripe_hash_table) 448 return; 449 btrfs_clear_rbio_cache(info); 450 kvfree(info->stripe_hash_table); 451 info->stripe_hash_table = NULL; 452 } 453 454 /* 455 * insert an rbio into the stripe cache. It 456 * must have already been prepared by calling 457 * cache_rbio_pages 458 * 459 * If this rbio was already cached, it gets 460 * moved to the front of the lru. 461 * 462 * If the size of the rbio cache is too big, we 463 * prune an item. 
464 */ 465 static void cache_rbio(struct btrfs_raid_bio *rbio) 466 { 467 struct btrfs_stripe_hash_table *table; 468 469 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) 470 return; 471 472 table = rbio->bioc->fs_info->stripe_hash_table; 473 474 spin_lock(&table->cache_lock); 475 spin_lock(&rbio->bio_list_lock); 476 477 /* bump our ref if we were not in the list before */ 478 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) 479 refcount_inc(&rbio->refs); 480 481 if (!list_empty(&rbio->stripe_cache)){ 482 list_move(&rbio->stripe_cache, &table->stripe_cache); 483 } else { 484 list_add(&rbio->stripe_cache, &table->stripe_cache); 485 table->cache_size += 1; 486 } 487 488 spin_unlock(&rbio->bio_list_lock); 489 490 if (table->cache_size > RBIO_CACHE_SIZE) { 491 struct btrfs_raid_bio *found; 492 493 found = list_entry(table->stripe_cache.prev, 494 struct btrfs_raid_bio, 495 stripe_cache); 496 497 if (found != rbio) 498 __remove_rbio_from_cache(found); 499 } 500 501 spin_unlock(&table->cache_lock); 502 } 503 504 /* 505 * helper function to run the xor_blocks api. It is only 506 * able to do MAX_XOR_BLOCKS at a time, so we need to 507 * loop through. 508 */ 509 static void run_xor(void **pages, int src_cnt, ssize_t len) 510 { 511 int src_off = 0; 512 int xor_src_cnt = 0; 513 void *dest = pages[src_cnt]; 514 515 while(src_cnt > 0) { 516 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); 517 xor_blocks(xor_src_cnt, len, dest, pages + src_off); 518 519 src_cnt -= xor_src_cnt; 520 src_off += xor_src_cnt; 521 } 522 } 523 524 /* 525 * Returns true if the bio list inside this rbio covers an entire stripe (no 526 * rmw required). 527 */ 528 static int rbio_is_full(struct btrfs_raid_bio *rbio) 529 { 530 unsigned long size = rbio->bio_list_bytes; 531 int ret = 1; 532 533 spin_lock(&rbio->bio_list_lock); 534 if (size != rbio->nr_data * BTRFS_STRIPE_LEN) 535 ret = 0; 536 BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); 537 spin_unlock(&rbio->bio_list_lock); 538 539 return ret; 540 } 541 542 /* 543 * returns 1 if it is safe to merge two rbios together. 544 * The merging is safe if the two rbios correspond to 545 * the same stripe and if they are both going in the same 546 * direction (read vs write), and if neither one is 547 * locked for final IO 548 * 549 * The caller is responsible for locking such that 550 * rmw_locked is safe to test 551 */ 552 static int rbio_can_merge(struct btrfs_raid_bio *last, 553 struct btrfs_raid_bio *cur) 554 { 555 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || 556 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) 557 return 0; 558 559 /* 560 * we can't merge with cached rbios, since the 561 * idea is that when we merge the destination 562 * rbio is going to run our IO for us. We can 563 * steal from cached rbios though, other functions 564 * handle that. 565 */ 566 if (test_bit(RBIO_CACHE_BIT, &last->flags) || 567 test_bit(RBIO_CACHE_BIT, &cur->flags)) 568 return 0; 569 570 if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical) 571 return 0; 572 573 /* we can't merge with different operations */ 574 if (last->operation != cur->operation) 575 return 0; 576 /* 577 * We've need read the full stripe from the drive. 578 * check and repair the parity and write the new results. 579 * 580 * We're not allowed to add any new bios to the 581 * bio list here, anyone else that wants to 582 * change this stripe needs to do their own rmw. 
583 */ 584 if (last->operation == BTRFS_RBIO_PARITY_SCRUB) 585 return 0; 586 587 if (last->operation == BTRFS_RBIO_READ_REBUILD) 588 return 0; 589 590 return 1; 591 } 592 593 static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, 594 unsigned int stripe_nr, 595 unsigned int sector_nr) 596 { 597 ASSERT(stripe_nr < rbio->real_stripes); 598 ASSERT(sector_nr < rbio->stripe_nsectors); 599 600 return stripe_nr * rbio->stripe_nsectors + sector_nr; 601 } 602 603 /* Return a sector from rbio->stripe_sectors, not from the bio list */ 604 static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, 605 unsigned int stripe_nr, 606 unsigned int sector_nr) 607 { 608 return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, 609 sector_nr)]; 610 } 611 612 /* Grab a sector inside P stripe */ 613 static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, 614 unsigned int sector_nr) 615 { 616 return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); 617 } 618 619 /* Grab a sector inside Q stripe, return NULL if not RAID6 */ 620 static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, 621 unsigned int sector_nr) 622 { 623 if (rbio->nr_data + 1 == rbio->real_stripes) 624 return NULL; 625 return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); 626 } 627 628 /* 629 * The first stripe in the table for a logical address 630 * has the lock. rbios are added in one of three ways: 631 * 632 * 1) Nobody has the stripe locked yet. The rbio is given 633 * the lock and 0 is returned. The caller must start the IO 634 * themselves. 635 * 636 * 2) Someone has the stripe locked, but we're able to merge 637 * with the lock owner. The rbio is freed and the IO will 638 * start automatically along with the existing rbio. 1 is returned. 639 * 640 * 3) Someone has the stripe locked, but we're not able to merge. 641 * The rbio is added to the lock owner's plug list, or merged into 642 * an rbio already on the plug list. When the lock owner unlocks, 643 * the next rbio on the list is run and the IO is started automatically. 644 * 1 is returned 645 * 646 * If we return 0, the caller still owns the rbio and must continue with 647 * IO submission. If we return 1, the caller must assume the rbio has 648 * already been freed. 649 */ 650 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) 651 { 652 struct btrfs_stripe_hash *h; 653 struct btrfs_raid_bio *cur; 654 struct btrfs_raid_bio *pending; 655 struct btrfs_raid_bio *freeit = NULL; 656 struct btrfs_raid_bio *cache_drop = NULL; 657 int ret = 0; 658 659 h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); 660 661 spin_lock(&h->lock); 662 list_for_each_entry(cur, &h->hash_list, hash_list) { 663 if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical) 664 continue; 665 666 spin_lock(&cur->bio_list_lock); 667 668 /* Can we steal this cached rbio's pages? */ 669 if (bio_list_empty(&cur->bio_list) && 670 list_empty(&cur->plug_list) && 671 test_bit(RBIO_CACHE_BIT, &cur->flags) && 672 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { 673 list_del_init(&cur->hash_list); 674 refcount_dec(&cur->refs); 675 676 steal_rbio(cur, rbio); 677 cache_drop = cur; 678 spin_unlock(&cur->bio_list_lock); 679 680 goto lockit; 681 } 682 683 /* Can we merge into the lock owner? 
*/ 684 if (rbio_can_merge(cur, rbio)) { 685 merge_rbio(cur, rbio); 686 spin_unlock(&cur->bio_list_lock); 687 freeit = rbio; 688 ret = 1; 689 goto out; 690 } 691 692 693 /* 694 * We couldn't merge with the running rbio, see if we can merge 695 * with the pending ones. We don't have to check for rmw_locked 696 * because there is no way they are inside finish_rmw right now 697 */ 698 list_for_each_entry(pending, &cur->plug_list, plug_list) { 699 if (rbio_can_merge(pending, rbio)) { 700 merge_rbio(pending, rbio); 701 spin_unlock(&cur->bio_list_lock); 702 freeit = rbio; 703 ret = 1; 704 goto out; 705 } 706 } 707 708 /* 709 * No merging, put us on the tail of the plug list, our rbio 710 * will be started with the currently running rbio unlocks 711 */ 712 list_add_tail(&rbio->plug_list, &cur->plug_list); 713 spin_unlock(&cur->bio_list_lock); 714 ret = 1; 715 goto out; 716 } 717 lockit: 718 refcount_inc(&rbio->refs); 719 list_add(&rbio->hash_list, &h->hash_list); 720 out: 721 spin_unlock(&h->lock); 722 if (cache_drop) 723 remove_rbio_from_cache(cache_drop); 724 if (freeit) 725 free_raid_bio(freeit); 726 return ret; 727 } 728 729 static void recover_rbio_work_locked(struct work_struct *work); 730 731 /* 732 * called as rmw or parity rebuild is completed. If the plug list has more 733 * rbios waiting for this stripe, the next one on the list will be started 734 */ 735 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) 736 { 737 int bucket; 738 struct btrfs_stripe_hash *h; 739 int keep_cache = 0; 740 741 bucket = rbio_bucket(rbio); 742 h = rbio->bioc->fs_info->stripe_hash_table->table + bucket; 743 744 if (list_empty(&rbio->plug_list)) 745 cache_rbio(rbio); 746 747 spin_lock(&h->lock); 748 spin_lock(&rbio->bio_list_lock); 749 750 if (!list_empty(&rbio->hash_list)) { 751 /* 752 * if we're still cached and there is no other IO 753 * to perform, just leave this rbio here for others 754 * to steal from later 755 */ 756 if (list_empty(&rbio->plug_list) && 757 test_bit(RBIO_CACHE_BIT, &rbio->flags)) { 758 keep_cache = 1; 759 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 760 BUG_ON(!bio_list_empty(&rbio->bio_list)); 761 goto done; 762 } 763 764 list_del_init(&rbio->hash_list); 765 refcount_dec(&rbio->refs); 766 767 /* 768 * we use the plug list to hold all the rbios 769 * waiting for the chance to lock this stripe. 770 * hand the lock over to one of them. 
771 */ 772 if (!list_empty(&rbio->plug_list)) { 773 struct btrfs_raid_bio *next; 774 struct list_head *head = rbio->plug_list.next; 775 776 next = list_entry(head, struct btrfs_raid_bio, 777 plug_list); 778 779 list_del_init(&rbio->plug_list); 780 781 list_add(&next->hash_list, &h->hash_list); 782 refcount_inc(&next->refs); 783 spin_unlock(&rbio->bio_list_lock); 784 spin_unlock(&h->lock); 785 786 if (next->operation == BTRFS_RBIO_READ_REBUILD) { 787 start_async_work(next, recover_rbio_work_locked); 788 } else if (next->operation == BTRFS_RBIO_WRITE) { 789 steal_rbio(rbio, next); 790 start_async_work(next, rmw_rbio_work_locked); 791 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { 792 steal_rbio(rbio, next); 793 start_async_work(next, scrub_rbio_work_locked); 794 } 795 796 goto done_nolock; 797 } 798 } 799 done: 800 spin_unlock(&rbio->bio_list_lock); 801 spin_unlock(&h->lock); 802 803 done_nolock: 804 if (!keep_cache) 805 remove_rbio_from_cache(rbio); 806 } 807 808 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) 809 { 810 struct bio *next; 811 812 while (cur) { 813 next = cur->bi_next; 814 cur->bi_next = NULL; 815 cur->bi_status = err; 816 bio_endio(cur); 817 cur = next; 818 } 819 } 820 821 /* 822 * this frees the rbio and runs through all the bios in the 823 * bio_list and calls end_io on them 824 */ 825 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) 826 { 827 struct bio *cur = bio_list_get(&rbio->bio_list); 828 struct bio *extra; 829 830 kfree(rbio->csum_buf); 831 bitmap_free(rbio->csum_bitmap); 832 rbio->csum_buf = NULL; 833 rbio->csum_bitmap = NULL; 834 835 /* 836 * Clear the data bitmap, as the rbio may be cached for later usage. 837 * do this before before unlock_stripe() so there will be no new bio 838 * for this bio. 839 */ 840 bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); 841 842 /* 843 * At this moment, rbio->bio_list is empty, however since rbio does not 844 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the 845 * hash list, rbio may be merged with others so that rbio->bio_list 846 * becomes non-empty. 847 * Once unlock_stripe() is done, rbio->bio_list will not be updated any 848 * more and we can call bio_endio() on all queued bios. 849 */ 850 unlock_stripe(rbio); 851 extra = bio_list_get(&rbio->bio_list); 852 free_raid_bio(rbio); 853 854 rbio_endio_bio_list(cur, err); 855 if (extra) 856 rbio_endio_bio_list(extra, err); 857 } 858 859 /* 860 * Get a sector pointer specified by its @stripe_nr and @sector_nr. 861 * 862 * @rbio: The raid bio 863 * @stripe_nr: Stripe number, valid range [0, real_stripe) 864 * @sector_nr: Sector number inside the stripe, 865 * valid range [0, stripe_nsectors) 866 * @bio_list_only: Whether to use sectors inside the bio list only. 867 * 868 * The read/modify/write code wants to reuse the original bio page as much 869 * as possible, and only use stripe_sectors as fallback. 
870 */ 871 static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, 872 int stripe_nr, int sector_nr, 873 bool bio_list_only) 874 { 875 struct sector_ptr *sector; 876 int index; 877 878 ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); 879 ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); 880 881 index = stripe_nr * rbio->stripe_nsectors + sector_nr; 882 ASSERT(index >= 0 && index < rbio->nr_sectors); 883 884 spin_lock(&rbio->bio_list_lock); 885 sector = &rbio->bio_sectors[index]; 886 if (sector->page || bio_list_only) { 887 /* Don't return sector without a valid page pointer */ 888 if (!sector->page) 889 sector = NULL; 890 spin_unlock(&rbio->bio_list_lock); 891 return sector; 892 } 893 spin_unlock(&rbio->bio_list_lock); 894 895 return &rbio->stripe_sectors[index]; 896 } 897 898 /* 899 * allocation and initial setup for the btrfs_raid_bio. Not 900 * this does not allocate any pages for rbio->pages. 901 */ 902 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, 903 struct btrfs_io_context *bioc) 904 { 905 const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes; 906 const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; 907 const unsigned int num_pages = stripe_npages * real_stripes; 908 const unsigned int stripe_nsectors = 909 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; 910 const unsigned int num_sectors = stripe_nsectors * real_stripes; 911 struct btrfs_raid_bio *rbio; 912 913 /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ 914 ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); 915 /* 916 * Our current stripe len should be fixed to 64k thus stripe_nsectors 917 * (at most 16) should be no larger than BITS_PER_LONG. 918 */ 919 ASSERT(stripe_nsectors <= BITS_PER_LONG); 920 921 rbio = kzalloc(sizeof(*rbio), GFP_NOFS); 922 if (!rbio) 923 return ERR_PTR(-ENOMEM); 924 rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), 925 GFP_NOFS); 926 rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), 927 GFP_NOFS); 928 rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), 929 GFP_NOFS); 930 rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); 931 rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); 932 933 if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || 934 !rbio->finish_pointers || !rbio->error_bitmap) { 935 free_raid_bio_pointers(rbio); 936 kfree(rbio); 937 return ERR_PTR(-ENOMEM); 938 } 939 940 bio_list_init(&rbio->bio_list); 941 init_waitqueue_head(&rbio->io_wait); 942 INIT_LIST_HEAD(&rbio->plug_list); 943 spin_lock_init(&rbio->bio_list_lock); 944 INIT_LIST_HEAD(&rbio->stripe_cache); 945 INIT_LIST_HEAD(&rbio->hash_list); 946 btrfs_get_bioc(bioc); 947 rbio->bioc = bioc; 948 rbio->nr_pages = num_pages; 949 rbio->nr_sectors = num_sectors; 950 rbio->real_stripes = real_stripes; 951 rbio->stripe_npages = stripe_npages; 952 rbio->stripe_nsectors = stripe_nsectors; 953 refcount_set(&rbio->refs, 1); 954 atomic_set(&rbio->stripes_pending, 0); 955 956 ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); 957 rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); 958 959 return rbio; 960 } 961 962 /* allocate pages for all the stripes in the bio, including parity */ 963 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 964 { 965 int ret; 966 967 ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); 968 if (ret < 0) 969 return ret; 970 /* Mapping all sectors */ 971 
index_stripe_sectors(rbio); 972 return 0; 973 } 974 975 /* only allocate pages for p/q stripes */ 976 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 977 { 978 const int data_pages = rbio->nr_data * rbio->stripe_npages; 979 int ret; 980 981 ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, 982 rbio->stripe_pages + data_pages); 983 if (ret < 0) 984 return ret; 985 986 index_stripe_sectors(rbio); 987 return 0; 988 } 989 990 /* 991 * Return the total number of errors found in the vertical stripe of @sector_nr. 992 * 993 * @faila and @failb will also be updated to the first and second stripe 994 * number of the errors. 995 */ 996 static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, 997 int *faila, int *failb) 998 { 999 int stripe_nr; 1000 int found_errors = 0; 1001 1002 if (faila || failb) { 1003 /* 1004 * Both @faila and @failb should be valid pointers if any of 1005 * them is specified. 1006 */ 1007 ASSERT(faila && failb); 1008 *faila = -1; 1009 *failb = -1; 1010 } 1011 1012 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { 1013 int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; 1014 1015 if (test_bit(total_sector_nr, rbio->error_bitmap)) { 1016 found_errors++; 1017 if (faila) { 1018 /* Update faila and failb. */ 1019 if (*faila < 0) 1020 *faila = stripe_nr; 1021 else if (*failb < 0) 1022 *failb = stripe_nr; 1023 } 1024 } 1025 } 1026 return found_errors; 1027 } 1028 1029 /* 1030 * Add a single sector @sector into our list of bios for IO. 1031 * 1032 * Return 0 if everything went well. 1033 * Return <0 for error. 1034 */ 1035 static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, 1036 struct bio_list *bio_list, 1037 struct sector_ptr *sector, 1038 unsigned int stripe_nr, 1039 unsigned int sector_nr, 1040 enum req_op op) 1041 { 1042 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1043 struct bio *last = bio_list->tail; 1044 int ret; 1045 struct bio *bio; 1046 struct btrfs_io_stripe *stripe; 1047 u64 disk_start; 1048 1049 /* 1050 * Note: here stripe_nr has taken device replace into consideration, 1051 * thus it can be larger than rbio->real_stripe. 1052 * So here we check against bioc->num_stripes, not rbio->real_stripes. 1053 */ 1054 ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); 1055 ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); 1056 ASSERT(sector->page); 1057 1058 stripe = &rbio->bioc->stripes[stripe_nr]; 1059 disk_start = stripe->physical + sector_nr * sectorsize; 1060 1061 /* if the device is missing, just fail this stripe */ 1062 if (!stripe->dev->bdev) { 1063 int found_errors; 1064 1065 set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, 1066 rbio->error_bitmap); 1067 1068 /* Check if we have reached tolerance early. 
*/ 1069 found_errors = get_rbio_veritical_errors(rbio, sector_nr, 1070 NULL, NULL); 1071 if (found_errors > rbio->bioc->max_errors) 1072 return -EIO; 1073 return 0; 1074 } 1075 1076 /* see if we can add this page onto our existing bio */ 1077 if (last) { 1078 u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT; 1079 last_end += last->bi_iter.bi_size; 1080 1081 /* 1082 * we can't merge these if they are from different 1083 * devices or if they are not contiguous 1084 */ 1085 if (last_end == disk_start && !last->bi_status && 1086 last->bi_bdev == stripe->dev->bdev) { 1087 ret = bio_add_page(last, sector->page, sectorsize, 1088 sector->pgoff); 1089 if (ret == sectorsize) 1090 return 0; 1091 } 1092 } 1093 1094 /* put a new bio on the list */ 1095 bio = bio_alloc(stripe->dev->bdev, 1096 max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), 1097 op, GFP_NOFS); 1098 bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; 1099 bio->bi_private = rbio; 1100 1101 __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); 1102 bio_list_add(bio_list, bio); 1103 return 0; 1104 } 1105 1106 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) 1107 { 1108 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1109 struct bio_vec bvec; 1110 struct bvec_iter iter; 1111 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 1112 rbio->bioc->full_stripe_logical; 1113 1114 bio_for_each_segment(bvec, bio, iter) { 1115 u32 bvec_offset; 1116 1117 for (bvec_offset = 0; bvec_offset < bvec.bv_len; 1118 bvec_offset += sectorsize, offset += sectorsize) { 1119 int index = offset / sectorsize; 1120 struct sector_ptr *sector = &rbio->bio_sectors[index]; 1121 1122 sector->page = bvec.bv_page; 1123 sector->pgoff = bvec.bv_offset + bvec_offset; 1124 ASSERT(sector->pgoff < PAGE_SIZE); 1125 } 1126 } 1127 } 1128 1129 /* 1130 * helper function to walk our bio list and populate the bio_pages array with 1131 * the result. This seems expensive, but it is faster than constantly 1132 * searching through the bio list as we setup the IO in finish_rmw or stripe 1133 * reconstruction. 1134 * 1135 * This must be called before you trust the answers from page_in_rbio 1136 */ 1137 static void index_rbio_pages(struct btrfs_raid_bio *rbio) 1138 { 1139 struct bio *bio; 1140 1141 spin_lock(&rbio->bio_list_lock); 1142 bio_list_for_each(bio, &rbio->bio_list) 1143 index_one_bio(rbio, bio); 1144 1145 spin_unlock(&rbio->bio_list_lock); 1146 } 1147 1148 static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, 1149 struct raid56_bio_trace_info *trace_info) 1150 { 1151 const struct btrfs_io_context *bioc = rbio->bioc; 1152 int i; 1153 1154 ASSERT(bioc); 1155 1156 /* We rely on bio->bi_bdev to find the stripe number. */ 1157 if (!bio->bi_bdev) 1158 goto not_found; 1159 1160 for (i = 0; i < bioc->num_stripes; i++) { 1161 if (bio->bi_bdev != bioc->stripes[i].dev->bdev) 1162 continue; 1163 trace_info->stripe_nr = i; 1164 trace_info->devid = bioc->stripes[i].dev->devid; 1165 trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 1166 bioc->stripes[i].physical; 1167 return; 1168 } 1169 1170 not_found: 1171 trace_info->devid = -1; 1172 trace_info->offset = -1; 1173 trace_info->stripe_nr = -1; 1174 } 1175 1176 static inline void bio_list_put(struct bio_list *bio_list) 1177 { 1178 struct bio *bio; 1179 1180 while ((bio = bio_list_pop(bio_list))) 1181 bio_put(bio); 1182 } 1183 1184 /* Generate PQ for one vertical stripe. 
*/ 1185 static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) 1186 { 1187 void **pointers = rbio->finish_pointers; 1188 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1189 struct sector_ptr *sector; 1190 int stripe; 1191 const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; 1192 1193 /* First collect one sector from each data stripe */ 1194 for (stripe = 0; stripe < rbio->nr_data; stripe++) { 1195 sector = sector_in_rbio(rbio, stripe, sectornr, 0); 1196 pointers[stripe] = kmap_local_page(sector->page) + 1197 sector->pgoff; 1198 } 1199 1200 /* Then add the parity stripe */ 1201 sector = rbio_pstripe_sector(rbio, sectornr); 1202 sector->uptodate = 1; 1203 pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; 1204 1205 if (has_qstripe) { 1206 /* 1207 * RAID6, add the qstripe and call the library function 1208 * to fill in our p/q 1209 */ 1210 sector = rbio_qstripe_sector(rbio, sectornr); 1211 sector->uptodate = 1; 1212 pointers[stripe++] = kmap_local_page(sector->page) + 1213 sector->pgoff; 1214 1215 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, 1216 pointers); 1217 } else { 1218 /* raid5 */ 1219 memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); 1220 run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); 1221 } 1222 for (stripe = stripe - 1; stripe >= 0; stripe--) 1223 kunmap_local(pointers[stripe]); 1224 } 1225 1226 static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, 1227 struct bio_list *bio_list) 1228 { 1229 /* The total sector number inside the full stripe. */ 1230 int total_sector_nr; 1231 int sectornr; 1232 int stripe; 1233 int ret; 1234 1235 ASSERT(bio_list_size(bio_list) == 0); 1236 1237 /* We should have at least one data sector. */ 1238 ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); 1239 1240 /* 1241 * Reset errors, as we may have errors inherited from from degraded 1242 * write. 1243 */ 1244 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); 1245 1246 /* 1247 * Start assembly. Make bios for everything from the higher layers (the 1248 * bio_list in our rbio) and our P/Q. Ignore everything else. 1249 */ 1250 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 1251 total_sector_nr++) { 1252 struct sector_ptr *sector; 1253 1254 stripe = total_sector_nr / rbio->stripe_nsectors; 1255 sectornr = total_sector_nr % rbio->stripe_nsectors; 1256 1257 /* This vertical stripe has no data, skip it. */ 1258 if (!test_bit(sectornr, &rbio->dbitmap)) 1259 continue; 1260 1261 if (stripe < rbio->nr_data) { 1262 sector = sector_in_rbio(rbio, stripe, sectornr, 1); 1263 if (!sector) 1264 continue; 1265 } else { 1266 sector = rbio_stripe_sector(rbio, stripe, sectornr); 1267 } 1268 1269 ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, 1270 sectornr, REQ_OP_WRITE); 1271 if (ret) 1272 goto error; 1273 } 1274 1275 if (likely(!rbio->bioc->replace_nr_stripes)) 1276 return 0; 1277 1278 /* 1279 * Make a copy for the replace target device. 1280 * 1281 * Thus the source stripe number (in replace_stripe_src) should be valid. 1282 */ 1283 ASSERT(rbio->bioc->replace_stripe_src >= 0); 1284 1285 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 1286 total_sector_nr++) { 1287 struct sector_ptr *sector; 1288 1289 stripe = total_sector_nr / rbio->stripe_nsectors; 1290 sectornr = total_sector_nr % rbio->stripe_nsectors; 1291 1292 /* 1293 * For RAID56, there is only one device that can be replaced, 1294 * and replace_stripe_src[0] indicates the stripe number we 1295 * need to copy from. 
1296 */ 1297 if (stripe != rbio->bioc->replace_stripe_src) { 1298 /* 1299 * We can skip the whole stripe completely, note 1300 * total_sector_nr will be increased by one anyway. 1301 */ 1302 ASSERT(sectornr == 0); 1303 total_sector_nr += rbio->stripe_nsectors - 1; 1304 continue; 1305 } 1306 1307 /* This vertical stripe has no data, skip it. */ 1308 if (!test_bit(sectornr, &rbio->dbitmap)) 1309 continue; 1310 1311 if (stripe < rbio->nr_data) { 1312 sector = sector_in_rbio(rbio, stripe, sectornr, 1); 1313 if (!sector) 1314 continue; 1315 } else { 1316 sector = rbio_stripe_sector(rbio, stripe, sectornr); 1317 } 1318 1319 ret = rbio_add_io_sector(rbio, bio_list, sector, 1320 rbio->real_stripes, 1321 sectornr, REQ_OP_WRITE); 1322 if (ret) 1323 goto error; 1324 } 1325 1326 return 0; 1327 error: 1328 bio_list_put(bio_list); 1329 return -EIO; 1330 } 1331 1332 static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) 1333 { 1334 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1335 u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - 1336 rbio->bioc->full_stripe_logical; 1337 int total_nr_sector = offset >> fs_info->sectorsize_bits; 1338 1339 ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); 1340 1341 bitmap_set(rbio->error_bitmap, total_nr_sector, 1342 bio->bi_iter.bi_size >> fs_info->sectorsize_bits); 1343 1344 /* 1345 * Special handling for raid56_alloc_missing_rbio() used by 1346 * scrub/replace. Unlike call path in raid56_parity_recover(), they 1347 * pass an empty bio here. Thus we have to find out the missing device 1348 * and mark the stripe error instead. 1349 */ 1350 if (bio->bi_iter.bi_size == 0) { 1351 bool found_missing = false; 1352 int stripe_nr; 1353 1354 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { 1355 if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { 1356 found_missing = true; 1357 bitmap_set(rbio->error_bitmap, 1358 stripe_nr * rbio->stripe_nsectors, 1359 rbio->stripe_nsectors); 1360 } 1361 } 1362 ASSERT(found_missing); 1363 } 1364 } 1365 1366 /* 1367 * For subpage case, we can no longer set page Up-to-date directly for 1368 * stripe_pages[], thus we need to locate the sector. 1369 */ 1370 static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, 1371 struct page *page, 1372 unsigned int pgoff) 1373 { 1374 int i; 1375 1376 for (i = 0; i < rbio->nr_sectors; i++) { 1377 struct sector_ptr *sector = &rbio->stripe_sectors[i]; 1378 1379 if (sector->page == page && sector->pgoff == pgoff) 1380 return sector; 1381 } 1382 return NULL; 1383 } 1384 1385 /* 1386 * this sets each page in the bio uptodate. 
It should only be used on private 1387 * rbio pages, nothing that comes in from the higher layers 1388 */ 1389 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) 1390 { 1391 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 1392 struct bio_vec *bvec; 1393 struct bvec_iter_all iter_all; 1394 1395 ASSERT(!bio_flagged(bio, BIO_CLONED)); 1396 1397 bio_for_each_segment_all(bvec, bio, iter_all) { 1398 struct sector_ptr *sector; 1399 int pgoff; 1400 1401 for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; 1402 pgoff += sectorsize) { 1403 sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); 1404 ASSERT(sector); 1405 if (sector) 1406 sector->uptodate = 1; 1407 } 1408 } 1409 } 1410 1411 static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) 1412 { 1413 struct bio_vec *bv = bio_first_bvec_all(bio); 1414 int i; 1415 1416 for (i = 0; i < rbio->nr_sectors; i++) { 1417 struct sector_ptr *sector; 1418 1419 sector = &rbio->stripe_sectors[i]; 1420 if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) 1421 break; 1422 sector = &rbio->bio_sectors[i]; 1423 if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) 1424 break; 1425 } 1426 ASSERT(i < rbio->nr_sectors); 1427 return i; 1428 } 1429 1430 static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) 1431 { 1432 int total_sector_nr = get_bio_sector_nr(rbio, bio); 1433 u32 bio_size = 0; 1434 struct bio_vec *bvec; 1435 int i; 1436 1437 bio_for_each_bvec_all(bvec, bio, i) 1438 bio_size += bvec->bv_len; 1439 1440 /* 1441 * Since we can have multiple bios touching the error_bitmap, we cannot 1442 * call bitmap_set() without protection. 1443 * 1444 * Instead use set_bit() for each bit, as set_bit() itself is atomic. 1445 */ 1446 for (i = total_sector_nr; i < total_sector_nr + 1447 (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) 1448 set_bit(i, rbio->error_bitmap); 1449 } 1450 1451 /* Verify the data sectors at read time. */ 1452 static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, 1453 struct bio *bio) 1454 { 1455 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1456 int total_sector_nr = get_bio_sector_nr(rbio, bio); 1457 struct bio_vec *bvec; 1458 struct bvec_iter_all iter_all; 1459 1460 /* No data csum for the whole stripe, no need to verify. */ 1461 if (!rbio->csum_bitmap || !rbio->csum_buf) 1462 return; 1463 1464 /* P/Q stripes, they have no data csum to verify against. */ 1465 if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) 1466 return; 1467 1468 bio_for_each_segment_all(bvec, bio, iter_all) { 1469 int bv_offset; 1470 1471 for (bv_offset = bvec->bv_offset; 1472 bv_offset < bvec->bv_offset + bvec->bv_len; 1473 bv_offset += fs_info->sectorsize, total_sector_nr++) { 1474 u8 csum_buf[BTRFS_CSUM_SIZE]; 1475 u8 *expected_csum = rbio->csum_buf + 1476 total_sector_nr * fs_info->csum_size; 1477 int ret; 1478 1479 /* No csum for this sector, skip to the next sector. 
*/ 1480 if (!test_bit(total_sector_nr, rbio->csum_bitmap)) 1481 continue; 1482 1483 ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, 1484 bv_offset, csum_buf, expected_csum); 1485 if (ret < 0) 1486 set_bit(total_sector_nr, rbio->error_bitmap); 1487 } 1488 } 1489 } 1490 1491 static void raid_wait_read_end_io(struct bio *bio) 1492 { 1493 struct btrfs_raid_bio *rbio = bio->bi_private; 1494 1495 if (bio->bi_status) { 1496 rbio_update_error_bitmap(rbio, bio); 1497 } else { 1498 set_bio_pages_uptodate(rbio, bio); 1499 verify_bio_data_sectors(rbio, bio); 1500 } 1501 1502 bio_put(bio); 1503 if (atomic_dec_and_test(&rbio->stripes_pending)) 1504 wake_up(&rbio->io_wait); 1505 } 1506 1507 static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, 1508 struct bio_list *bio_list) 1509 { 1510 struct bio *bio; 1511 1512 atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); 1513 while ((bio = bio_list_pop(bio_list))) { 1514 bio->bi_end_io = raid_wait_read_end_io; 1515 1516 if (trace_raid56_read_enabled()) { 1517 struct raid56_bio_trace_info trace_info = { 0 }; 1518 1519 bio_get_trace_info(rbio, bio, &trace_info); 1520 trace_raid56_read(rbio, bio, &trace_info); 1521 } 1522 submit_bio(bio); 1523 } 1524 1525 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); 1526 } 1527 1528 static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) 1529 { 1530 const int data_pages = rbio->nr_data * rbio->stripe_npages; 1531 int ret; 1532 1533 ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages); 1534 if (ret < 0) 1535 return ret; 1536 1537 index_stripe_sectors(rbio); 1538 return 0; 1539 } 1540 1541 /* 1542 * We use plugging call backs to collect full stripes. 1543 * Any time we get a partial stripe write while plugged 1544 * we collect it into a list. When the unplug comes down, 1545 * we sort the list by logical block number and merge 1546 * everything we can into the same rbios 1547 */ 1548 struct btrfs_plug_cb { 1549 struct blk_plug_cb cb; 1550 struct btrfs_fs_info *info; 1551 struct list_head rbio_list; 1552 struct work_struct work; 1553 }; 1554 1555 /* 1556 * rbios on the plug list are sorted for easier merging. 1557 */ 1558 static int plug_cmp(void *priv, const struct list_head *a, 1559 const struct list_head *b) 1560 { 1561 const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 1562 plug_list); 1563 const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 1564 plug_list); 1565 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 1566 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 1567 1568 if (a_sector < b_sector) 1569 return -1; 1570 if (a_sector > b_sector) 1571 return 1; 1572 return 0; 1573 } 1574 1575 static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 1576 { 1577 struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); 1578 struct btrfs_raid_bio *cur; 1579 struct btrfs_raid_bio *last = NULL; 1580 1581 list_sort(NULL, &plug->rbio_list, plug_cmp); 1582 1583 while (!list_empty(&plug->rbio_list)) { 1584 cur = list_entry(plug->rbio_list.next, 1585 struct btrfs_raid_bio, plug_list); 1586 list_del_init(&cur->plug_list); 1587 1588 if (rbio_is_full(cur)) { 1589 /* We have a full stripe, queue it down. 
*/ 1590 start_async_work(cur, rmw_rbio_work); 1591 continue; 1592 } 1593 if (last) { 1594 if (rbio_can_merge(last, cur)) { 1595 merge_rbio(last, cur); 1596 free_raid_bio(cur); 1597 continue; 1598 } 1599 start_async_work(last, rmw_rbio_work); 1600 } 1601 last = cur; 1602 } 1603 if (last) 1604 start_async_work(last, rmw_rbio_work); 1605 kfree(plug); 1606 } 1607 1608 /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ 1609 static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) 1610 { 1611 const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1612 const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; 1613 const u64 full_stripe_start = rbio->bioc->full_stripe_logical; 1614 const u32 orig_len = orig_bio->bi_iter.bi_size; 1615 const u32 sectorsize = fs_info->sectorsize; 1616 u64 cur_logical; 1617 1618 ASSERT(orig_logical >= full_stripe_start && 1619 orig_logical + orig_len <= full_stripe_start + 1620 rbio->nr_data * BTRFS_STRIPE_LEN); 1621 1622 bio_list_add(&rbio->bio_list, orig_bio); 1623 rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; 1624 1625 /* Update the dbitmap. */ 1626 for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; 1627 cur_logical += sectorsize) { 1628 int bit = ((u32)(cur_logical - full_stripe_start) >> 1629 fs_info->sectorsize_bits) % rbio->stripe_nsectors; 1630 1631 set_bit(bit, &rbio->dbitmap); 1632 } 1633 } 1634 1635 /* 1636 * our main entry point for writes from the rest of the FS. 1637 */ 1638 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) 1639 { 1640 struct btrfs_fs_info *fs_info = bioc->fs_info; 1641 struct btrfs_raid_bio *rbio; 1642 struct btrfs_plug_cb *plug = NULL; 1643 struct blk_plug_cb *cb; 1644 1645 rbio = alloc_rbio(fs_info, bioc); 1646 if (IS_ERR(rbio)) { 1647 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); 1648 bio_endio(bio); 1649 return; 1650 } 1651 rbio->operation = BTRFS_RBIO_WRITE; 1652 rbio_add_bio(rbio, bio); 1653 1654 /* 1655 * Don't plug on full rbios, just get them out the door 1656 * as quickly as we can 1657 */ 1658 if (!rbio_is_full(rbio)) { 1659 cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); 1660 if (cb) { 1661 plug = container_of(cb, struct btrfs_plug_cb, cb); 1662 if (!plug->info) { 1663 plug->info = fs_info; 1664 INIT_LIST_HEAD(&plug->rbio_list); 1665 } 1666 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1667 return; 1668 } 1669 } 1670 1671 /* 1672 * Either we don't have any existing plug, or we're doing a full stripe, 1673 * queue the rmw work now. 1674 */ 1675 start_async_work(rbio, rmw_rbio_work); 1676 } 1677 1678 static int verify_one_sector(struct btrfs_raid_bio *rbio, 1679 int stripe_nr, int sector_nr) 1680 { 1681 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1682 struct sector_ptr *sector; 1683 u8 csum_buf[BTRFS_CSUM_SIZE]; 1684 u8 *csum_expected; 1685 int ret; 1686 1687 if (!rbio->csum_bitmap || !rbio->csum_buf) 1688 return 0; 1689 1690 /* No way to verify P/Q as they are not covered by data csum. */ 1691 if (stripe_nr >= rbio->nr_data) 1692 return 0; 1693 /* 1694 * If we're rebuilding a read, we have to use pages from the 1695 * bio list if possible. 
1696 */ 1697 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 1698 sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); 1699 } else { 1700 sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); 1701 } 1702 1703 ASSERT(sector->page); 1704 1705 csum_expected = rbio->csum_buf + 1706 (stripe_nr * rbio->stripe_nsectors + sector_nr) * 1707 fs_info->csum_size; 1708 ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, 1709 csum_buf, csum_expected); 1710 return ret; 1711 } 1712 1713 /* 1714 * Recover a vertical stripe specified by @sector_nr. 1715 * @*pointers are the pre-allocated pointers by the caller, so we don't 1716 * need to allocate/free the pointers again and again. 1717 */ 1718 static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, 1719 void **pointers, void **unmap_array) 1720 { 1721 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 1722 struct sector_ptr *sector; 1723 const u32 sectorsize = fs_info->sectorsize; 1724 int found_errors; 1725 int faila; 1726 int failb; 1727 int stripe_nr; 1728 int ret = 0; 1729 1730 /* 1731 * Now we just use bitmap to mark the horizontal stripes in 1732 * which we have data when doing parity scrub. 1733 */ 1734 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && 1735 !test_bit(sector_nr, &rbio->dbitmap)) 1736 return 0; 1737 1738 found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, 1739 &failb); 1740 /* 1741 * No errors in the vertical stripe, skip it. Can happen for recovery 1742 * which only part of a stripe failed csum check. 1743 */ 1744 if (!found_errors) 1745 return 0; 1746 1747 if (found_errors > rbio->bioc->max_errors) 1748 return -EIO; 1749 1750 /* 1751 * Setup our array of pointers with sectors from each stripe 1752 * 1753 * NOTE: store a duplicate array of pointers to preserve the 1754 * pointer order. 1755 */ 1756 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { 1757 /* 1758 * If we're rebuilding a read, we have to use pages from the 1759 * bio list if possible. 1760 */ 1761 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 1762 sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); 1763 } else { 1764 sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); 1765 } 1766 ASSERT(sector->page); 1767 pointers[stripe_nr] = kmap_local_page(sector->page) + 1768 sector->pgoff; 1769 unmap_array[stripe_nr] = pointers[stripe_nr]; 1770 } 1771 1772 /* All raid6 handling here */ 1773 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { 1774 /* Single failure, rebuild from parity raid5 style */ 1775 if (failb < 0) { 1776 if (faila == rbio->nr_data) 1777 /* 1778 * Just the P stripe has failed, without 1779 * a bad data or Q stripe. 1780 * We have nothing to do, just skip the 1781 * recovery for this stripe. 1782 */ 1783 goto cleanup; 1784 /* 1785 * a single failure in raid6 is rebuilt 1786 * in the pstripe code below 1787 */ 1788 goto pstripe; 1789 } 1790 1791 /* 1792 * If the q stripe is failed, do a pstripe reconstruction from 1793 * the xors. 1794 * If both the q stripe and the P stripe are failed, we're 1795 * here due to a crc mismatch and we can't give them the 1796 * data they want. 1797 */ 1798 if (failb == rbio->real_stripes - 1) { 1799 if (faila == rbio->real_stripes - 2) 1800 /* 1801 * Only P and Q are corrupted. 1802 * We only care about data stripes recovery, 1803 * can skip this vertical stripe. 1804 */ 1805 goto cleanup; 1806 /* 1807 * Otherwise we have one bad data stripe and 1808 * a good P stripe. raid5! 
1809 */ 1810 goto pstripe; 1811 } 1812 1813 if (failb == rbio->real_stripes - 2) { 1814 raid6_datap_recov(rbio->real_stripes, sectorsize, 1815 faila, pointers); 1816 } else { 1817 raid6_2data_recov(rbio->real_stripes, sectorsize, 1818 faila, failb, pointers); 1819 } 1820 } else { 1821 void *p; 1822 1823 /* Rebuild from P stripe here (raid5 or raid6). */ 1824 ASSERT(failb == -1); 1825 pstripe: 1826 /* Copy parity block into failed block to start with */ 1827 memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); 1828 1829 /* Rearrange the pointer array */ 1830 p = pointers[faila]; 1831 for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; 1832 stripe_nr++) 1833 pointers[stripe_nr] = pointers[stripe_nr + 1]; 1834 pointers[rbio->nr_data - 1] = p; 1835 1836 /* Xor in the rest */ 1837 run_xor(pointers, rbio->nr_data - 1, sectorsize); 1838 1839 } 1840 1841 /* 1842 * No matter if this is a RMW or recovery, we should have all 1843 * failed sectors repaired in the vertical stripe, thus they are now 1844 * uptodate. 1845 * Especially if we determine to cache the rbio, we need to 1846 * have at least all data sectors uptodate. 1847 * 1848 * If possible, also check if the repaired sector matches its data 1849 * checksum. 1850 */ 1851 if (faila >= 0) { 1852 ret = verify_one_sector(rbio, faila, sector_nr); 1853 if (ret < 0) 1854 goto cleanup; 1855 1856 sector = rbio_stripe_sector(rbio, faila, sector_nr); 1857 sector->uptodate = 1; 1858 } 1859 if (failb >= 0) { 1860 ret = verify_one_sector(rbio, failb, sector_nr); 1861 if (ret < 0) 1862 goto cleanup; 1863 1864 sector = rbio_stripe_sector(rbio, failb, sector_nr); 1865 sector->uptodate = 1; 1866 } 1867 1868 cleanup: 1869 for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) 1870 kunmap_local(unmap_array[stripe_nr]); 1871 return ret; 1872 } 1873 1874 static int recover_sectors(struct btrfs_raid_bio *rbio) 1875 { 1876 void **pointers = NULL; 1877 void **unmap_array = NULL; 1878 int sectornr; 1879 int ret = 0; 1880 1881 /* 1882 * @pointers array stores the pointer for each sector. 1883 * 1884 * @unmap_array stores copy of pointers that does not get reordered 1885 * during reconstruction so that kunmap_local works. 1886 */ 1887 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1888 unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 1889 if (!pointers || !unmap_array) { 1890 ret = -ENOMEM; 1891 goto out; 1892 } 1893 1894 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 1895 spin_lock(&rbio->bio_list_lock); 1896 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); 1897 spin_unlock(&rbio->bio_list_lock); 1898 } 1899 1900 index_rbio_pages(rbio); 1901 1902 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { 1903 ret = recover_vertical(rbio, sectornr, pointers, unmap_array); 1904 if (ret < 0) 1905 break; 1906 } 1907 1908 out: 1909 kfree(pointers); 1910 kfree(unmap_array); 1911 return ret; 1912 } 1913 1914 static void recover_rbio(struct btrfs_raid_bio *rbio) 1915 { 1916 struct bio_list bio_list = BIO_EMPTY_LIST; 1917 int total_sector_nr; 1918 int ret = 0; 1919 1920 /* 1921 * Either we're doing recover for a read failure or degraded write, 1922 * caller should have set error bitmap correctly. 1923 */ 1924 ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); 1925 1926 /* For recovery, we need to read all sectors including P/Q. */ 1927 ret = alloc_rbio_pages(rbio); 1928 if (ret < 0) 1929 goto out; 1930 1931 index_rbio_pages(rbio); 1932 1933 /* 1934 * Read everything that hasn't failed. 
	 * However, this time we do not trust any cached sector: a cached page
	 * may hold stale data that the higher layer is never going to read,
	 * so always re-read everything in the recovery path.
	 */
	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		int stripe = total_sector_nr / rbio->stripe_nsectors;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		struct sector_ptr *sector;

		/*
		 * Skip any range which has an error. It can be a range which
		 * is marked as error (e.g. a csum mismatch), or it can be a
		 * missing device.
		 */
		if (!rbio->bioc->stripes[stripe].dev->bdev ||
		    test_bit(total_sector_nr, rbio->error_bitmap)) {
			/*
			 * Also set the error bit for a missing device, which
			 * may not yet have its error bit set.
			 */
			set_bit(total_sector_nr, rbio->error_bitmap);
			continue;
		}

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
					 sectornr, REQ_OP_READ);
		if (ret < 0) {
			bio_list_put(&bio_list);
			goto out;
		}
	}

	submit_read_wait_bio_list(rbio, &bio_list);
	ret = recover_sectors(rbio);
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}

static void recover_rbio_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	if (!lock_stripe_add(rbio))
		recover_rbio(rbio);
}

static void recover_rbio_work_locked(struct work_struct *work)
{
	recover_rbio(container_of(work, struct btrfs_raid_bio, work));
}

static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
{
	bool found = false;
	int sector_nr;

	/*
	 * This is for the extra RAID6 recovery tries, thus the mirror number
	 * must be larger than 2.
	 * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
	 * RAID5 methods.
	 */
	ASSERT(mirror_num > 2);
	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int found_errors;
		int faila;
		int failb;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		/* This vertical stripe doesn't have errors. */
		if (!found_errors)
			continue;

		/*
		 * If we found errors, there should be only one error marked
		 * by the previous set_rbio_range_error().
		 */
		ASSERT(found_errors == 1);
		found = true;

		/* Now select another stripe to mark as error. */
		failb = rbio->real_stripes - (mirror_num - 1);
		if (failb <= faila)
			failb--;

		/* Set the extra bit in the error bitmap. */
		if (failb >= 0)
			set_bit(failb * rbio->stripe_nsectors + sector_nr,
				rbio->error_bitmap);
	}

	/* We should have found at least one vertical stripe with an error. */
	ASSERT(found);
}

/*
 * The main entry point for reads from the higher layers. This is really only
 * called when the normal read path had a failure, so we assume the bio they
 * send down corresponds to a failed part of the drive.
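 *
 * (Mirror 1 reads the data stripes directly and is handled by the normal
 *  read path; mirror numbers of 2 and above pick which stripes to treat as
 *  failed, see set_rbio_raid6_extra_error() above and the retry comment in
 *  the function body below.)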
2041 */ 2042 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, 2043 int mirror_num) 2044 { 2045 struct btrfs_fs_info *fs_info = bioc->fs_info; 2046 struct btrfs_raid_bio *rbio; 2047 2048 rbio = alloc_rbio(fs_info, bioc); 2049 if (IS_ERR(rbio)) { 2050 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); 2051 bio_endio(bio); 2052 return; 2053 } 2054 2055 rbio->operation = BTRFS_RBIO_READ_REBUILD; 2056 rbio_add_bio(rbio, bio); 2057 2058 set_rbio_range_error(rbio, bio); 2059 2060 /* 2061 * Loop retry: 2062 * for 'mirror == 2', reconstruct from all other stripes. 2063 * for 'mirror_num > 2', select a stripe to fail on every retry. 2064 */ 2065 if (mirror_num > 2) 2066 set_rbio_raid6_extra_error(rbio, mirror_num); 2067 2068 start_async_work(rbio, recover_rbio_work); 2069 } 2070 2071 static void fill_data_csums(struct btrfs_raid_bio *rbio) 2072 { 2073 struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; 2074 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, 2075 rbio->bioc->full_stripe_logical); 2076 const u64 start = rbio->bioc->full_stripe_logical; 2077 const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << 2078 fs_info->sectorsize_bits; 2079 int ret; 2080 2081 /* The rbio should not have its csum buffer initialized. */ 2082 ASSERT(!rbio->csum_buf && !rbio->csum_bitmap); 2083 2084 /* 2085 * Skip the csum search if: 2086 * 2087 * - The rbio doesn't belong to data block groups 2088 * Then we are doing IO for tree blocks, no need to search csums. 2089 * 2090 * - The rbio belongs to mixed block groups 2091 * This is to avoid deadlock, as we're already holding the full 2092 * stripe lock, if we trigger a metadata read, and it needs to do 2093 * raid56 recovery, we will deadlock. 2094 */ 2095 if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) || 2096 rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) 2097 return; 2098 2099 rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors * 2100 fs_info->csum_size, GFP_NOFS); 2101 rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors, 2102 GFP_NOFS); 2103 if (!rbio->csum_buf || !rbio->csum_bitmap) { 2104 ret = -ENOMEM; 2105 goto error; 2106 } 2107 2108 ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1, 2109 rbio->csum_buf, rbio->csum_bitmap); 2110 if (ret < 0) 2111 goto error; 2112 if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) 2113 goto no_csum; 2114 return; 2115 2116 error: 2117 /* 2118 * We failed to allocate memory or grab the csum, but it's not fatal, 2119 * we can still continue. But better to warn users that RMW is no 2120 * longer safe for this particular sub-stripe write. 2121 */ 2122 btrfs_warn_rl(fs_info, 2123 "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", 2124 rbio->bioc->full_stripe_logical, ret); 2125 no_csum: 2126 kfree(rbio->csum_buf); 2127 bitmap_free(rbio->csum_bitmap); 2128 rbio->csum_buf = NULL; 2129 rbio->csum_bitmap = NULL; 2130 } 2131 2132 static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) 2133 { 2134 struct bio_list bio_list = BIO_EMPTY_LIST; 2135 int total_sector_nr; 2136 int ret = 0; 2137 2138 /* 2139 * Fill the data csums we need for data verification. We need to fill 2140 * the csum_bitmap/csum_buf first, as our endio function will try to 2141 * verify the data sectors. 2142 */ 2143 fill_data_csums(rbio); 2144 2145 /* 2146 * Build a list of bios to read all sectors (including data and P/Q). 2147 * 2148 * This behavior is to compensate the later csum verification and recovery. 
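	 * For example, if one data sector later fails its csum check, the
	 * P/Q sectors read here let recover_sectors() rebuild it without
	 * another round of I/O.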
2149 */ 2150 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2151 total_sector_nr++) { 2152 struct sector_ptr *sector; 2153 int stripe = total_sector_nr / rbio->stripe_nsectors; 2154 int sectornr = total_sector_nr % rbio->stripe_nsectors; 2155 2156 sector = rbio_stripe_sector(rbio, stripe, sectornr); 2157 ret = rbio_add_io_sector(rbio, &bio_list, sector, 2158 stripe, sectornr, REQ_OP_READ); 2159 if (ret) { 2160 bio_list_put(&bio_list); 2161 return ret; 2162 } 2163 } 2164 2165 /* 2166 * We may or may not have any corrupted sectors (including missing dev 2167 * and csum mismatch), just let recover_sectors() to handle them all. 2168 */ 2169 submit_read_wait_bio_list(rbio, &bio_list); 2170 return recover_sectors(rbio); 2171 } 2172 2173 static void raid_wait_write_end_io(struct bio *bio) 2174 { 2175 struct btrfs_raid_bio *rbio = bio->bi_private; 2176 blk_status_t err = bio->bi_status; 2177 2178 if (err) 2179 rbio_update_error_bitmap(rbio, bio); 2180 bio_put(bio); 2181 if (atomic_dec_and_test(&rbio->stripes_pending)) 2182 wake_up(&rbio->io_wait); 2183 } 2184 2185 static void submit_write_bios(struct btrfs_raid_bio *rbio, 2186 struct bio_list *bio_list) 2187 { 2188 struct bio *bio; 2189 2190 atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); 2191 while ((bio = bio_list_pop(bio_list))) { 2192 bio->bi_end_io = raid_wait_write_end_io; 2193 2194 if (trace_raid56_write_enabled()) { 2195 struct raid56_bio_trace_info trace_info = { 0 }; 2196 2197 bio_get_trace_info(rbio, bio, &trace_info); 2198 trace_raid56_write(rbio, bio, &trace_info); 2199 } 2200 submit_bio(bio); 2201 } 2202 } 2203 2204 /* 2205 * To determine if we need to read any sector from the disk. 2206 * Should only be utilized in RMW path, to skip cached rbio. 2207 */ 2208 static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) 2209 { 2210 int i; 2211 2212 for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { 2213 struct sector_ptr *sector = &rbio->stripe_sectors[i]; 2214 2215 /* 2216 * We have a sector which doesn't have page nor uptodate, 2217 * thus this rbio can not be cached one, as cached one must 2218 * have all its data sectors present and uptodate. 2219 */ 2220 if (!sector->page || !sector->uptodate) 2221 return true; 2222 } 2223 return false; 2224 } 2225 2226 static void rmw_rbio(struct btrfs_raid_bio *rbio) 2227 { 2228 struct bio_list bio_list; 2229 int sectornr; 2230 int ret = 0; 2231 2232 /* 2233 * Allocate the pages for parity first, as P/Q pages will always be 2234 * needed for both full-stripe and sub-stripe writes. 2235 */ 2236 ret = alloc_rbio_parity_pages(rbio); 2237 if (ret < 0) 2238 goto out; 2239 2240 /* 2241 * Either full stripe write, or we have every data sector already 2242 * cached, can go to write path immediately. 2243 */ 2244 if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { 2245 /* 2246 * Now we're doing sub-stripe write, also need all data stripes 2247 * to do the full RMW. 2248 */ 2249 ret = alloc_rbio_data_pages(rbio); 2250 if (ret < 0) 2251 goto out; 2252 2253 index_rbio_pages(rbio); 2254 2255 ret = rmw_read_wait_recover(rbio); 2256 if (ret < 0) 2257 goto out; 2258 } 2259 2260 /* 2261 * At this stage we're not allowed to add any new bios to the 2262 * bio list any more, anyone else that wants to change this stripe 2263 * needs to do their own rmw. 
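	 * (RBIO_RMW_LOCKED_BIT is set below while holding bio_list_lock to
	 *  mark that point, and the error bitmap is then cleared so that the
	 *  write phase starts with a clean slate.)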
	 */
	spin_lock(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock(&rbio->bio_list_lock);

	bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);

	index_rbio_pages(rbio);

	/*
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon. If they do use it again,
	 * hopefully they will send another full bio.
	 */
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
		generate_pq_vertical(rbio, sectornr);

	bio_list_init(&bio_list);
	ret = rmw_assemble_write_bios(rbio, &bio_list);
	if (ret < 0)
		goto out;

	/* We should have at least one bio assembled. */
	ASSERT(bio_list_size(&bio_list));
	submit_write_bios(rbio, &bio_list);
	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);

	/* We may have more errors than our tolerance during the writes. */
	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			break;
		}
	}
out:
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}

static void rmw_rbio_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	if (lock_stripe_add(rbio) == 0)
		rmw_rbio(rbio);
}

static void rmw_rbio_work_locked(struct work_struct *work)
{
	rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
}

/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Caller must have already increased bio_counter for getting @bioc.
 *
 * Note: we need to make sure that all the pages added into the scrub/replace
 * raid bio are correct and will not be changed during the scrub/replace, that
 * is, those pages only hold metadata or file data with checksums.
 */

struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_fs_info *fs_info = bioc->fs_info;
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(fs_info, bioc);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion handler
	 * and make the scrub rbio similar to the other types.
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	/*
	 * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
	 * to the end position, so this search can start from the first parity
	 * stripe.
	 */
	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
		if (bioc->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}
	ASSERT(i < rbio->real_stripes);

	bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
	return rbio;
}

/*
 * We only scrub the parity for which we have correct data on the same
 * horizontal stripe, so we do not need to allocate pages for all the stripes.
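 * (Pages are allocated, across all stripes, only for the sector columns whose
 *  bits are set in @dbitmap; other columns are skipped.)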
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
	int total_sector_nr;

	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
	     total_sector_nr++) {
		struct page *page;
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;

		if (!test_bit(sectornr, &rbio->dbitmap))
			continue;
		if (rbio->stripe_pages[index])
			continue;
		page = alloc_page(GFP_NOFS);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[index] = page;
	}
	index_stripe_sectors(rbio);
	return 0;
}

static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
{
	struct btrfs_io_context *bioc = rbio->bioc;
	const u32 sectorsize = bioc->fs_info->sectorsize;
	void **pointers = rbio->finish_pointers;
	unsigned long *pbitmap = &rbio->finish_pbitmap;
	int nr_data = rbio->nr_data;
	int stripe;
	int sectornr;
	bool has_qstripe;
	struct sector_ptr p_sector = { 0 };
	struct sector_ptr q_sector = { 0 };
	struct bio_list bio_list;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	/*
	 * If a replace is running and our P/Q stripe is the one being
	 * replaced, we need to duplicate the final write to the replace
	 * target.
	 */
	if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
		is_replace = 1;
		bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
	}

	/*
	 * The higher layers (the scrubber) are unlikely to use this area of
	 * the disk again soon, so don't cache it.
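	 * (The page allocated below is only scratch space used to compute the
	 *  expected P (and Q for RAID6); the result is compared against the
	 *  on-disk parity and only mismatching sectors stay queued for
	 *  writeback.)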
2438 */ 2439 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2440 2441 p_sector.page = alloc_page(GFP_NOFS); 2442 if (!p_sector.page) 2443 return -ENOMEM; 2444 p_sector.pgoff = 0; 2445 p_sector.uptodate = 1; 2446 2447 if (has_qstripe) { 2448 /* RAID6, allocate and map temp space for the Q stripe */ 2449 q_sector.page = alloc_page(GFP_NOFS); 2450 if (!q_sector.page) { 2451 __free_page(p_sector.page); 2452 p_sector.page = NULL; 2453 return -ENOMEM; 2454 } 2455 q_sector.pgoff = 0; 2456 q_sector.uptodate = 1; 2457 pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); 2458 } 2459 2460 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); 2461 2462 /* Map the parity stripe just once */ 2463 pointers[nr_data] = kmap_local_page(p_sector.page); 2464 2465 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { 2466 struct sector_ptr *sector; 2467 void *parity; 2468 2469 /* first collect one page from each data stripe */ 2470 for (stripe = 0; stripe < nr_data; stripe++) { 2471 sector = sector_in_rbio(rbio, stripe, sectornr, 0); 2472 pointers[stripe] = kmap_local_page(sector->page) + 2473 sector->pgoff; 2474 } 2475 2476 if (has_qstripe) { 2477 /* RAID6, call the library function to fill in our P/Q */ 2478 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, 2479 pointers); 2480 } else { 2481 /* raid5 */ 2482 memcpy(pointers[nr_data], pointers[0], sectorsize); 2483 run_xor(pointers + 1, nr_data - 1, sectorsize); 2484 } 2485 2486 /* Check scrubbing parity and repair it */ 2487 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); 2488 parity = kmap_local_page(sector->page) + sector->pgoff; 2489 if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) 2490 memcpy(parity, pointers[rbio->scrubp], sectorsize); 2491 else 2492 /* Parity is right, needn't writeback */ 2493 bitmap_clear(&rbio->dbitmap, sectornr, 1); 2494 kunmap_local(parity); 2495 2496 for (stripe = nr_data - 1; stripe >= 0; stripe--) 2497 kunmap_local(pointers[stripe]); 2498 } 2499 2500 kunmap_local(pointers[nr_data]); 2501 __free_page(p_sector.page); 2502 p_sector.page = NULL; 2503 if (q_sector.page) { 2504 kunmap_local(pointers[rbio->real_stripes - 1]); 2505 __free_page(q_sector.page); 2506 q_sector.page = NULL; 2507 } 2508 2509 /* 2510 * time to start writing. Make bios for everything from the 2511 * higher layers (the bio_list in our rbio) and our p/q. Ignore 2512 * everything else. 2513 */ 2514 for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { 2515 struct sector_ptr *sector; 2516 2517 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); 2518 ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, 2519 sectornr, REQ_OP_WRITE); 2520 if (ret) 2521 goto cleanup; 2522 } 2523 2524 if (!is_replace) 2525 goto submit_write; 2526 2527 /* 2528 * Replace is running and our parity stripe needs to be duplicated to 2529 * the target device. Check we have a valid source stripe number. 
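	 * (The duplicate write below is queued against stripe index
	 *  rbio->real_stripes, i.e. the replace target appended after the
	 *  real stripes in the io context.)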
	 */
	ASSERT(rbio->bioc->replace_stripe_src >= 0);
	for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
		struct sector_ptr *sector;

		sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
		ret = rbio_add_io_sector(rbio, &bio_list, sector,
					 rbio->real_stripes,
					 sectornr, REQ_OP_WRITE);
		if (ret)
			goto cleanup;
	}

submit_write:
	submit_write_bios(rbio, &bio_list);
	return 0;

cleanup:
	bio_list_put(&bio_list);
	return ret;
}

static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}

static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	void **pointers = NULL;
	void **unmap_array = NULL;
	int sector_nr;
	int ret = 0;

	/*
	 * @pointers array stores the pointer for each sector.
	 *
	 * @unmap_array stores copy of pointers that does not get reordered
	 * during reconstruction so that kunmap_local works.
	 */
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers || !unmap_array) {
		ret = -ENOMEM;
		goto out;
	}

	for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int dfail = 0, failp = -1;
		int faila;
		int failb;
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
							 &faila, &failb);
		if (found_errors > rbio->bioc->max_errors) {
			ret = -EIO;
			goto out;
		}
		if (found_errors == 0)
			continue;

		/* We should have at least one error here. */
		ASSERT(faila >= 0 || failb >= 0);

		if (is_data_stripe(rbio, faila))
			dfail++;
		else if (is_parity_stripe(faila))
			failp = faila;

		if (is_data_stripe(rbio, failb))
			dfail++;
		else if (is_parity_stripe(failb))
			failp = failb;
		/*
		 * We cannot use the parity that is being scrubbed to repair
		 * data, so our repair capability is reduced by one.  (In the
		 * RAID5 case we cannot repair anything at all.)
		 */
		if (dfail > rbio->bioc->max_errors - 1) {
			ret = -EIO;
			goto out;
		}
		/*
		 * If all data is good, then only the parity is corrupted.
		 * Just repair the parity, there is no need to recover data
		 * stripes.
		 */
		if (dfail == 0)
			continue;

		/*
		 * At this point we have one corrupted data stripe and one
		 * corrupted parity on RAID6.  If the corrupted parity happens
		 * to be the one being scrubbed, we can luckily use the other
		 * (trusted) parity to repair the data; otherwise the data
		 * stripe cannot be repaired.
		 */
		if (failp != rbio->scrubp) {
			ret = -EIO;
			goto out;
		}

		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
		if (ret < 0)
			goto out;
	}
out:
	kfree(pointers);
	kfree(unmap_array);
	return ret;
}

static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list = BIO_EMPTY_LIST;
	int total_sector_nr;
	int ret = 0;

	/*
	 * Build a list of bios to read all the missing parts.
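	 * Sectors already supplied by the bio list, or already uptodate in
	 * the stripe cache, are skipped below; only the rest are read from
	 * disk.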
*/ 2650 for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; 2651 total_sector_nr++) { 2652 int sectornr = total_sector_nr % rbio->stripe_nsectors; 2653 int stripe = total_sector_nr / rbio->stripe_nsectors; 2654 struct sector_ptr *sector; 2655 2656 /* No data in the vertical stripe, no need to read. */ 2657 if (!test_bit(sectornr, &rbio->dbitmap)) 2658 continue; 2659 2660 /* 2661 * We want to find all the sectors missing from the rbio and 2662 * read them from the disk. If sector_in_rbio() finds a sector 2663 * in the bio list we don't need to read it off the stripe. 2664 */ 2665 sector = sector_in_rbio(rbio, stripe, sectornr, 1); 2666 if (sector) 2667 continue; 2668 2669 sector = rbio_stripe_sector(rbio, stripe, sectornr); 2670 /* 2671 * The bio cache may have handed us an uptodate sector. If so, 2672 * use it. 2673 */ 2674 if (sector->uptodate) 2675 continue; 2676 2677 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, 2678 sectornr, REQ_OP_READ); 2679 if (ret) { 2680 bio_list_put(&bio_list); 2681 return ret; 2682 } 2683 } 2684 2685 submit_read_wait_bio_list(rbio, &bio_list); 2686 return 0; 2687 } 2688 2689 static void scrub_rbio(struct btrfs_raid_bio *rbio) 2690 { 2691 int sector_nr; 2692 int ret; 2693 2694 ret = alloc_rbio_essential_pages(rbio); 2695 if (ret) 2696 goto out; 2697 2698 bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); 2699 2700 ret = scrub_assemble_read_bios(rbio); 2701 if (ret < 0) 2702 goto out; 2703 2704 /* We may have some failures, recover the failed sectors first. */ 2705 ret = recover_scrub_rbio(rbio); 2706 if (ret < 0) 2707 goto out; 2708 2709 /* 2710 * We have every sector properly prepared. Can finish the scrub 2711 * and writeback the good content. 2712 */ 2713 ret = finish_parity_scrub(rbio); 2714 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); 2715 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { 2716 int found_errors; 2717 2718 found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); 2719 if (found_errors > rbio->bioc->max_errors) { 2720 ret = -EIO; 2721 break; 2722 } 2723 } 2724 out: 2725 rbio_orig_end_io(rbio, errno_to_blk_status(ret)); 2726 } 2727 2728 static void scrub_rbio_work_locked(struct work_struct *work) 2729 { 2730 scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); 2731 } 2732 2733 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) 2734 { 2735 if (!lock_stripe_add(rbio)) 2736 start_async_work(rbio, scrub_rbio_work_locked); 2737 } 2738 2739 /* 2740 * This is for scrub call sites where we already have correct data contents. 2741 * This allows us to avoid reading data stripes again. 2742 * 2743 * Unfortunately here we have to do page copy, other than reusing the pages. 2744 * This is due to the fact rbio has its own page management for its cache. 2745 */ 2746 void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, 2747 struct page **data_pages, u64 data_logical) 2748 { 2749 const u64 offset_in_full_stripe = data_logical - 2750 rbio->bioc->full_stripe_logical; 2751 const int page_index = offset_in_full_stripe >> PAGE_SHIFT; 2752 const u32 sectorsize = rbio->bioc->fs_info->sectorsize; 2753 const u32 sectors_per_page = PAGE_SIZE / sectorsize; 2754 int ret; 2755 2756 /* 2757 * If we hit ENOMEM temporarily, but later at 2758 * raid56_parity_submit_scrub_rbio() time it succeeded, we just do 2759 * the extra read, not a big deal. 
	 *
	 * If we also hit ENOMEM later, at raid56_parity_submit_scrub_rbio()
	 * time, the bio will get a proper error number set.
	 */
	ret = alloc_rbio_data_pages(rbio);
	if (ret < 0)
		return;

	/* data_logical must be at stripe boundary and inside the full stripe. */
	ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
	ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));

	for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
		struct page *dst = rbio->stripe_pages[page_nr + page_index];
		struct page *src = data_pages[page_nr];

		memcpy_page(dst, 0, src, 0, PAGE_SIZE);
		/* Mark the sectors covered by the page we just copied as uptodate. */
		for (int sector_nr = sectors_per_page * (page_index + page_nr);
		     sector_nr < sectors_per_page * (page_index + page_nr + 1);
		     sector_nr++)
			rbio->stripe_sectors[sector_nr].uptodate = true;
	}
}