// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "discard.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_block;
struct scrub_ctx;

/*
 * The following two values only influence the performance.
 *
 * The first one configures an upper limit for the number of (dynamically
 * allocated) pages that are added to a bio. The second one configures the
 * number of parallel and outstanding I/O operations.
 */
#define SCRUB_SECTORS_PER_BIO	32	/* 128KiB per bio for 4KiB pages */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MiB per device in flight for 4KiB pages */

/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 */
#define SCRUB_MAX_SECTORS_PER_BLOCK	(BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)

struct scrub_recover {
	refcount_t refs;
	struct btrfs_io_context *bioc;
	u64 map_length;
};

struct scrub_sector {
	struct scrub_block *sblock;
	struct page *page;
	struct btrfs_device *dev;
	struct list_head list;
	u64 flags;  /* extent flags */
	u64 generation;
	u64 logical;
	u64 physical;
	u64 physical_for_dev_replace;
	atomic_t refs;
	u8 mirror_num;
	unsigned int have_csum:1;
	unsigned int io_error:1;
	u8 csum[BTRFS_CSUM_SIZE];

	struct scrub_recover *recover;
};

struct scrub_bio {
	int index;
	struct scrub_ctx *sctx;
	struct btrfs_device *dev;
	struct bio *bio;
	blk_status_t status;
	u64 logical;
	u64 physical;
	struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
	int sector_count;
	int next_free;
	struct work_struct work;
};

struct scrub_block {
	struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
	int sector_count;
	atomic_t outstanding_sectors;
	refcount_t refs; /* free mem on transition to zero */
	struct scrub_ctx *sctx;
	struct scrub_parity *sparity;
	struct {
		unsigned int header_error:1;
		unsigned int checksum_error:1;
		unsigned int no_io_error_seen:1;
		unsigned int generation_error:1; /* also sets header_error */

		/* The following is for the data used to check parity */
		/* It is for the data with checksum */
		unsigned int data_corrected:1;
	};
	struct work_struct work;
};

/* Used for the chunks with parity stripe such as RAID5/6 */
struct scrub_parity {
	struct scrub_ctx *sctx;

	struct btrfs_device *scrub_dev;

	u64 logic_start;

	u64 logic_end;

	int nsectors;

	u32 stripe_len;

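	/* Tracks outstanding references to this parity context */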
	refcount_t refs;

	struct list_head sectors_list;

	/* Work of parity check and repair */
	struct work_struct work;

	/* Mark the parity blocks which have data */
	unsigned long dbitmap;

	/*
	 * Mark the parity blocks which have data, but where errors happened
	 * when reading or checking the data
	 */
	unsigned long ebitmap;
};

struct scrub_ctx {
	struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
	struct btrfs_fs_info *fs_info;
	int first_free;
	int curr;
	atomic_t bios_in_flight;
	atomic_t workers_pending;
	spinlock_t list_lock;
	wait_queue_head_t list_wait;
	struct list_head csum_list;
	atomic_t cancel_req;
	int readonly;
	int sectors_per_bio;

	/* State of IO submission throttling affecting the associated device */
	ktime_t throttle_deadline;
	u64 throttle_sent;

	int is_dev_replace;
	u64 write_pointer;

	struct scrub_bio *wr_curr_bio;
	struct mutex wr_lock;
	struct btrfs_device *wr_tgtdev;
	bool flush_all_writes;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t refs;
};

struct scrub_warning {
	struct btrfs_path *path;
	u64 extent_item_size;
	const char *errstr;
	u64 physical;
	u64 logical;
	struct btrfs_device *dev;
};

struct full_stripe_lock {
	struct rb_node node;
	u64 logical;
	u64 refs;
	struct mutex mutex;
};

static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				     struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock,
				int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good);
static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
					      struct scrub_block *sblock_good,
					      int sector_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
					     int sector_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_sector_get(struct scrub_sector *sector);
static void scrub_sector_put(struct scrub_sector *sector);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
			 u64 physical, struct btrfs_device *dev, u64 flags,
			 u64 gen, int mirror_num, u8 *csum,
			 u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct work_struct *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
				 u64 extent_logical, u32 extent_len,
				 u64 *extent_physical,
				 struct btrfs_device **extent_dev,
				 int *extent_mirror_num);
static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
				      struct scrub_sector *sector);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct work_struct *work);
static void scrub_put_ctx(struct scrub_ctx *sctx);

static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
{
	return sector->recover &&
	       (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}

static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
	refcount_inc(&sctx->refs);
	atomic_inc(&sctx->bios_in_flight);
}

static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
	atomic_dec(&sctx->bios_in_flight);
	wake_up(&sctx->list_wait);
	scrub_put_ctx(sctx);
}

static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}

/*
 * Insert new full stripe lock into full stripe locks tree
 *
 * Return pointer to existing or newly inserted full_stripe_lock structure if
 * everything works well.
 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
 *
 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
 * function
 */
static struct full_stripe_lock *insert_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct full_stripe_lock *entry;
	struct full_stripe_lock *ret;

	lockdep_assert_held(&locks_root->lock);

	p = &locks_root->root.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical) {
			p = &(*p)->rb_left;
		} else if (fstripe_logical > entry->logical) {
			p = &(*p)->rb_right;
		} else {
			entry->refs++;
			return entry;
		}
	}

	/*
	 * Insert new lock.
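	 * No existing entry matched in the search above; the caller still
	 * holds locks_root->lock, so linking the new node into the rbtree
	 * below is safe.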
	 */
	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
	if (!ret)
		return ERR_PTR(-ENOMEM);
	ret->logical = fstripe_logical;
	ret->refs = 1;
	mutex_init(&ret->mutex);

	rb_link_node(&ret->node, parent, p);
	rb_insert_color(&ret->node, &locks_root->root);
	return ret;
}

/*
 * Search for a full stripe lock of a block group
 *
 * Return pointer to existing full stripe lock if found
 * Return NULL if not found
 */
static struct full_stripe_lock *search_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node *node;
	struct full_stripe_lock *entry;

	lockdep_assert_held(&locks_root->lock);

	node = locks_root->root.rb_node;
	while (node) {
		entry = rb_entry(node, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical)
			node = node->rb_left;
		else if (fstripe_logical > entry->logical)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * Helper to get full stripe logical from a normal bytenr.
 *
 * Caller must ensure @cache is a RAID56 block group.
 */
static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
{
	u64 ret;

	/*
	 * Due to chunk item size limit, full stripe length should not be
	 * larger than U32_MAX. Just a sanity check here.
	 */
	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);

	/*
	 * round_down() can only handle power of 2, while RAID56 full
	 * stripe length can be 64KiB * n, so we need to manually round down.
	 */
	ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
			cache->full_stripe_len + cache->start;
	return ret;
}

/*
 * Lock a full stripe to avoid concurrency of recovery and read
 *
 * It's only used for profiles with parities (RAID5/6), for other profiles it
 * does nothing.
 *
 * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
 * The caller must then call unlock_full_stripe() in the same context.
 *
 * Return <0 on error.
 */
static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			    bool *locked_ret)
{
	struct btrfs_block_group *bg_cache;
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *existing;
	u64 fstripe_start;
	int ret = 0;

	*locked_ret = false;
	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}

	/* Profiles not based on parity don't need full stripe lock */
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;
	locks_root = &bg_cache->full_stripe_locks_root;

	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	/* Now insert the full stripe lock */
	mutex_lock(&locks_root->lock);
	existing = insert_full_stripe_lock(locks_root, fstripe_start);
	mutex_unlock(&locks_root->lock);
	if (IS_ERR(existing)) {
		ret = PTR_ERR(existing);
		goto out;
	}
	mutex_lock(&existing->mutex);
	*locked_ret = true;
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}

/*
 * Unlock a full stripe.
 *
 * NOTE: Caller must ensure it's the same context calling the corresponding
 * lock_full_stripe().
 *
 * Return 0 if we unlock the full stripe without problem.
 * Return <0 on error.
 */
static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			      bool locked)
{
	struct btrfs_block_group *bg_cache;
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *fstripe_lock;
	u64 fstripe_start;
	bool freeit = false;
	int ret = 0;

	/* If we didn't acquire full stripe lock, no need to continue */
	if (!locked)
		return 0;

	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;

	locks_root = &bg_cache->full_stripe_locks_root;
	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	mutex_lock(&locks_root->lock);
	fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
	/* Unpaired unlock_full_stripe() detected */
	if (!fstripe_lock) {
		WARN_ON(1);
		ret = -ENOENT;
		mutex_unlock(&locks_root->lock);
		goto out;
	}

	if (fstripe_lock->refs == 0) {
		WARN_ON(1);
		btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
			   fstripe_lock->logical);
	} else {
		fstripe_lock->refs--;
	}

	if (fstripe_lock->refs == 0) {
		rb_erase(&fstripe_lock->node, &locks_root->root);
		freeit = true;
	}
	mutex_unlock(&locks_root->lock);

	mutex_unlock(&fstripe_lock->mutex);
	if (freeit)
		kfree(fstripe_lock);
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}

static void scrub_free_csums(struct scrub_ctx *sctx)
{
	while (!list_empty(&sctx->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sctx->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	/* this can happen when scrub is cancelled */
	if (sctx->curr != -1) {
		struct scrub_bio *sbio = sctx->bios[sctx->curr];

		for (i = 0; i < sbio->sector_count; i++) {
			WARN_ON(!sbio->sectors[i]->page);
			scrub_block_put(sbio->sectors[i]->sblock);
		}
		bio_put(sbio->bio);
	}

	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio = sctx->bios[i];

		if (!sbio)
			break;
		kfree(sbio);
	}

	kfree(sctx->wr_curr_bio);
	scrub_free_csums(sctx);
	kfree(sctx);
}

static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (refcount_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}

static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
		struct btrfs_fs_info *fs_info, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int i;

	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
	if (!sctx)
		goto nomem;
	refcount_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
	sctx->curr = -1;
	sctx->fs_info = fs_info;
	INIT_LIST_HEAD(&sctx->csum_list);
	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
		if (!sbio)
			goto nomem;
		sctx->bios[i] = sbio;

		sbio->index = i;
		sbio->sctx = sctx;
		sbio->sector_count = 0;
		INIT_WORK(&sbio->work, scrub_bio_end_io_worker);

		if (i != SCRUB_BIOS_PER_SCTX - 1)
			sctx->bios[i]->next_free = i + 1;
		else
			sctx->bios[i]->next_free = -1;
	}
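	/*
	 * The bios allocated above form a free list chained through
	 * next_free; first_free is the index of the head and -1 terminates
	 * the list.
	 */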
	sctx->first_free = 0;
	atomic_set(&sctx->bios_in_flight, 0);
	atomic_set(&sctx->workers_pending, 0);
	atomic_set(&sctx->cancel_req, 0);

	spin_lock_init(&sctx->list_lock);
	spin_lock_init(&sctx->stat_lock);
	init_waitqueue_head(&sctx->list_wait);
	sctx->throttle_deadline = 0;

	WARN_ON(sctx->wr_curr_bio != NULL);
	mutex_init(&sctx->wr_lock);
	sctx->wr_curr_bio = NULL;
	if (is_dev_replace) {
		WARN_ON(!fs_info->dev_replace.tgtdev);
		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
		sctx->flush_all_writes = false;
	}

	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}

static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
				     void *warn_ctx)
{
	u32 nlink;
	int ret;
	int i;
	unsigned nofs_flag;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key key;

	local_root = btrfs_get_fs_root(fs_info, root, true);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * this makes the path point to (inum INODE_ITEM ioff)
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_put_root(local_root);
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				    struct btrfs_inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	/*
	 * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
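	 * (The memalloc_nofs_save() below makes any GFP_KERNEL allocation
	 * done inside init_ipath() behave as if GFP_NOFS had been used.)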
	 */
	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, swarn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		btrfs_put_root(local_root);
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
				  swarn->errstr, swarn->logical,
				  rcu_str_deref(swarn->dev->name),
				  swarn->physical,
				  root, inum, offset,
				  fs_info->sectorsize, nlink,
				  (char *)(unsigned long)ipath->fspath->val[i]);

	btrfs_put_root(local_root);
	free_ipath(ipath);
	return 0;

err:
	btrfs_warn_in_rcu(fs_info,
			  "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
			  swarn->errstr, swarn->logical,
			  rcu_str_deref(swarn->dev->name),
			  swarn->physical,
			  root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}

static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	unsigned long ptr = 0;
	u64 extent_item_pos;
	u64 flags = 0;
	u64 ref_root;
	u32 item_size;
	u8 ref_level = 0;
	int ret;

	WARN_ON(sblock->sector_count < 1);
	dev = sblock->sectors[0]->dev;
	fs_info = sblock->sctx->fs_info;

	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.physical = sblock->sectors[0]->physical;
	swarn.logical = sblock->sectors[0]->logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		do {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
					  errstr, swarn.logical,
					  rcu_str_deref(dev->name),
					  swarn.physical,
					  ref_level ? "node" : "leaf",
					  ret < 0 ? -1 : ref_level,
					  ret < 0 ? -1 : ref_root);
		} while (ret != 1);
		btrfs_release_path(path);
	} else {
		btrfs_release_path(path);
		swarn.path = path;
		swarn.dev = dev;
		iterate_extent_inodes(fs_info, found_key.objectid,
				      extent_item_pos, 1,
				      scrub_print_warning_inode, &swarn, false);
	}

out:
	btrfs_free_path(path);
}

static inline void scrub_get_recover(struct scrub_recover *recover)
{
	refcount_inc(&recover->refs);
}

static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
				     struct scrub_recover *recover)
{
	if (refcount_dec_and_test(&recover->refs)) {
		btrfs_bio_counter_dec(fs_info);
		btrfs_put_bioc(recover->bioc);
		kfree(recover);
	}
}

/*
 * scrub_handle_errored_block gets called when either verification of the
 * sectors failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all sectors in the bio, even though only one
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the mirrors.
 */
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
	struct scrub_ctx *sctx = sblock_to_check->sctx;
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	u64 logical;
	unsigned int failed_mirror_index;
	unsigned int is_metadata;
	unsigned int have_csum;
	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
	struct scrub_block *sblock_bad;
	int ret;
	int mirror_index;
	int sector_num;
	int success;
	bool full_stripe_locked;
	unsigned int nofs_flag;
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	BUG_ON(sblock_to_check->sector_count < 1);
	fs_info = sctx->fs_info;
	if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sctx->stat_lock);
		++sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);
		return 0;
	}
	logical = sblock_to_check->sectors[0]->logical;
	BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
	failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
	is_metadata = !(sblock_to_check->sectors[0]->flags &
			BTRFS_EXTENT_FLAG_DATA);
	have_csum = sblock_to_check->sectors[0]->have_csum;
	dev = sblock_to_check->sectors[0]->dev;

	if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
		return 0;

	/*
	 * We must use GFP_NOFS because the scrub task might be waiting for a
	 * worker task executing this function and in turn a transaction commit
	 * might be waiting for the scrub task to pause (which needs to wait for
	 * all the worker tasks to complete before pausing).
	 * We do allocations in the workers through insert_full_stripe_lock()
	 * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
	 * this function.
	 */
	nofs_flag = memalloc_nofs_save();
	/*
	 * For RAID5/6, races can happen between scrub threads of different
	 * devices. On data corruption, the parity and data threads will both
	 * try to recover the data.
	 * Such a race can lead to doubly added csum errors, or even
	 * unrecoverable errors.
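	 * The per full stripe lock taken below serializes those threads on
	 * the same full stripe.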
864 */ 865 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); 866 if (ret < 0) { 867 memalloc_nofs_restore(nofs_flag); 868 spin_lock(&sctx->stat_lock); 869 if (ret == -ENOMEM) 870 sctx->stat.malloc_errors++; 871 sctx->stat.read_errors++; 872 sctx->stat.uncorrectable_errors++; 873 spin_unlock(&sctx->stat_lock); 874 return ret; 875 } 876 877 /* 878 * read all mirrors one after the other. This includes to 879 * re-read the extent or metadata block that failed (that was 880 * the cause that this fixup code is called) another time, 881 * sector by sector this time in order to know which sectors 882 * caused I/O errors and which ones are good (for all mirrors). 883 * It is the goal to handle the situation when more than one 884 * mirror contains I/O errors, but the errors do not 885 * overlap, i.e. the data can be repaired by selecting the 886 * sectors from those mirrors without I/O error on the 887 * particular sectors. One example (with blocks >= 2 * sectorsize) 888 * would be that mirror #1 has an I/O error on the first sector, 889 * the second sector is good, and mirror #2 has an I/O error on 890 * the second sector, but the first sector is good. 891 * Then the first sector of the first mirror can be repaired by 892 * taking the first sector of the second mirror, and the 893 * second sector of the second mirror can be repaired by 894 * copying the contents of the 2nd sector of the 1st mirror. 895 * One more note: if the sectors of one mirror contain I/O 896 * errors, the checksum cannot be verified. In order to get 897 * the best data for repairing, the first attempt is to find 898 * a mirror without I/O errors and with a validated checksum. 899 * Only if this is not possible, the sectors are picked from 900 * mirrors with I/O errors without considering the checksum. 901 * If the latter is the case, at the end, the checksum of the 902 * repaired area is verified in order to correctly maintain 903 * the statistics. 
904 */ 905 906 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, 907 sizeof(*sblocks_for_recheck), GFP_KERNEL); 908 if (!sblocks_for_recheck) { 909 spin_lock(&sctx->stat_lock); 910 sctx->stat.malloc_errors++; 911 sctx->stat.read_errors++; 912 sctx->stat.uncorrectable_errors++; 913 spin_unlock(&sctx->stat_lock); 914 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 915 goto out; 916 } 917 918 /* Setup the context, map the logical blocks and alloc the sectors */ 919 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); 920 if (ret) { 921 spin_lock(&sctx->stat_lock); 922 sctx->stat.read_errors++; 923 sctx->stat.uncorrectable_errors++; 924 spin_unlock(&sctx->stat_lock); 925 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 926 goto out; 927 } 928 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); 929 sblock_bad = sblocks_for_recheck + failed_mirror_index; 930 931 /* build and submit the bios for the failed mirror, check checksums */ 932 scrub_recheck_block(fs_info, sblock_bad, 1); 933 934 if (!sblock_bad->header_error && !sblock_bad->checksum_error && 935 sblock_bad->no_io_error_seen) { 936 /* 937 * The error disappeared after reading sector by sector, or 938 * the area was part of a huge bio and other parts of the 939 * bio caused I/O errors, or the block layer merged several 940 * read requests into one and the error is caused by a 941 * different bio (usually one of the two latter cases is 942 * the cause) 943 */ 944 spin_lock(&sctx->stat_lock); 945 sctx->stat.unverified_errors++; 946 sblock_to_check->data_corrected = 1; 947 spin_unlock(&sctx->stat_lock); 948 949 if (sctx->is_dev_replace) 950 scrub_write_block_to_dev_replace(sblock_bad); 951 goto out; 952 } 953 954 if (!sblock_bad->no_io_error_seen) { 955 spin_lock(&sctx->stat_lock); 956 sctx->stat.read_errors++; 957 spin_unlock(&sctx->stat_lock); 958 if (__ratelimit(&rs)) 959 scrub_print_warning("i/o error", sblock_to_check); 960 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 961 } else if (sblock_bad->checksum_error) { 962 spin_lock(&sctx->stat_lock); 963 sctx->stat.csum_errors++; 964 spin_unlock(&sctx->stat_lock); 965 if (__ratelimit(&rs)) 966 scrub_print_warning("checksum error", sblock_to_check); 967 btrfs_dev_stat_inc_and_print(dev, 968 BTRFS_DEV_STAT_CORRUPTION_ERRS); 969 } else if (sblock_bad->header_error) { 970 spin_lock(&sctx->stat_lock); 971 sctx->stat.verify_errors++; 972 spin_unlock(&sctx->stat_lock); 973 if (__ratelimit(&rs)) 974 scrub_print_warning("checksum/header error", 975 sblock_to_check); 976 if (sblock_bad->generation_error) 977 btrfs_dev_stat_inc_and_print(dev, 978 BTRFS_DEV_STAT_GENERATION_ERRS); 979 else 980 btrfs_dev_stat_inc_and_print(dev, 981 BTRFS_DEV_STAT_CORRUPTION_ERRS); 982 } 983 984 if (sctx->readonly) { 985 ASSERT(!sctx->is_dev_replace); 986 goto out; 987 } 988 989 /* 990 * now build and submit the bios for the other mirrors, check 991 * checksums. 992 * First try to pick the mirror which is completely without I/O 993 * errors and also does not have a checksum error. 994 * If one is found, and if a checksum is present, the full block 995 * that is known to contain an error is rewritten. Afterwards 996 * the block is known to be corrected. 
997 * If a mirror is found which is completely correct, and no 998 * checksum is present, only those sectors are rewritten that had 999 * an I/O error in the block to be repaired, since it cannot be 1000 * determined, which copy of the other sectors is better (and it 1001 * could happen otherwise that a correct sector would be 1002 * overwritten by a bad one). 1003 */ 1004 for (mirror_index = 0; ;mirror_index++) { 1005 struct scrub_block *sblock_other; 1006 1007 if (mirror_index == failed_mirror_index) 1008 continue; 1009 1010 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */ 1011 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) { 1012 if (mirror_index >= BTRFS_MAX_MIRRORS) 1013 break; 1014 if (!sblocks_for_recheck[mirror_index].sector_count) 1015 break; 1016 1017 sblock_other = sblocks_for_recheck + mirror_index; 1018 } else { 1019 struct scrub_recover *r = sblock_bad->sectors[0]->recover; 1020 int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; 1021 1022 if (mirror_index >= max_allowed) 1023 break; 1024 if (!sblocks_for_recheck[1].sector_count) 1025 break; 1026 1027 ASSERT(failed_mirror_index == 0); 1028 sblock_other = sblocks_for_recheck + 1; 1029 sblock_other->sectors[0]->mirror_num = 1 + mirror_index; 1030 } 1031 1032 /* build and submit the bios, check checksums */ 1033 scrub_recheck_block(fs_info, sblock_other, 0); 1034 1035 if (!sblock_other->header_error && 1036 !sblock_other->checksum_error && 1037 sblock_other->no_io_error_seen) { 1038 if (sctx->is_dev_replace) { 1039 scrub_write_block_to_dev_replace(sblock_other); 1040 goto corrected_error; 1041 } else { 1042 ret = scrub_repair_block_from_good_copy( 1043 sblock_bad, sblock_other); 1044 if (!ret) 1045 goto corrected_error; 1046 } 1047 } 1048 } 1049 1050 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace) 1051 goto did_not_correct_error; 1052 1053 /* 1054 * In case of I/O errors in the area that is supposed to be 1055 * repaired, continue by picking good copies of those sectors. 1056 * Select the good sectors from mirrors to rewrite bad sectors from 1057 * the area to fix. Afterwards verify the checksum of the block 1058 * that is supposed to be repaired. This verification step is 1059 * only done for the purpose of statistic counting and for the 1060 * final scrub report, whether errors remain. 1061 * A perfect algorithm could make use of the checksum and try 1062 * all possible combinations of sectors from the different mirrors 1063 * until the checksum verification succeeds. For example, when 1064 * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector 1065 * of mirror #2 is readable but the final checksum test fails, 1066 * then the 2nd sector of mirror #3 could be tried, whether now 1067 * the final checksum succeeds. But this would be a rare 1068 * exception and is therefore not implemented. At least it is 1069 * avoided that the good copy is overwritten. 1070 * A more useful improvement would be to pick the sectors 1071 * without I/O error based on sector sizes (512 bytes on legacy 1072 * disks) instead of on sectorsize. Then maybe 512 byte of one 1073 * mirror could be repaired by taking 512 byte of a different 1074 * mirror, even if other 512 byte sectors in the same sectorsize 1075 * area are unreadable. 
1076 */ 1077 success = 1; 1078 for (sector_num = 0; sector_num < sblock_bad->sector_count; 1079 sector_num++) { 1080 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; 1081 struct scrub_block *sblock_other = NULL; 1082 1083 /* Skip no-io-error sectors in scrub */ 1084 if (!sector_bad->io_error && !sctx->is_dev_replace) 1085 continue; 1086 1087 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) { 1088 /* 1089 * In case of dev replace, if raid56 rebuild process 1090 * didn't work out correct data, then copy the content 1091 * in sblock_bad to make sure target device is identical 1092 * to source device, instead of writing garbage data in 1093 * sblock_for_recheck array to target device. 1094 */ 1095 sblock_other = NULL; 1096 } else if (sector_bad->io_error) { 1097 /* Try to find no-io-error sector in mirrors */ 1098 for (mirror_index = 0; 1099 mirror_index < BTRFS_MAX_MIRRORS && 1100 sblocks_for_recheck[mirror_index].sector_count > 0; 1101 mirror_index++) { 1102 if (!sblocks_for_recheck[mirror_index]. 1103 sectors[sector_num]->io_error) { 1104 sblock_other = sblocks_for_recheck + 1105 mirror_index; 1106 break; 1107 } 1108 } 1109 if (!sblock_other) 1110 success = 0; 1111 } 1112 1113 if (sctx->is_dev_replace) { 1114 /* 1115 * Did not find a mirror to fetch the sector from. 1116 * scrub_write_sector_to_dev_replace() handles this 1117 * case (sector->io_error), by filling the block with 1118 * zeros before submitting the write request 1119 */ 1120 if (!sblock_other) 1121 sblock_other = sblock_bad; 1122 1123 if (scrub_write_sector_to_dev_replace(sblock_other, 1124 sector_num) != 0) { 1125 atomic64_inc( 1126 &fs_info->dev_replace.num_write_errors); 1127 success = 0; 1128 } 1129 } else if (sblock_other) { 1130 ret = scrub_repair_sector_from_good_copy(sblock_bad, 1131 sblock_other, 1132 sector_num, 0); 1133 if (0 == ret) 1134 sector_bad->io_error = 0; 1135 else 1136 success = 0; 1137 } 1138 } 1139 1140 if (success && !sctx->is_dev_replace) { 1141 if (is_metadata || have_csum) { 1142 /* 1143 * need to verify the checksum now that all 1144 * sectors on disk are repaired (the write 1145 * request for data to be repaired is on its way). 1146 * Just be lazy and use scrub_recheck_block() 1147 * which re-reads the data before the checksum 1148 * is verified, but most likely the data comes out 1149 * of the page cache. 
1150 */ 1151 scrub_recheck_block(fs_info, sblock_bad, 1); 1152 if (!sblock_bad->header_error && 1153 !sblock_bad->checksum_error && 1154 sblock_bad->no_io_error_seen) 1155 goto corrected_error; 1156 else 1157 goto did_not_correct_error; 1158 } else { 1159 corrected_error: 1160 spin_lock(&sctx->stat_lock); 1161 sctx->stat.corrected_errors++; 1162 sblock_to_check->data_corrected = 1; 1163 spin_unlock(&sctx->stat_lock); 1164 btrfs_err_rl_in_rcu(fs_info, 1165 "fixed up error at logical %llu on dev %s", 1166 logical, rcu_str_deref(dev->name)); 1167 } 1168 } else { 1169 did_not_correct_error: 1170 spin_lock(&sctx->stat_lock); 1171 sctx->stat.uncorrectable_errors++; 1172 spin_unlock(&sctx->stat_lock); 1173 btrfs_err_rl_in_rcu(fs_info, 1174 "unable to fixup (regular) error at logical %llu on dev %s", 1175 logical, rcu_str_deref(dev->name)); 1176 } 1177 1178 out: 1179 if (sblocks_for_recheck) { 1180 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; 1181 mirror_index++) { 1182 struct scrub_block *sblock = sblocks_for_recheck + 1183 mirror_index; 1184 struct scrub_recover *recover; 1185 int i; 1186 1187 for (i = 0; i < sblock->sector_count; i++) { 1188 sblock->sectors[i]->sblock = NULL; 1189 recover = sblock->sectors[i]->recover; 1190 if (recover) { 1191 scrub_put_recover(fs_info, recover); 1192 sblock->sectors[i]->recover = NULL; 1193 } 1194 scrub_sector_put(sblock->sectors[i]); 1195 } 1196 } 1197 kfree(sblocks_for_recheck); 1198 } 1199 1200 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); 1201 memalloc_nofs_restore(nofs_flag); 1202 if (ret < 0) 1203 return ret; 1204 return 0; 1205 } 1206 1207 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) 1208 { 1209 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) 1210 return 2; 1211 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) 1212 return 3; 1213 else 1214 return (int)bioc->num_stripes; 1215 } 1216 1217 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, 1218 u64 *raid_map, 1219 int nstripes, int mirror, 1220 int *stripe_index, 1221 u64 *stripe_offset) 1222 { 1223 int i; 1224 1225 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 1226 /* RAID5/6 */ 1227 for (i = 0; i < nstripes; i++) { 1228 if (raid_map[i] == RAID6_Q_STRIPE || 1229 raid_map[i] == RAID5_P_STRIPE) 1230 continue; 1231 1232 if (logical >= raid_map[i] && 1233 logical < raid_map[i] + BTRFS_STRIPE_LEN) 1234 break; 1235 } 1236 1237 *stripe_index = i; 1238 *stripe_offset = logical - raid_map[i]; 1239 } else { 1240 /* The other RAID type */ 1241 *stripe_index = mirror; 1242 *stripe_offset = 0; 1243 } 1244 } 1245 1246 static int scrub_setup_recheck_block(struct scrub_block *original_sblock, 1247 struct scrub_block *sblocks_for_recheck) 1248 { 1249 struct scrub_ctx *sctx = original_sblock->sctx; 1250 struct btrfs_fs_info *fs_info = sctx->fs_info; 1251 u64 length = original_sblock->sector_count << fs_info->sectorsize_bits; 1252 u64 logical = original_sblock->sectors[0]->logical; 1253 u64 generation = original_sblock->sectors[0]->generation; 1254 u64 flags = original_sblock->sectors[0]->flags; 1255 u64 have_csum = original_sblock->sectors[0]->have_csum; 1256 struct scrub_recover *recover; 1257 struct btrfs_io_context *bioc; 1258 u64 sublen; 1259 u64 mapped_length; 1260 u64 stripe_offset; 1261 int stripe_index; 1262 int sector_index = 0; 1263 int mirror_index; 1264 int nmirrors; 1265 int ret; 1266 1267 /* 1268 * Note: the two members refs and outstanding_sectors are not used (and 1269 * not set) in the blocks that are used for the recheck 
procedure. 1270 */ 1271 1272 while (length > 0) { 1273 sublen = min_t(u64, length, fs_info->sectorsize); 1274 mapped_length = sublen; 1275 bioc = NULL; 1276 1277 /* 1278 * With a length of sectorsize, each returned stripe represents 1279 * one mirror 1280 */ 1281 btrfs_bio_counter_inc_blocked(fs_info); 1282 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 1283 logical, &mapped_length, &bioc); 1284 if (ret || !bioc || mapped_length < sublen) { 1285 btrfs_put_bioc(bioc); 1286 btrfs_bio_counter_dec(fs_info); 1287 return -EIO; 1288 } 1289 1290 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); 1291 if (!recover) { 1292 btrfs_put_bioc(bioc); 1293 btrfs_bio_counter_dec(fs_info); 1294 return -ENOMEM; 1295 } 1296 1297 refcount_set(&recover->refs, 1); 1298 recover->bioc = bioc; 1299 recover->map_length = mapped_length; 1300 1301 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK); 1302 1303 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); 1304 1305 for (mirror_index = 0; mirror_index < nmirrors; 1306 mirror_index++) { 1307 struct scrub_block *sblock; 1308 struct scrub_sector *sector; 1309 1310 sblock = sblocks_for_recheck + mirror_index; 1311 sblock->sctx = sctx; 1312 1313 sector = kzalloc(sizeof(*sector), GFP_NOFS); 1314 if (!sector) { 1315 leave_nomem: 1316 spin_lock(&sctx->stat_lock); 1317 sctx->stat.malloc_errors++; 1318 spin_unlock(&sctx->stat_lock); 1319 scrub_put_recover(fs_info, recover); 1320 return -ENOMEM; 1321 } 1322 scrub_sector_get(sector); 1323 sblock->sectors[sector_index] = sector; 1324 sector->sblock = sblock; 1325 sector->flags = flags; 1326 sector->generation = generation; 1327 sector->logical = logical; 1328 sector->have_csum = have_csum; 1329 if (have_csum) 1330 memcpy(sector->csum, 1331 original_sblock->sectors[0]->csum, 1332 sctx->fs_info->csum_size); 1333 1334 scrub_stripe_index_and_offset(logical, 1335 bioc->map_type, 1336 bioc->raid_map, 1337 bioc->num_stripes - 1338 bioc->num_tgtdevs, 1339 mirror_index, 1340 &stripe_index, 1341 &stripe_offset); 1342 sector->physical = bioc->stripes[stripe_index].physical + 1343 stripe_offset; 1344 sector->dev = bioc->stripes[stripe_index].dev; 1345 1346 BUG_ON(sector_index >= original_sblock->sector_count); 1347 sector->physical_for_dev_replace = 1348 original_sblock->sectors[sector_index]-> 1349 physical_for_dev_replace; 1350 /* For missing devices, dev->bdev is NULL */ 1351 sector->mirror_num = mirror_index + 1; 1352 sblock->sector_count++; 1353 sector->page = alloc_page(GFP_NOFS); 1354 if (!sector->page) 1355 goto leave_nomem; 1356 1357 scrub_get_recover(recover); 1358 sector->recover = recover; 1359 } 1360 scrub_put_recover(fs_info, recover); 1361 length -= sublen; 1362 logical += sublen; 1363 sector_index++; 1364 } 1365 1366 return 0; 1367 } 1368 1369 static void scrub_bio_wait_endio(struct bio *bio) 1370 { 1371 complete(bio->bi_private); 1372 } 1373 1374 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, 1375 struct bio *bio, 1376 struct scrub_sector *sector) 1377 { 1378 DECLARE_COMPLETION_ONSTACK(done); 1379 1380 bio->bi_iter.bi_sector = sector->logical >> 9; 1381 bio->bi_private = &done; 1382 bio->bi_end_io = scrub_bio_wait_endio; 1383 raid56_parity_recover(bio, sector->recover->bioc, 1384 sector->sblock->sectors[0]->mirror_num, false); 1385 1386 wait_for_completion_io(&done); 1387 return blk_status_to_errno(bio->bi_status); 1388 } 1389 1390 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, 1391 struct scrub_block *sblock) 1392 { 1393 struct scrub_sector 
*first_sector = sblock->sectors[0]; 1394 struct bio *bio; 1395 int i; 1396 1397 /* All sectors in sblock belong to the same stripe on the same device. */ 1398 ASSERT(first_sector->dev); 1399 if (!first_sector->dev->bdev) 1400 goto out; 1401 1402 bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); 1403 1404 for (i = 0; i < sblock->sector_count; i++) { 1405 struct scrub_sector *sector = sblock->sectors[i]; 1406 1407 WARN_ON(!sector->page); 1408 bio_add_page(bio, sector->page, PAGE_SIZE, 0); 1409 } 1410 1411 if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) { 1412 bio_put(bio); 1413 goto out; 1414 } 1415 1416 bio_put(bio); 1417 1418 scrub_recheck_block_checksum(sblock); 1419 1420 return; 1421 out: 1422 for (i = 0; i < sblock->sector_count; i++) 1423 sblock->sectors[i]->io_error = 1; 1424 1425 sblock->no_io_error_seen = 0; 1426 } 1427 1428 /* 1429 * This function will check the on disk data for checksum errors, header errors 1430 * and read I/O errors. If any I/O errors happen, the exact sectors which are 1431 * errored are marked as being bad. The goal is to enable scrub to take those 1432 * sectors that are not errored from all the mirrors so that the sectors that 1433 * are errored in the just handled mirror can be repaired. 1434 */ 1435 static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 1436 struct scrub_block *sblock, 1437 int retry_failed_mirror) 1438 { 1439 int i; 1440 1441 sblock->no_io_error_seen = 1; 1442 1443 /* short cut for raid56 */ 1444 if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0])) 1445 return scrub_recheck_block_on_raid56(fs_info, sblock); 1446 1447 for (i = 0; i < sblock->sector_count; i++) { 1448 struct scrub_sector *sector = sblock->sectors[i]; 1449 struct bio bio; 1450 struct bio_vec bvec; 1451 1452 if (sector->dev->bdev == NULL) { 1453 sector->io_error = 1; 1454 sblock->no_io_error_seen = 0; 1455 continue; 1456 } 1457 1458 WARN_ON(!sector->page); 1459 bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ); 1460 bio_add_page(&bio, sector->page, fs_info->sectorsize, 0); 1461 bio.bi_iter.bi_sector = sector->physical >> 9; 1462 1463 btrfsic_check_bio(&bio); 1464 if (submit_bio_wait(&bio)) { 1465 sector->io_error = 1; 1466 sblock->no_io_error_seen = 0; 1467 } 1468 1469 bio_uninit(&bio); 1470 } 1471 1472 if (sblock->no_io_error_seen) 1473 scrub_recheck_block_checksum(sblock); 1474 } 1475 1476 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector) 1477 { 1478 struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices; 1479 int ret; 1480 1481 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1482 return !ret; 1483 } 1484 1485 static void scrub_recheck_block_checksum(struct scrub_block *sblock) 1486 { 1487 sblock->header_error = 0; 1488 sblock->checksum_error = 0; 1489 sblock->generation_error = 0; 1490 1491 if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA) 1492 scrub_checksum_data(sblock); 1493 else 1494 scrub_checksum_tree_block(sblock); 1495 } 1496 1497 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 1498 struct scrub_block *sblock_good) 1499 { 1500 int i; 1501 int ret = 0; 1502 1503 for (i = 0; i < sblock_bad->sector_count; i++) { 1504 int ret_sub; 1505 1506 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad, 1507 sblock_good, i, 1); 1508 if (ret_sub) 1509 ret = ret_sub; 1510 } 1511 1512 return ret; 1513 } 1514 1515 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, 1516 struct scrub_block *sblock_good, 
1517 int sector_num, int force_write) 1518 { 1519 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; 1520 struct scrub_sector *sector_good = sblock_good->sectors[sector_num]; 1521 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; 1522 const u32 sectorsize = fs_info->sectorsize; 1523 1524 BUG_ON(sector_bad->page == NULL); 1525 BUG_ON(sector_good->page == NULL); 1526 if (force_write || sblock_bad->header_error || 1527 sblock_bad->checksum_error || sector_bad->io_error) { 1528 struct bio bio; 1529 struct bio_vec bvec; 1530 int ret; 1531 1532 if (!sector_bad->dev->bdev) { 1533 btrfs_warn_rl(fs_info, 1534 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); 1535 return -EIO; 1536 } 1537 1538 bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); 1539 bio.bi_iter.bi_sector = sector_bad->physical >> 9; 1540 __bio_add_page(&bio, sector_good->page, sectorsize, 0); 1541 1542 btrfsic_check_bio(&bio); 1543 ret = submit_bio_wait(&bio); 1544 bio_uninit(&bio); 1545 1546 if (ret) { 1547 btrfs_dev_stat_inc_and_print(sector_bad->dev, 1548 BTRFS_DEV_STAT_WRITE_ERRS); 1549 atomic64_inc(&fs_info->dev_replace.num_write_errors); 1550 return -EIO; 1551 } 1552 } 1553 1554 return 0; 1555 } 1556 1557 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) 1558 { 1559 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; 1560 int i; 1561 1562 /* 1563 * This block is used for the check of the parity on the source device, 1564 * so the data needn't be written into the destination device. 1565 */ 1566 if (sblock->sparity) 1567 return; 1568 1569 for (i = 0; i < sblock->sector_count; i++) { 1570 int ret; 1571 1572 ret = scrub_write_sector_to_dev_replace(sblock, i); 1573 if (ret) 1574 atomic64_inc(&fs_info->dev_replace.num_write_errors); 1575 } 1576 } 1577 1578 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num) 1579 { 1580 struct scrub_sector *sector = sblock->sectors[sector_num]; 1581 1582 BUG_ON(sector->page == NULL); 1583 if (sector->io_error) 1584 clear_page(page_address(sector->page)); 1585 1586 return scrub_add_sector_to_wr_bio(sblock->sctx, sector); 1587 } 1588 1589 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) 1590 { 1591 int ret = 0; 1592 u64 length; 1593 1594 if (!btrfs_is_zoned(sctx->fs_info)) 1595 return 0; 1596 1597 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) 1598 return 0; 1599 1600 if (sctx->write_pointer < physical) { 1601 length = physical - sctx->write_pointer; 1602 1603 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, 1604 sctx->write_pointer, length); 1605 if (!ret) 1606 sctx->write_pointer = physical; 1607 } 1608 return ret; 1609 } 1610 1611 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, 1612 struct scrub_sector *sector) 1613 { 1614 struct scrub_bio *sbio; 1615 int ret; 1616 const u32 sectorsize = sctx->fs_info->sectorsize; 1617 1618 mutex_lock(&sctx->wr_lock); 1619 again: 1620 if (!sctx->wr_curr_bio) { 1621 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio), 1622 GFP_KERNEL); 1623 if (!sctx->wr_curr_bio) { 1624 mutex_unlock(&sctx->wr_lock); 1625 return -ENOMEM; 1626 } 1627 sctx->wr_curr_bio->sctx = sctx; 1628 sctx->wr_curr_bio->sector_count = 0; 1629 } 1630 sbio = sctx->wr_curr_bio; 1631 if (sbio->sector_count == 0) { 1632 ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace); 1633 if (ret) { 1634 mutex_unlock(&sctx->wr_lock); 1635 return ret; 1636 } 1637 1638 sbio->physical = sector->physical_for_dev_replace; 1639 sbio->logical 
= sector->logical; 1640 sbio->dev = sctx->wr_tgtdev; 1641 if (!sbio->bio) { 1642 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, 1643 REQ_OP_WRITE, GFP_NOFS); 1644 } 1645 sbio->bio->bi_private = sbio; 1646 sbio->bio->bi_end_io = scrub_wr_bio_end_io; 1647 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; 1648 sbio->status = 0; 1649 } else if (sbio->physical + sbio->sector_count * sectorsize != 1650 sector->physical_for_dev_replace || 1651 sbio->logical + sbio->sector_count * sectorsize != 1652 sector->logical) { 1653 scrub_wr_submit(sctx); 1654 goto again; 1655 } 1656 1657 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); 1658 if (ret != sectorsize) { 1659 if (sbio->sector_count < 1) { 1660 bio_put(sbio->bio); 1661 sbio->bio = NULL; 1662 mutex_unlock(&sctx->wr_lock); 1663 return -EIO; 1664 } 1665 scrub_wr_submit(sctx); 1666 goto again; 1667 } 1668 1669 sbio->sectors[sbio->sector_count] = sector; 1670 scrub_sector_get(sector); 1671 sbio->sector_count++; 1672 if (sbio->sector_count == sctx->sectors_per_bio) 1673 scrub_wr_submit(sctx); 1674 mutex_unlock(&sctx->wr_lock); 1675 1676 return 0; 1677 } 1678 1679 static void scrub_wr_submit(struct scrub_ctx *sctx) 1680 { 1681 struct scrub_bio *sbio; 1682 1683 if (!sctx->wr_curr_bio) 1684 return; 1685 1686 sbio = sctx->wr_curr_bio; 1687 sctx->wr_curr_bio = NULL; 1688 scrub_pending_bio_inc(sctx); 1689 /* process all writes in a single worker thread. Then the block layer 1690 * orders the requests before sending them to the driver which 1691 * doubled the write performance on spinning disks when measured 1692 * with Linux 3.5 */ 1693 btrfsic_check_bio(sbio->bio); 1694 submit_bio(sbio->bio); 1695 1696 if (btrfs_is_zoned(sctx->fs_info)) 1697 sctx->write_pointer = sbio->physical + sbio->sector_count * 1698 sctx->fs_info->sectorsize; 1699 } 1700 1701 static void scrub_wr_bio_end_io(struct bio *bio) 1702 { 1703 struct scrub_bio *sbio = bio->bi_private; 1704 struct btrfs_fs_info *fs_info = sbio->dev->fs_info; 1705 1706 sbio->status = bio->bi_status; 1707 sbio->bio = bio; 1708 1709 INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker); 1710 queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); 1711 } 1712 1713 static void scrub_wr_bio_end_io_worker(struct work_struct *work) 1714 { 1715 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 1716 struct scrub_ctx *sctx = sbio->sctx; 1717 int i; 1718 1719 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); 1720 if (sbio->status) { 1721 struct btrfs_dev_replace *dev_replace = 1722 &sbio->sctx->fs_info->dev_replace; 1723 1724 for (i = 0; i < sbio->sector_count; i++) { 1725 struct scrub_sector *sector = sbio->sectors[i]; 1726 1727 sector->io_error = 1; 1728 atomic64_inc(&dev_replace->num_write_errors); 1729 } 1730 } 1731 1732 for (i = 0; i < sbio->sector_count; i++) 1733 scrub_sector_put(sbio->sectors[i]); 1734 1735 bio_put(sbio->bio); 1736 kfree(sbio); 1737 scrub_pending_bio_dec(sctx); 1738 } 1739 1740 static int scrub_checksum(struct scrub_block *sblock) 1741 { 1742 u64 flags; 1743 int ret; 1744 1745 /* 1746 * No need to initialize these stats currently, 1747 * because this function only use return value 1748 * instead of these stats value. 
1749 * 1750 * Todo: 1751 * always use stats 1752 */ 1753 sblock->header_error = 0; 1754 sblock->generation_error = 0; 1755 sblock->checksum_error = 0; 1756 1757 WARN_ON(sblock->sector_count < 1); 1758 flags = sblock->sectors[0]->flags; 1759 ret = 0; 1760 if (flags & BTRFS_EXTENT_FLAG_DATA) 1761 ret = scrub_checksum_data(sblock); 1762 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1763 ret = scrub_checksum_tree_block(sblock); 1764 else if (flags & BTRFS_EXTENT_FLAG_SUPER) 1765 (void)scrub_checksum_super(sblock); 1766 else 1767 WARN_ON(1); 1768 if (ret) 1769 scrub_handle_errored_block(sblock); 1770 1771 return ret; 1772 } 1773 1774 static int scrub_checksum_data(struct scrub_block *sblock) 1775 { 1776 struct scrub_ctx *sctx = sblock->sctx; 1777 struct btrfs_fs_info *fs_info = sctx->fs_info; 1778 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 1779 u8 csum[BTRFS_CSUM_SIZE]; 1780 struct scrub_sector *sector; 1781 char *kaddr; 1782 1783 BUG_ON(sblock->sector_count < 1); 1784 sector = sblock->sectors[0]; 1785 if (!sector->have_csum) 1786 return 0; 1787 1788 kaddr = page_address(sector->page); 1789 1790 shash->tfm = fs_info->csum_shash; 1791 crypto_shash_init(shash); 1792 1793 /* 1794 * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector 1795 * only contains one sector of data. 1796 */ 1797 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); 1798 1799 if (memcmp(csum, sector->csum, fs_info->csum_size)) 1800 sblock->checksum_error = 1; 1801 return sblock->checksum_error; 1802 } 1803 1804 static int scrub_checksum_tree_block(struct scrub_block *sblock) 1805 { 1806 struct scrub_ctx *sctx = sblock->sctx; 1807 struct btrfs_header *h; 1808 struct btrfs_fs_info *fs_info = sctx->fs_info; 1809 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 1810 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1811 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1812 /* 1813 * This is done in sectorsize steps even for metadata as there's a 1814 * constraint for nodesize to be aligned to sectorsize. This will need 1815 * to change so we don't misuse data and metadata units like that. 
1816 */ 1817 const u32 sectorsize = sctx->fs_info->sectorsize; 1818 const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits; 1819 int i; 1820 struct scrub_sector *sector; 1821 char *kaddr; 1822 1823 BUG_ON(sblock->sector_count < 1); 1824 1825 /* Each member in sectors is just one sector */ 1826 ASSERT(sblock->sector_count == num_sectors); 1827 1828 sector = sblock->sectors[0]; 1829 kaddr = page_address(sector->page); 1830 h = (struct btrfs_header *)kaddr; 1831 memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); 1832 1833 /* 1834 * we don't use the getter functions here, as we 1835 * a) don't have an extent buffer and 1836 * b) the page is already kmapped 1837 */ 1838 if (sector->logical != btrfs_stack_header_bytenr(h)) 1839 sblock->header_error = 1; 1840 1841 if (sector->generation != btrfs_stack_header_generation(h)) { 1842 sblock->header_error = 1; 1843 sblock->generation_error = 1; 1844 } 1845 1846 if (!scrub_check_fsid(h->fsid, sector)) 1847 sblock->header_error = 1; 1848 1849 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1850 BTRFS_UUID_SIZE)) 1851 sblock->header_error = 1; 1852 1853 shash->tfm = fs_info->csum_shash; 1854 crypto_shash_init(shash); 1855 crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, 1856 sectorsize - BTRFS_CSUM_SIZE); 1857 1858 for (i = 1; i < num_sectors; i++) { 1859 kaddr = page_address(sblock->sectors[i]->page); 1860 crypto_shash_update(shash, kaddr, sectorsize); 1861 } 1862 1863 crypto_shash_final(shash, calculated_csum); 1864 if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) 1865 sblock->checksum_error = 1; 1866 1867 return sblock->header_error || sblock->checksum_error; 1868 } 1869 1870 static int scrub_checksum_super(struct scrub_block *sblock) 1871 { 1872 struct btrfs_super_block *s; 1873 struct scrub_ctx *sctx = sblock->sctx; 1874 struct btrfs_fs_info *fs_info = sctx->fs_info; 1875 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 1876 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1877 struct scrub_sector *sector; 1878 char *kaddr; 1879 int fail_gen = 0; 1880 int fail_cor = 0; 1881 1882 BUG_ON(sblock->sector_count < 1); 1883 sector = sblock->sectors[0]; 1884 kaddr = page_address(sector->page); 1885 s = (struct btrfs_super_block *)kaddr; 1886 1887 if (sector->logical != btrfs_super_bytenr(s)) 1888 ++fail_cor; 1889 1890 if (sector->generation != btrfs_super_generation(s)) 1891 ++fail_gen; 1892 1893 if (!scrub_check_fsid(s->fsid, sector)) 1894 ++fail_cor; 1895 1896 shash->tfm = fs_info->csum_shash; 1897 crypto_shash_init(shash); 1898 crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE, 1899 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum); 1900 1901 if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size)) 1902 ++fail_cor; 1903 1904 if (fail_cor + fail_gen) { 1905 /* 1906 * if we find an error in a super block, we just report it. 
1907 * They will get written with the next transaction commit 1908 * anyway 1909 */ 1910 spin_lock(&sctx->stat_lock); 1911 ++sctx->stat.super_errors; 1912 spin_unlock(&sctx->stat_lock); 1913 if (fail_cor) 1914 btrfs_dev_stat_inc_and_print(sector->dev, 1915 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1916 else 1917 btrfs_dev_stat_inc_and_print(sector->dev, 1918 BTRFS_DEV_STAT_GENERATION_ERRS); 1919 } 1920 1921 return fail_cor + fail_gen; 1922 } 1923 1924 static void scrub_block_get(struct scrub_block *sblock) 1925 { 1926 refcount_inc(&sblock->refs); 1927 } 1928 1929 static void scrub_block_put(struct scrub_block *sblock) 1930 { 1931 if (refcount_dec_and_test(&sblock->refs)) { 1932 int i; 1933 1934 if (sblock->sparity) 1935 scrub_parity_put(sblock->sparity); 1936 1937 for (i = 0; i < sblock->sector_count; i++) 1938 scrub_sector_put(sblock->sectors[i]); 1939 kfree(sblock); 1940 } 1941 } 1942 1943 static void scrub_sector_get(struct scrub_sector *sector) 1944 { 1945 atomic_inc(&sector->refs); 1946 } 1947 1948 static void scrub_sector_put(struct scrub_sector *sector) 1949 { 1950 if (atomic_dec_and_test(&sector->refs)) { 1951 if (sector->page) 1952 __free_page(sector->page); 1953 kfree(sector); 1954 } 1955 } 1956 1957 /* 1958 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 1959 * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. 1960 */ 1961 static void scrub_throttle(struct scrub_ctx *sctx) 1962 { 1963 const int time_slice = 1000; 1964 struct scrub_bio *sbio; 1965 struct btrfs_device *device; 1966 s64 delta; 1967 ktime_t now; 1968 u32 div; 1969 u64 bwlimit; 1970 1971 sbio = sctx->bios[sctx->curr]; 1972 device = sbio->dev; 1973 bwlimit = READ_ONCE(device->scrub_speed_max); 1974 if (bwlimit == 0) 1975 return; 1976 1977 /* 1978 * Slice is divided into intervals when the IO is submitted, adjust by 1979 * bwlimit and maximum of 64 intervals. 1980 */ 1981 div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); 1982 div = min_t(u32, 64, div); 1983 1984 /* Start new epoch, set deadline */ 1985 now = ktime_get(); 1986 if (sctx->throttle_deadline == 0) { 1987 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); 1988 sctx->throttle_sent = 0; 1989 } 1990 1991 /* Still in the time to send?
*/ 1992 if (ktime_before(now, sctx->throttle_deadline)) { 1993 /* If current bio is within the limit, send it */ 1994 sctx->throttle_sent += sbio->bio->bi_iter.bi_size; 1995 if (sctx->throttle_sent <= div_u64(bwlimit, div)) 1996 return; 1997 1998 /* We're over the limit, sleep until the rest of the slice */ 1999 delta = ktime_ms_delta(sctx->throttle_deadline, now); 2000 } else { 2001 /* New request after deadline, start new epoch */ 2002 delta = 0; 2003 } 2004 2005 if (delta) { 2006 long timeout; 2007 2008 timeout = div_u64(delta * HZ, 1000); 2009 schedule_timeout_interruptible(timeout); 2010 } 2011 2012 /* Next call will start the deadline period */ 2013 sctx->throttle_deadline = 0; 2014 } 2015 2016 static void scrub_submit(struct scrub_ctx *sctx) 2017 { 2018 struct scrub_bio *sbio; 2019 2020 if (sctx->curr == -1) 2021 return; 2022 2023 scrub_throttle(sctx); 2024 2025 sbio = sctx->bios[sctx->curr]; 2026 sctx->curr = -1; 2027 scrub_pending_bio_inc(sctx); 2028 btrfsic_check_bio(sbio->bio); 2029 submit_bio(sbio->bio); 2030 } 2031 2032 static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx, 2033 struct scrub_sector *sector) 2034 { 2035 struct scrub_block *sblock = sector->sblock; 2036 struct scrub_bio *sbio; 2037 const u32 sectorsize = sctx->fs_info->sectorsize; 2038 int ret; 2039 2040 again: 2041 /* 2042 * grab a fresh bio or wait for one to become available 2043 */ 2044 while (sctx->curr == -1) { 2045 spin_lock(&sctx->list_lock); 2046 sctx->curr = sctx->first_free; 2047 if (sctx->curr != -1) { 2048 sctx->first_free = sctx->bios[sctx->curr]->next_free; 2049 sctx->bios[sctx->curr]->next_free = -1; 2050 sctx->bios[sctx->curr]->sector_count = 0; 2051 spin_unlock(&sctx->list_lock); 2052 } else { 2053 spin_unlock(&sctx->list_lock); 2054 wait_event(sctx->list_wait, sctx->first_free != -1); 2055 } 2056 } 2057 sbio = sctx->bios[sctx->curr]; 2058 if (sbio->sector_count == 0) { 2059 sbio->physical = sector->physical; 2060 sbio->logical = sector->logical; 2061 sbio->dev = sector->dev; 2062 if (!sbio->bio) { 2063 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, 2064 REQ_OP_READ, GFP_NOFS); 2065 } 2066 sbio->bio->bi_private = sbio; 2067 sbio->bio->bi_end_io = scrub_bio_end_io; 2068 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; 2069 sbio->status = 0; 2070 } else if (sbio->physical + sbio->sector_count * sectorsize != 2071 sector->physical || 2072 sbio->logical + sbio->sector_count * sectorsize != 2073 sector->logical || 2074 sbio->dev != sector->dev) { 2075 scrub_submit(sctx); 2076 goto again; 2077 } 2078 2079 sbio->sectors[sbio->sector_count] = sector; 2080 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); 2081 if (ret != sectorsize) { 2082 if (sbio->sector_count < 1) { 2083 bio_put(sbio->bio); 2084 sbio->bio = NULL; 2085 return -EIO; 2086 } 2087 scrub_submit(sctx); 2088 goto again; 2089 } 2090 2091 scrub_block_get(sblock); /* one for the page added to the bio */ 2092 atomic_inc(&sblock->outstanding_sectors); 2093 sbio->sector_count++; 2094 if (sbio->sector_count == sctx->sectors_per_bio) 2095 scrub_submit(sctx); 2096 2097 return 0; 2098 } 2099 2100 static void scrub_missing_raid56_end_io(struct bio *bio) 2101 { 2102 struct scrub_block *sblock = bio->bi_private; 2103 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; 2104 2105 if (bio->bi_status) 2106 sblock->no_io_error_seen = 0; 2107 2108 bio_put(bio); 2109 2110 queue_work(fs_info->scrub_workers, &sblock->work); 2111 } 2112 2113 static void scrub_missing_raid56_worker(struct work_struct *work) 2114 { 2115 struct 
scrub_block *sblock = container_of(work, struct scrub_block, work); 2116 struct scrub_ctx *sctx = sblock->sctx; 2117 struct btrfs_fs_info *fs_info = sctx->fs_info; 2118 u64 logical; 2119 struct btrfs_device *dev; 2120 2121 logical = sblock->sectors[0]->logical; 2122 dev = sblock->sectors[0]->dev; 2123 2124 if (sblock->no_io_error_seen) 2125 scrub_recheck_block_checksum(sblock); 2126 2127 if (!sblock->no_io_error_seen) { 2128 spin_lock(&sctx->stat_lock); 2129 sctx->stat.read_errors++; 2130 spin_unlock(&sctx->stat_lock); 2131 btrfs_err_rl_in_rcu(fs_info, 2132 "IO error rebuilding logical %llu for dev %s", 2133 logical, rcu_str_deref(dev->name)); 2134 } else if (sblock->header_error || sblock->checksum_error) { 2135 spin_lock(&sctx->stat_lock); 2136 sctx->stat.uncorrectable_errors++; 2137 spin_unlock(&sctx->stat_lock); 2138 btrfs_err_rl_in_rcu(fs_info, 2139 "failed to rebuild valid logical %llu for dev %s", 2140 logical, rcu_str_deref(dev->name)); 2141 } else { 2142 scrub_write_block_to_dev_replace(sblock); 2143 } 2144 2145 if (sctx->is_dev_replace && sctx->flush_all_writes) { 2146 mutex_lock(&sctx->wr_lock); 2147 scrub_wr_submit(sctx); 2148 mutex_unlock(&sctx->wr_lock); 2149 } 2150 2151 scrub_block_put(sblock); 2152 scrub_pending_bio_dec(sctx); 2153 } 2154 2155 static void scrub_missing_raid56_pages(struct scrub_block *sblock) 2156 { 2157 struct scrub_ctx *sctx = sblock->sctx; 2158 struct btrfs_fs_info *fs_info = sctx->fs_info; 2159 u64 length = sblock->sector_count << fs_info->sectorsize_bits; 2160 u64 logical = sblock->sectors[0]->logical; 2161 struct btrfs_io_context *bioc = NULL; 2162 struct bio *bio; 2163 struct btrfs_raid_bio *rbio; 2164 int ret; 2165 int i; 2166 2167 btrfs_bio_counter_inc_blocked(fs_info); 2168 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, 2169 &length, &bioc); 2170 if (ret || !bioc || !bioc->raid_map) 2171 goto bioc_out; 2172 2173 if (WARN_ON(!sctx->is_dev_replace || 2174 !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { 2175 /* 2176 * We shouldn't be scrubbing a missing device. Even for dev 2177 * replace, we should only get here for RAID 5/6. We either 2178 * managed to mount something with no mirrors remaining or 2179 * there's a bug in scrub_find_good_copy()/btrfs_map_block(). 2180 */ 2181 goto bioc_out; 2182 } 2183 2184 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); 2185 bio->bi_iter.bi_sector = logical >> 9; 2186 bio->bi_private = sblock; 2187 bio->bi_end_io = scrub_missing_raid56_end_io; 2188 2189 rbio = raid56_alloc_missing_rbio(bio, bioc); 2190 if (!rbio) 2191 goto rbio_out; 2192 2193 for (i = 0; i < sblock->sector_count; i++) { 2194 struct scrub_sector *sector = sblock->sectors[i]; 2195 2196 /* 2197 * For now, our scrub is still one page per sector, so pgoff 2198 * is always 0. 
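 * (scrub_sectors() and scrub_sectors_for_parity() back every scrub_sector
 * with its own page from alloc_page(), so the data always starts at offset
 * 0 within that page.)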
2199 */ 2200 raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical); 2201 } 2202 2203 INIT_WORK(&sblock->work, scrub_missing_raid56_worker); 2204 scrub_block_get(sblock); 2205 scrub_pending_bio_inc(sctx); 2206 raid56_submit_missing_rbio(rbio); 2207 return; 2208 2209 rbio_out: 2210 bio_put(bio); 2211 bioc_out: 2212 btrfs_bio_counter_dec(fs_info); 2213 btrfs_put_bioc(bioc); 2214 spin_lock(&sctx->stat_lock); 2215 sctx->stat.malloc_errors++; 2216 spin_unlock(&sctx->stat_lock); 2217 } 2218 2219 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, 2220 u64 physical, struct btrfs_device *dev, u64 flags, 2221 u64 gen, int mirror_num, u8 *csum, 2222 u64 physical_for_dev_replace) 2223 { 2224 struct scrub_block *sblock; 2225 const u32 sectorsize = sctx->fs_info->sectorsize; 2226 int index; 2227 2228 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); 2229 if (!sblock) { 2230 spin_lock(&sctx->stat_lock); 2231 sctx->stat.malloc_errors++; 2232 spin_unlock(&sctx->stat_lock); 2233 return -ENOMEM; 2234 } 2235 2236 /* one ref inside this function, plus one for each page added to 2237 * a bio later on */ 2238 refcount_set(&sblock->refs, 1); 2239 sblock->sctx = sctx; 2240 sblock->no_io_error_seen = 1; 2241 2242 for (index = 0; len > 0; index++) { 2243 struct scrub_sector *sector; 2244 /* 2245 * Here we will allocate one page for one sector to scrub. 2246 * This is fine if PAGE_SIZE == sectorsize, but will cost 2247 * more memory for PAGE_SIZE > sectorsize case. 2248 */ 2249 u32 l = min(sectorsize, len); 2250 2251 sector = kzalloc(sizeof(*sector), GFP_KERNEL); 2252 if (!sector) { 2253 leave_nomem: 2254 spin_lock(&sctx->stat_lock); 2255 sctx->stat.malloc_errors++; 2256 spin_unlock(&sctx->stat_lock); 2257 scrub_block_put(sblock); 2258 return -ENOMEM; 2259 } 2260 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); 2261 scrub_sector_get(sector); 2262 sblock->sectors[index] = sector; 2263 sector->sblock = sblock; 2264 sector->dev = dev; 2265 sector->flags = flags; 2266 sector->generation = gen; 2267 sector->logical = logical; 2268 sector->physical = physical; 2269 sector->physical_for_dev_replace = physical_for_dev_replace; 2270 sector->mirror_num = mirror_num; 2271 if (csum) { 2272 sector->have_csum = 1; 2273 memcpy(sector->csum, csum, sctx->fs_info->csum_size); 2274 } else { 2275 sector->have_csum = 0; 2276 } 2277 sblock->sector_count++; 2278 sector->page = alloc_page(GFP_KERNEL); 2279 if (!sector->page) 2280 goto leave_nomem; 2281 len -= l; 2282 logical += l; 2283 physical += l; 2284 physical_for_dev_replace += l; 2285 } 2286 2287 WARN_ON(sblock->sector_count == 0); 2288 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { 2289 /* 2290 * This case should only be hit for RAID 5/6 device replace. See 2291 * the comment in scrub_missing_raid56_pages() for details. 
2292 */ 2293 scrub_missing_raid56_pages(sblock); 2294 } else { 2295 for (index = 0; index < sblock->sector_count; index++) { 2296 struct scrub_sector *sector = sblock->sectors[index]; 2297 int ret; 2298 2299 ret = scrub_add_sector_to_rd_bio(sctx, sector); 2300 if (ret) { 2301 scrub_block_put(sblock); 2302 return ret; 2303 } 2304 } 2305 2306 if (flags & BTRFS_EXTENT_FLAG_SUPER) 2307 scrub_submit(sctx); 2308 } 2309 2310 /* last one frees, either here or in bio completion for last page */ 2311 scrub_block_put(sblock); 2312 return 0; 2313 } 2314 2315 static void scrub_bio_end_io(struct bio *bio) 2316 { 2317 struct scrub_bio *sbio = bio->bi_private; 2318 struct btrfs_fs_info *fs_info = sbio->dev->fs_info; 2319 2320 sbio->status = bio->bi_status; 2321 sbio->bio = bio; 2322 2323 queue_work(fs_info->scrub_workers, &sbio->work); 2324 } 2325 2326 static void scrub_bio_end_io_worker(struct work_struct *work) 2327 { 2328 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2329 struct scrub_ctx *sctx = sbio->sctx; 2330 int i; 2331 2332 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); 2333 if (sbio->status) { 2334 for (i = 0; i < sbio->sector_count; i++) { 2335 struct scrub_sector *sector = sbio->sectors[i]; 2336 2337 sector->io_error = 1; 2338 sector->sblock->no_io_error_seen = 0; 2339 } 2340 } 2341 2342 /* Now complete the scrub_block items that have all pages completed */ 2343 for (i = 0; i < sbio->sector_count; i++) { 2344 struct scrub_sector *sector = sbio->sectors[i]; 2345 struct scrub_block *sblock = sector->sblock; 2346 2347 if (atomic_dec_and_test(&sblock->outstanding_sectors)) 2348 scrub_block_complete(sblock); 2349 scrub_block_put(sblock); 2350 } 2351 2352 bio_put(sbio->bio); 2353 sbio->bio = NULL; 2354 spin_lock(&sctx->list_lock); 2355 sbio->next_free = sctx->first_free; 2356 sctx->first_free = sbio->index; 2357 spin_unlock(&sctx->list_lock); 2358 2359 if (sctx->is_dev_replace && sctx->flush_all_writes) { 2360 mutex_lock(&sctx->wr_lock); 2361 scrub_wr_submit(sctx); 2362 mutex_unlock(&sctx->wr_lock); 2363 } 2364 2365 scrub_pending_bio_dec(sctx); 2366 } 2367 2368 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, 2369 unsigned long *bitmap, 2370 u64 start, u32 len) 2371 { 2372 u64 offset; 2373 u32 nsectors; 2374 u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits; 2375 2376 if (len >= sparity->stripe_len) { 2377 bitmap_set(bitmap, 0, sparity->nsectors); 2378 return; 2379 } 2380 2381 start -= sparity->logic_start; 2382 start = div64_u64_rem(start, sparity->stripe_len, &offset); 2383 offset = offset >> sectorsize_bits; 2384 nsectors = len >> sectorsize_bits; 2385 2386 if (offset + nsectors <= sparity->nsectors) { 2387 bitmap_set(bitmap, offset, nsectors); 2388 return; 2389 } 2390 2391 bitmap_set(bitmap, offset, sparity->nsectors - offset); 2392 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset)); 2393 } 2394 2395 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, 2396 u64 start, u32 len) 2397 { 2398 __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len); 2399 } 2400 2401 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, 2402 u64 start, u32 len) 2403 { 2404 __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len); 2405 } 2406 2407 static void scrub_block_complete(struct scrub_block *sblock) 2408 { 2409 int corrupted = 0; 2410 2411 if (!sblock->no_io_error_seen) { 2412 corrupted = 1; 2413 scrub_handle_errored_block(sblock); 2414 } else { 2415 /* 2416 * if has checksum 
error, write via repair mechanism in 2417 * dev replace case, otherwise write here in dev replace 2418 * case. 2419 */ 2420 corrupted = scrub_checksum(sblock); 2421 if (!corrupted && sblock->sctx->is_dev_replace) 2422 scrub_write_block_to_dev_replace(sblock); 2423 } 2424 2425 if (sblock->sparity && corrupted && !sblock->data_corrected) { 2426 u64 start = sblock->sectors[0]->logical; 2427 u64 end = sblock->sectors[sblock->sector_count - 1]->logical + 2428 sblock->sctx->fs_info->sectorsize; 2429 2430 ASSERT(end - start <= U32_MAX); 2431 scrub_parity_mark_sectors_error(sblock->sparity, 2432 start, end - start); 2433 } 2434 } 2435 2436 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum) 2437 { 2438 sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits; 2439 list_del(&sum->list); 2440 kfree(sum); 2441 } 2442 2443 /* 2444 * Find the desired csum for range [logical, logical + sectorsize), and store 2445 * the csum into @csum. 2446 * 2447 * The search source is sctx->csum_list, which is a pre-populated list 2448 * storing bytenr ordered csum ranges. We're responsible to cleanup any range 2449 * that is before @logical. 2450 * 2451 * Return 0 if there is no csum for the range. 2452 * Return 1 if there is csum for the range and copied to @csum. 2453 */ 2454 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) 2455 { 2456 bool found = false; 2457 2458 while (!list_empty(&sctx->csum_list)) { 2459 struct btrfs_ordered_sum *sum = NULL; 2460 unsigned long index; 2461 unsigned long num_sectors; 2462 2463 sum = list_first_entry(&sctx->csum_list, 2464 struct btrfs_ordered_sum, list); 2465 /* The current csum range is beyond our range, no csum found */ 2466 if (sum->bytenr > logical) 2467 break; 2468 2469 /* 2470 * The current sum is before our bytenr, since scrub is always 2471 * done in bytenr order, the csum will never be used anymore, 2472 * clean it up so that later calls won't bother with the range, 2473 * and continue search the next range. 
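 * Each btrfs_ordered_sum covers the byte range [bytenr, bytenr + len), so
 * an entry whose end is at or before @logical can be dropped safely.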
2474 */ 2475 if (sum->bytenr + sum->len <= logical) { 2476 drop_csum_range(sctx, sum); 2477 continue; 2478 } 2479 2480 /* Now the csum range covers our bytenr, copy the csum */ 2481 found = true; 2482 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits; 2483 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits; 2484 2485 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size, 2486 sctx->fs_info->csum_size); 2487 2488 /* Cleanup the range if we're at the end of the csum range */ 2489 if (index == num_sectors - 1) 2490 drop_csum_range(sctx, sum); 2491 break; 2492 } 2493 if (!found) 2494 return 0; 2495 return 1; 2496 } 2497 2498 /* scrub extent tries to collect up to 64 kB for each bio */ 2499 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, 2500 u64 logical, u32 len, 2501 u64 physical, struct btrfs_device *dev, u64 flags, 2502 u64 gen, int mirror_num) 2503 { 2504 struct btrfs_device *src_dev = dev; 2505 u64 src_physical = physical; 2506 int src_mirror = mirror_num; 2507 int ret; 2508 u8 csum[BTRFS_CSUM_SIZE]; 2509 u32 blocksize; 2510 2511 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2512 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 2513 blocksize = map->stripe_len; 2514 else 2515 blocksize = sctx->fs_info->sectorsize; 2516 spin_lock(&sctx->stat_lock); 2517 sctx->stat.data_extents_scrubbed++; 2518 sctx->stat.data_bytes_scrubbed += len; 2519 spin_unlock(&sctx->stat_lock); 2520 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2521 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 2522 blocksize = map->stripe_len; 2523 else 2524 blocksize = sctx->fs_info->nodesize; 2525 spin_lock(&sctx->stat_lock); 2526 sctx->stat.tree_extents_scrubbed++; 2527 sctx->stat.tree_bytes_scrubbed += len; 2528 spin_unlock(&sctx->stat_lock); 2529 } else { 2530 blocksize = sctx->fs_info->sectorsize; 2531 WARN_ON(1); 2532 } 2533 2534 /* 2535 * For dev-replace case, we can have @dev being a missing device. 2536 * Regular scrub will avoid its execution on missing device at all, 2537 * as that would trigger tons of read error. 2538 * 2539 * Reading from missing device will cause read error counts to 2540 * increase unnecessarily. 2541 * So here we change the read source to a good mirror. 2542 */ 2543 if (sctx->is_dev_replace && !dev->bdev) 2544 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical, 2545 &src_dev, &src_mirror); 2546 while (len) { 2547 u32 l = min(len, blocksize); 2548 int have_csum = 0; 2549 2550 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2551 /* push csums to sbio */ 2552 have_csum = scrub_find_csum(sctx, logical, csum); 2553 if (have_csum == 0) 2554 ++sctx->stat.no_csum; 2555 } 2556 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev, 2557 flags, gen, src_mirror, 2558 have_csum ? 
csum : NULL, physical); 2559 if (ret) 2560 return ret; 2561 len -= l; 2562 logical += l; 2563 physical += l; 2564 src_physical += l; 2565 } 2566 return 0; 2567 } 2568 2569 static int scrub_sectors_for_parity(struct scrub_parity *sparity, 2570 u64 logical, u32 len, 2571 u64 physical, struct btrfs_device *dev, 2572 u64 flags, u64 gen, int mirror_num, u8 *csum) 2573 { 2574 struct scrub_ctx *sctx = sparity->sctx; 2575 struct scrub_block *sblock; 2576 const u32 sectorsize = sctx->fs_info->sectorsize; 2577 int index; 2578 2579 ASSERT(IS_ALIGNED(len, sectorsize)); 2580 2581 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); 2582 if (!sblock) { 2583 spin_lock(&sctx->stat_lock); 2584 sctx->stat.malloc_errors++; 2585 spin_unlock(&sctx->stat_lock); 2586 return -ENOMEM; 2587 } 2588 2589 /* one ref inside this function, plus one for each page added to 2590 * a bio later on */ 2591 refcount_set(&sblock->refs, 1); 2592 sblock->sctx = sctx; 2593 sblock->no_io_error_seen = 1; 2594 sblock->sparity = sparity; 2595 scrub_parity_get(sparity); 2596 2597 for (index = 0; len > 0; index++) { 2598 struct scrub_sector *sector; 2599 2600 sector = kzalloc(sizeof(*sector), GFP_KERNEL); 2601 if (!sector) { 2602 leave_nomem: 2603 spin_lock(&sctx->stat_lock); 2604 sctx->stat.malloc_errors++; 2605 spin_unlock(&sctx->stat_lock); 2606 scrub_block_put(sblock); 2607 return -ENOMEM; 2608 } 2609 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); 2610 /* For scrub block */ 2611 scrub_sector_get(sector); 2612 sblock->sectors[index] = sector; 2613 /* For scrub parity */ 2614 scrub_sector_get(sector); 2615 list_add_tail(&sector->list, &sparity->sectors_list); 2616 sector->sblock = sblock; 2617 sector->dev = dev; 2618 sector->flags = flags; 2619 sector->generation = gen; 2620 sector->logical = logical; 2621 sector->physical = physical; 2622 sector->mirror_num = mirror_num; 2623 if (csum) { 2624 sector->have_csum = 1; 2625 memcpy(sector->csum, csum, sctx->fs_info->csum_size); 2626 } else { 2627 sector->have_csum = 0; 2628 } 2629 sblock->sector_count++; 2630 sector->page = alloc_page(GFP_KERNEL); 2631 if (!sector->page) 2632 goto leave_nomem; 2633 2634 2635 /* Iterate over the stripe range in sectorsize steps */ 2636 len -= sectorsize; 2637 logical += sectorsize; 2638 physical += sectorsize; 2639 } 2640 2641 WARN_ON(sblock->sector_count == 0); 2642 for (index = 0; index < sblock->sector_count; index++) { 2643 struct scrub_sector *sector = sblock->sectors[index]; 2644 int ret; 2645 2646 ret = scrub_add_sector_to_rd_bio(sctx, sector); 2647 if (ret) { 2648 scrub_block_put(sblock); 2649 return ret; 2650 } 2651 } 2652 2653 /* Last one frees, either here or in bio completion for last sector */ 2654 scrub_block_put(sblock); 2655 return 0; 2656 } 2657 2658 static int scrub_extent_for_parity(struct scrub_parity *sparity, 2659 u64 logical, u32 len, 2660 u64 physical, struct btrfs_device *dev, 2661 u64 flags, u64 gen, int mirror_num) 2662 { 2663 struct scrub_ctx *sctx = sparity->sctx; 2664 int ret; 2665 u8 csum[BTRFS_CSUM_SIZE]; 2666 u32 blocksize; 2667 2668 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { 2669 scrub_parity_mark_sectors_error(sparity, logical, len); 2670 return 0; 2671 } 2672 2673 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2674 blocksize = sparity->stripe_len; 2675 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2676 blocksize = sparity->stripe_len; 2677 } else { 2678 blocksize = sctx->fs_info->sectorsize; 2679 WARN_ON(1); 2680 } 2681 2682 while (len) { 2683 u32 l = min(len, blocksize); 2684 int have_csum = 0; 2685 2686 if (flags &
BTRFS_EXTENT_FLAG_DATA) { 2687 /* push csums to sbio */ 2688 have_csum = scrub_find_csum(sctx, logical, csum); 2689 if (have_csum == 0) 2690 goto skip; 2691 } 2692 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev, 2693 flags, gen, mirror_num, 2694 have_csum ? csum : NULL); 2695 if (ret) 2696 return ret; 2697 skip: 2698 len -= l; 2699 logical += l; 2700 physical += l; 2701 } 2702 return 0; 2703 } 2704 2705 /* 2706 * Given a physical address, this will calculate it's 2707 * logical offset. if this is a parity stripe, it will return 2708 * the most left data stripe's logical offset. 2709 * 2710 * return 0 if it is a data stripe, 1 means parity stripe. 2711 */ 2712 static int get_raid56_logic_offset(u64 physical, int num, 2713 struct map_lookup *map, u64 *offset, 2714 u64 *stripe_start) 2715 { 2716 int i; 2717 int j = 0; 2718 u64 stripe_nr; 2719 u64 last_offset; 2720 u32 stripe_index; 2721 u32 rot; 2722 const int data_stripes = nr_data_stripes(map); 2723 2724 last_offset = (physical - map->stripes[num].physical) * data_stripes; 2725 if (stripe_start) 2726 *stripe_start = last_offset; 2727 2728 *offset = last_offset; 2729 for (i = 0; i < data_stripes; i++) { 2730 *offset = last_offset + i * map->stripe_len; 2731 2732 stripe_nr = div64_u64(*offset, map->stripe_len); 2733 stripe_nr = div_u64(stripe_nr, data_stripes); 2734 2735 /* Work out the disk rotation on this stripe-set */ 2736 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); 2737 /* calculate which stripe this data locates */ 2738 rot += i; 2739 stripe_index = rot % map->num_stripes; 2740 if (stripe_index == num) 2741 return 0; 2742 if (stripe_index < num) 2743 j++; 2744 } 2745 *offset = last_offset + j * map->stripe_len; 2746 return 1; 2747 } 2748 2749 static void scrub_free_parity(struct scrub_parity *sparity) 2750 { 2751 struct scrub_ctx *sctx = sparity->sctx; 2752 struct scrub_sector *curr, *next; 2753 int nbits; 2754 2755 nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors); 2756 if (nbits) { 2757 spin_lock(&sctx->stat_lock); 2758 sctx->stat.read_errors += nbits; 2759 sctx->stat.uncorrectable_errors += nbits; 2760 spin_unlock(&sctx->stat_lock); 2761 } 2762 2763 list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) { 2764 list_del_init(&curr->list); 2765 scrub_sector_put(curr); 2766 } 2767 2768 kfree(sparity); 2769 } 2770 2771 static void scrub_parity_bio_endio_worker(struct work_struct *work) 2772 { 2773 struct scrub_parity *sparity = container_of(work, struct scrub_parity, 2774 work); 2775 struct scrub_ctx *sctx = sparity->sctx; 2776 2777 scrub_free_parity(sparity); 2778 scrub_pending_bio_dec(sctx); 2779 } 2780 2781 static void scrub_parity_bio_endio(struct bio *bio) 2782 { 2783 struct scrub_parity *sparity = bio->bi_private; 2784 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; 2785 2786 if (bio->bi_status) 2787 bitmap_or(&sparity->ebitmap, &sparity->ebitmap, 2788 &sparity->dbitmap, sparity->nsectors); 2789 2790 bio_put(bio); 2791 2792 INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker); 2793 queue_work(fs_info->scrub_parity_workers, &sparity->work); 2794 } 2795 2796 static void scrub_parity_check_and_repair(struct scrub_parity *sparity) 2797 { 2798 struct scrub_ctx *sctx = sparity->sctx; 2799 struct btrfs_fs_info *fs_info = sctx->fs_info; 2800 struct bio *bio; 2801 struct btrfs_raid_bio *rbio; 2802 struct btrfs_io_context *bioc = NULL; 2803 u64 length; 2804 int ret; 2805 2806 if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap, 2807 &sparity->ebitmap, 
sparity->nsectors)) 2808 goto out; 2809 2810 length = sparity->logic_end - sparity->logic_start; 2811 2812 btrfs_bio_counter_inc_blocked(fs_info); 2813 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, 2814 &length, &bioc); 2815 if (ret || !bioc || !bioc->raid_map) 2816 goto bioc_out; 2817 2818 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); 2819 bio->bi_iter.bi_sector = sparity->logic_start >> 9; 2820 bio->bi_private = sparity; 2821 bio->bi_end_io = scrub_parity_bio_endio; 2822 2823 rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, 2824 sparity->scrub_dev, 2825 &sparity->dbitmap, 2826 sparity->nsectors); 2827 if (!rbio) 2828 goto rbio_out; 2829 2830 scrub_pending_bio_inc(sctx); 2831 raid56_parity_submit_scrub_rbio(rbio); 2832 return; 2833 2834 rbio_out: 2835 bio_put(bio); 2836 bioc_out: 2837 btrfs_bio_counter_dec(fs_info); 2838 btrfs_put_bioc(bioc); 2839 bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap, 2840 sparity->nsectors); 2841 spin_lock(&sctx->stat_lock); 2842 sctx->stat.malloc_errors++; 2843 spin_unlock(&sctx->stat_lock); 2844 out: 2845 scrub_free_parity(sparity); 2846 } 2847 2848 static void scrub_parity_get(struct scrub_parity *sparity) 2849 { 2850 refcount_inc(&sparity->refs); 2851 } 2852 2853 static void scrub_parity_put(struct scrub_parity *sparity) 2854 { 2855 if (!refcount_dec_and_test(&sparity->refs)) 2856 return; 2857 2858 scrub_parity_check_and_repair(sparity); 2859 } 2860 2861 /* 2862 * Return 0 if the extent item range covers any byte of the range. 2863 * Return <0 if the extent item is before @search_start. 2864 * Return >0 if the extent item is after @start_start + @search_len. 2865 */ 2866 static int compare_extent_item_range(struct btrfs_path *path, 2867 u64 search_start, u64 search_len) 2868 { 2869 struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info; 2870 u64 len; 2871 struct btrfs_key key; 2872 2873 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2874 ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || 2875 key.type == BTRFS_METADATA_ITEM_KEY); 2876 if (key.type == BTRFS_METADATA_ITEM_KEY) 2877 len = fs_info->nodesize; 2878 else 2879 len = key.offset; 2880 2881 if (key.objectid + len <= search_start) 2882 return -1; 2883 if (key.objectid >= search_start + search_len) 2884 return 1; 2885 return 0; 2886 } 2887 2888 /* 2889 * Locate one extent item which covers any byte in range 2890 * [@search_start, @search_start + @search_length) 2891 * 2892 * If the path is not initialized, we will initialize the search by doing 2893 * a btrfs_search_slot(). 2894 * If the path is already initialized, we will use the path as the initial 2895 * slot, to avoid duplicated btrfs_search_slot() calls. 2896 * 2897 * NOTE: If an extent item starts before @search_start, we will still 2898 * return the extent item. This is for data extent crossing stripe boundary. 2899 * 2900 * Return 0 if we found such extent item, and @path will point to the extent item. 2901 * Return >0 if no such extent item can be found, and @path will be released. 2902 * Return <0 if hit fatal error, and @path will be released. 
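 * Callers walk a logical range by advancing @search_start past each extent
 * item they process while passing the same @path back in, and release the
 * path themselves once the whole range has been handled.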
2903 */ 2904 static int find_first_extent_item(struct btrfs_root *extent_root, 2905 struct btrfs_path *path, 2906 u64 search_start, u64 search_len) 2907 { 2908 struct btrfs_fs_info *fs_info = extent_root->fs_info; 2909 struct btrfs_key key; 2910 int ret; 2911 2912 /* Continue using the existing path */ 2913 if (path->nodes[0]) 2914 goto search_forward; 2915 2916 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2917 key.type = BTRFS_METADATA_ITEM_KEY; 2918 else 2919 key.type = BTRFS_EXTENT_ITEM_KEY; 2920 key.objectid = search_start; 2921 key.offset = (u64)-1; 2922 2923 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 2924 if (ret < 0) 2925 return ret; 2926 2927 ASSERT(ret > 0); 2928 /* 2929 * Here we intentionally pass 0 as @min_objectid, as there could be 2930 * an extent item starting before @search_start. 2931 */ 2932 ret = btrfs_previous_extent_item(extent_root, path, 0); 2933 if (ret < 0) 2934 return ret; 2935 /* 2936 * No matter whether we have found an extent item, the next loop will 2937 * properly do every check on the key. 2938 */ 2939 search_forward: 2940 while (true) { 2941 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2942 if (key.objectid >= search_start + search_len) 2943 break; 2944 if (key.type != BTRFS_METADATA_ITEM_KEY && 2945 key.type != BTRFS_EXTENT_ITEM_KEY) 2946 goto next; 2947 2948 ret = compare_extent_item_range(path, search_start, search_len); 2949 if (ret == 0) 2950 return ret; 2951 if (ret > 0) 2952 break; 2953 next: 2954 path->slots[0]++; 2955 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 2956 ret = btrfs_next_leaf(extent_root, path); 2957 if (ret) { 2958 /* Either no more item or fatal error */ 2959 btrfs_release_path(path); 2960 return ret; 2961 } 2962 } 2963 } 2964 btrfs_release_path(path); 2965 return 1; 2966 } 2967 2968 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, 2969 u64 *size_ret, u64 *flags_ret, u64 *generation_ret) 2970 { 2971 struct btrfs_key key; 2972 struct btrfs_extent_item *ei; 2973 2974 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 2975 ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || 2976 key.type == BTRFS_EXTENT_ITEM_KEY); 2977 *extent_start_ret = key.objectid; 2978 if (key.type == BTRFS_METADATA_ITEM_KEY) 2979 *size_ret = path->nodes[0]->fs_info->nodesize; 2980 else 2981 *size_ret = key.offset; 2982 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); 2983 *flags_ret = btrfs_extent_flags(path->nodes[0], ei); 2984 *generation_ret = btrfs_extent_generation(path->nodes[0], ei); 2985 } 2986 2987 static bool does_range_cross_boundary(u64 extent_start, u64 extent_len, 2988 u64 boundary_start, u64 boudary_len) 2989 { 2990 return (extent_start < boundary_start && 2991 extent_start + extent_len > boundary_start) || 2992 (extent_start < boundary_start + boudary_len && 2993 extent_start + extent_len > boundary_start + boudary_len); 2994 } 2995 2996 static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, 2997 struct scrub_parity *sparity, 2998 struct map_lookup *map, 2999 struct btrfs_device *sdev, 3000 struct btrfs_path *path, 3001 u64 logical) 3002 { 3003 struct btrfs_fs_info *fs_info = sctx->fs_info; 3004 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); 3005 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical); 3006 u64 cur_logical = logical; 3007 int ret; 3008 3009 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); 3010 3011 /* Path must not be populated */ 3012 ASSERT(!path->nodes[0]); 3013 
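/*
 * Walk each extent item that intersects this data stripe: mark the covered
 * sectors in the parity dbitmap, collect their data csums and queue reads
 * of the data so the parity can be checked against it later.
 */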
3014 while (cur_logical < logical + map->stripe_len) { 3015 struct btrfs_io_context *bioc = NULL; 3016 struct btrfs_device *extent_dev; 3017 u64 extent_start; 3018 u64 extent_size; 3019 u64 mapped_length; 3020 u64 extent_flags; 3021 u64 extent_gen; 3022 u64 extent_physical; 3023 u64 extent_mirror_num; 3024 3025 ret = find_first_extent_item(extent_root, path, cur_logical, 3026 logical + map->stripe_len - cur_logical); 3027 /* No more extent item in this data stripe */ 3028 if (ret > 0) { 3029 ret = 0; 3030 break; 3031 } 3032 if (ret < 0) 3033 break; 3034 get_extent_info(path, &extent_start, &extent_size, &extent_flags, 3035 &extent_gen); 3036 3037 /* Metadata should not cross stripe boundaries */ 3038 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && 3039 does_range_cross_boundary(extent_start, extent_size, 3040 logical, map->stripe_len)) { 3041 btrfs_err(fs_info, 3042 "scrub: tree block %llu spanning stripes, ignored. logical=%llu", 3043 extent_start, logical); 3044 spin_lock(&sctx->stat_lock); 3045 sctx->stat.uncorrectable_errors++; 3046 spin_unlock(&sctx->stat_lock); 3047 cur_logical += extent_size; 3048 continue; 3049 } 3050 3051 /* Skip hole range which doesn't have any extent */ 3052 cur_logical = max(extent_start, cur_logical); 3053 3054 /* Truncate the range inside this data stripe */ 3055 extent_size = min(extent_start + extent_size, 3056 logical + map->stripe_len) - cur_logical; 3057 extent_start = cur_logical; 3058 ASSERT(extent_size <= U32_MAX); 3059 3060 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size); 3061 3062 mapped_length = extent_size; 3063 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start, 3064 &mapped_length, &bioc, 0); 3065 if (!ret && (!bioc || mapped_length < extent_size)) 3066 ret = -EIO; 3067 if (ret) { 3068 btrfs_put_bioc(bioc); 3069 scrub_parity_mark_sectors_error(sparity, extent_start, 3070 extent_size); 3071 break; 3072 } 3073 extent_physical = bioc->stripes[0].physical; 3074 extent_mirror_num = bioc->mirror_num; 3075 extent_dev = bioc->stripes[0].dev; 3076 btrfs_put_bioc(bioc); 3077 3078 ret = btrfs_lookup_csums_range(csum_root, extent_start, 3079 extent_start + extent_size - 1, 3080 &sctx->csum_list, 1); 3081 if (ret) { 3082 scrub_parity_mark_sectors_error(sparity, extent_start, 3083 extent_size); 3084 break; 3085 } 3086 3087 ret = scrub_extent_for_parity(sparity, extent_start, 3088 extent_size, extent_physical, 3089 extent_dev, extent_flags, 3090 extent_gen, extent_mirror_num); 3091 scrub_free_csums(sctx); 3092 3093 if (ret) { 3094 scrub_parity_mark_sectors_error(sparity, extent_start, 3095 extent_size); 3096 break; 3097 } 3098 3099 cond_resched(); 3100 cur_logical += extent_size; 3101 } 3102 btrfs_release_path(path); 3103 return ret; 3104 } 3105 3106 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, 3107 struct map_lookup *map, 3108 struct btrfs_device *sdev, 3109 u64 logic_start, 3110 u64 logic_end) 3111 { 3112 struct btrfs_fs_info *fs_info = sctx->fs_info; 3113 struct btrfs_path *path; 3114 u64 cur_logical; 3115 int ret; 3116 struct scrub_parity *sparity; 3117 int nsectors; 3118 3119 path = btrfs_alloc_path(); 3120 if (!path) { 3121 spin_lock(&sctx->stat_lock); 3122 sctx->stat.malloc_errors++; 3123 spin_unlock(&sctx->stat_lock); 3124 return -ENOMEM; 3125 } 3126 path->search_commit_root = 1; 3127 path->skip_locking = 1; 3128 3129 ASSERT(map->stripe_len <= U32_MAX); 3130 nsectors = map->stripe_len >> fs_info->sectorsize_bits; 3131 ASSERT(nsectors <= BITS_PER_LONG); 3132 sparity = kzalloc(sizeof(struct 
scrub_parity), GFP_NOFS); 3133 if (!sparity) { 3134 spin_lock(&sctx->stat_lock); 3135 sctx->stat.malloc_errors++; 3136 spin_unlock(&sctx->stat_lock); 3137 btrfs_free_path(path); 3138 return -ENOMEM; 3139 } 3140 3141 ASSERT(map->stripe_len <= U32_MAX); 3142 sparity->stripe_len = map->stripe_len; 3143 sparity->nsectors = nsectors; 3144 sparity->sctx = sctx; 3145 sparity->scrub_dev = sdev; 3146 sparity->logic_start = logic_start; 3147 sparity->logic_end = logic_end; 3148 refcount_set(&sparity->refs, 1); 3149 INIT_LIST_HEAD(&sparity->sectors_list); 3150 3151 ret = 0; 3152 for (cur_logical = logic_start; cur_logical < logic_end; 3153 cur_logical += map->stripe_len) { 3154 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map, 3155 sdev, path, cur_logical); 3156 if (ret < 0) 3157 break; 3158 } 3159 3160 scrub_parity_put(sparity); 3161 scrub_submit(sctx); 3162 mutex_lock(&sctx->wr_lock); 3163 scrub_wr_submit(sctx); 3164 mutex_unlock(&sctx->wr_lock); 3165 3166 btrfs_free_path(path); 3167 return ret < 0 ? ret : 0; 3168 } 3169 3170 static void sync_replace_for_zoned(struct scrub_ctx *sctx) 3171 { 3172 if (!btrfs_is_zoned(sctx->fs_info)) 3173 return; 3174 3175 sctx->flush_all_writes = true; 3176 scrub_submit(sctx); 3177 mutex_lock(&sctx->wr_lock); 3178 scrub_wr_submit(sctx); 3179 mutex_unlock(&sctx->wr_lock); 3180 3181 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 3182 } 3183 3184 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, 3185 u64 physical, u64 physical_end) 3186 { 3187 struct btrfs_fs_info *fs_info = sctx->fs_info; 3188 int ret = 0; 3189 3190 if (!btrfs_is_zoned(fs_info)) 3191 return 0; 3192 3193 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 3194 3195 mutex_lock(&sctx->wr_lock); 3196 if (sctx->write_pointer < physical_end) { 3197 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, 3198 physical, 3199 sctx->write_pointer); 3200 if (ret) 3201 btrfs_err(fs_info, 3202 "zoned: failed to recover write pointer"); 3203 } 3204 mutex_unlock(&sctx->wr_lock); 3205 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); 3206 3207 return ret; 3208 } 3209 3210 /* 3211 * Scrub one range which can only has simple mirror based profile. 3212 * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in 3213 * RAID0/RAID10). 3214 * 3215 * Since we may need to handle a subset of block group, we need @logical_start 3216 * and @logical_length parameter. 3217 */ 3218 static int scrub_simple_mirror(struct scrub_ctx *sctx, 3219 struct btrfs_root *extent_root, 3220 struct btrfs_root *csum_root, 3221 struct btrfs_block_group *bg, 3222 struct map_lookup *map, 3223 u64 logical_start, u64 logical_length, 3224 struct btrfs_device *device, 3225 u64 physical, int mirror_num) 3226 { 3227 struct btrfs_fs_info *fs_info = sctx->fs_info; 3228 const u64 logical_end = logical_start + logical_length; 3229 /* An artificial limit, inherit from old scrub behavior */ 3230 const u32 max_length = SZ_64K; 3231 struct btrfs_path path = { 0 }; 3232 u64 cur_logical = logical_start; 3233 int ret; 3234 3235 /* The range must be inside the bg */ 3236 ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); 3237 3238 path.search_commit_root = 1; 3239 path.skip_locking = 1; 3240 /* Go through each extent items inside the logical range */ 3241 while (cur_logical < logical_end) { 3242 u64 extent_start; 3243 u64 extent_len; 3244 u64 extent_flags; 3245 u64 extent_gen; 3246 u64 scrub_len; 3247 3248 /* Canceled? 
*/ 3249 if (atomic_read(&fs_info->scrub_cancel_req) || 3250 atomic_read(&sctx->cancel_req)) { 3251 ret = -ECANCELED; 3252 break; 3253 } 3254 /* Paused? */ 3255 if (atomic_read(&fs_info->scrub_pause_req)) { 3256 /* Push queued extents */ 3257 sctx->flush_all_writes = true; 3258 scrub_submit(sctx); 3259 mutex_lock(&sctx->wr_lock); 3260 scrub_wr_submit(sctx); 3261 mutex_unlock(&sctx->wr_lock); 3262 wait_event(sctx->list_wait, 3263 atomic_read(&sctx->bios_in_flight) == 0); 3264 sctx->flush_all_writes = false; 3265 scrub_blocked_if_needed(fs_info); 3266 } 3267 /* Block group removed? */ 3268 spin_lock(&bg->lock); 3269 if (bg->removed) { 3270 spin_unlock(&bg->lock); 3271 ret = 0; 3272 break; 3273 } 3274 spin_unlock(&bg->lock); 3275 3276 ret = find_first_extent_item(extent_root, &path, cur_logical, 3277 logical_end - cur_logical); 3278 if (ret > 0) { 3279 /* No more extent, just update the accounting */ 3280 sctx->stat.last_physical = physical + logical_length; 3281 ret = 0; 3282 break; 3283 } 3284 if (ret < 0) 3285 break; 3286 get_extent_info(&path, &extent_start, &extent_len, 3287 &extent_flags, &extent_gen); 3288 /* Skip hole range which doesn't have any extent */ 3289 cur_logical = max(extent_start, cur_logical); 3290 3291 /* 3292 * Scrub len has three limits: 3293 * - Extent size limit 3294 * - Scrub range limit 3295 * This is especially imporatant for RAID0/RAID10 to reuse 3296 * this function 3297 * - Max scrub size limit 3298 */ 3299 scrub_len = min(min(extent_start + extent_len, 3300 logical_end), cur_logical + max_length) - 3301 cur_logical; 3302 3303 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { 3304 ret = btrfs_lookup_csums_range(csum_root, cur_logical, 3305 cur_logical + scrub_len - 1, 3306 &sctx->csum_list, 1); 3307 if (ret) 3308 break; 3309 } 3310 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && 3311 does_range_cross_boundary(extent_start, extent_len, 3312 logical_start, logical_length)) { 3313 btrfs_err(fs_info, 3314 "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)", 3315 extent_start, logical_start, logical_end); 3316 spin_lock(&sctx->stat_lock); 3317 sctx->stat.uncorrectable_errors++; 3318 spin_unlock(&sctx->stat_lock); 3319 cur_logical += scrub_len; 3320 continue; 3321 } 3322 ret = scrub_extent(sctx, map, cur_logical, scrub_len, 3323 cur_logical - logical_start + physical, 3324 device, extent_flags, extent_gen, 3325 mirror_num); 3326 scrub_free_csums(sctx); 3327 if (ret) 3328 break; 3329 if (sctx->is_dev_replace) 3330 sync_replace_for_zoned(sctx); 3331 cur_logical += scrub_len; 3332 /* Don't hold CPU for too long time */ 3333 cond_resched(); 3334 } 3335 btrfs_release_path(&path); 3336 return ret; 3337 } 3338 3339 /* Calculate the full stripe length for simple stripe based profiles */ 3340 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) 3341 { 3342 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3343 BTRFS_BLOCK_GROUP_RAID10)); 3344 3345 return map->num_stripes / map->sub_stripes * map->stripe_len; 3346 } 3347 3348 /* Get the logical bytenr for the stripe */ 3349 static u64 simple_stripe_get_logical(struct map_lookup *map, 3350 struct btrfs_block_group *bg, 3351 int stripe_index) 3352 { 3353 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3354 BTRFS_BLOCK_GROUP_RAID10)); 3355 ASSERT(stripe_index < map->num_stripes); 3356 3357 /* 3358 * (stripe_index / sub_stripes) gives how many data stripes we need to 3359 * skip. 
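 * For example, on RAID10 with sub_stripes == 2, stripe_index 0 and 1 are
 * the two copies of the first data stripe (offset 0 inside the chunk),
 * stripe_index 2 and 3 are the copies of the second (offset stripe_len),
 * and so on.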
3360 */ 3361 return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start; 3362 } 3363 3364 /* Get the mirror number for the stripe */ 3365 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) 3366 { 3367 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3368 BTRFS_BLOCK_GROUP_RAID10)); 3369 ASSERT(stripe_index < map->num_stripes); 3370 3371 /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */ 3372 return stripe_index % map->sub_stripes + 1; 3373 } 3374 3375 static int scrub_simple_stripe(struct scrub_ctx *sctx, 3376 struct btrfs_root *extent_root, 3377 struct btrfs_root *csum_root, 3378 struct btrfs_block_group *bg, 3379 struct map_lookup *map, 3380 struct btrfs_device *device, 3381 int stripe_index) 3382 { 3383 const u64 logical_increment = simple_stripe_full_stripe_len(map); 3384 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); 3385 const u64 orig_physical = map->stripes[stripe_index].physical; 3386 const int mirror_num = simple_stripe_mirror_num(map, stripe_index); 3387 u64 cur_logical = orig_logical; 3388 u64 cur_physical = orig_physical; 3389 int ret = 0; 3390 3391 while (cur_logical < bg->start + bg->length) { 3392 /* 3393 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is 3394 * just RAID1, so we can reuse scrub_simple_mirror() to scrub 3395 * this stripe. 3396 */ 3397 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map, 3398 cur_logical, map->stripe_len, device, 3399 cur_physical, mirror_num); 3400 if (ret) 3401 return ret; 3402 /* Skip to next stripe which belongs to the target device */ 3403 cur_logical += logical_increment; 3404 /* For physical offset, we just go to next stripe */ 3405 cur_physical += map->stripe_len; 3406 } 3407 return ret; 3408 } 3409 3410 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 3411 struct btrfs_block_group *bg, 3412 struct extent_map *em, 3413 struct btrfs_device *scrub_dev, 3414 int stripe_index) 3415 { 3416 struct btrfs_path *path; 3417 struct btrfs_fs_info *fs_info = sctx->fs_info; 3418 struct btrfs_root *root; 3419 struct btrfs_root *csum_root; 3420 struct blk_plug plug; 3421 struct map_lookup *map = em->map_lookup; 3422 const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; 3423 const u64 chunk_logical = bg->start; 3424 int ret; 3425 u64 physical = map->stripes[stripe_index].physical; 3426 const u64 dev_stripe_len = btrfs_calc_stripe_length(em); 3427 const u64 physical_end = physical + dev_stripe_len; 3428 u64 logical; 3429 u64 logic_end; 3430 /* The logical increment after finishing one stripe */ 3431 u64 increment; 3432 /* Offset inside the chunk */ 3433 u64 offset; 3434 u64 stripe_logical; 3435 u64 stripe_end; 3436 int stop_loop = 0; 3437 3438 path = btrfs_alloc_path(); 3439 if (!path) 3440 return -ENOMEM; 3441 3442 /* 3443 * work on commit root. The related disk blocks are static as 3444 * long as COW is applied. This means, it is save to rewrite 3445 * them to repair disk errors without any race conditions 3446 */ 3447 path->search_commit_root = 1; 3448 path->skip_locking = 1; 3449 path->reada = READA_FORWARD; 3450 3451 wait_event(sctx->list_wait, 3452 atomic_read(&sctx->bios_in_flight) == 0); 3453 scrub_blocked_if_needed(fs_info); 3454 3455 root = btrfs_extent_root(fs_info, bg->start); 3456 csum_root = btrfs_csum_root(fs_info, bg->start); 3457 3458 /* 3459 * collect all data csums for the stripe to avoid seeking during 3460 * the scrub. 
This might currently (crc32) end up to be about 1MB 3461 */ 3462 blk_start_plug(&plug); 3463 3464 if (sctx->is_dev_replace && 3465 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { 3466 mutex_lock(&sctx->wr_lock); 3467 sctx->write_pointer = physical; 3468 mutex_unlock(&sctx->wr_lock); 3469 sctx->flush_all_writes = true; 3470 } 3471 3472 /* 3473 * There used to be a big double loop to handle all profiles using the 3474 * same routine, which grows larger and more gross over time. 3475 * 3476 * So here we handle each profile differently, so simpler profiles 3477 * have simpler scrubbing function. 3478 */ 3479 if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 | 3480 BTRFS_BLOCK_GROUP_RAID56_MASK))) { 3481 /* 3482 * Above check rules out all complex profile, the remaining 3483 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple 3484 * mirrored duplication without stripe. 3485 * 3486 * Only @physical and @mirror_num needs to calculated using 3487 * @stripe_index. 3488 */ 3489 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, 3490 bg->start, bg->length, scrub_dev, 3491 map->stripes[stripe_index].physical, 3492 stripe_index + 1); 3493 offset = 0; 3494 goto out; 3495 } 3496 if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { 3497 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map, 3498 scrub_dev, stripe_index); 3499 offset = map->stripe_len * (stripe_index / map->sub_stripes); 3500 goto out; 3501 } 3502 3503 /* Only RAID56 goes through the old code */ 3504 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); 3505 ret = 0; 3506 3507 /* Calculate the logical end of the stripe */ 3508 get_raid56_logic_offset(physical_end, stripe_index, 3509 map, &logic_end, NULL); 3510 logic_end += chunk_logical; 3511 3512 /* Initialize @offset in case we need to go to out: label */ 3513 get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); 3514 increment = map->stripe_len * nr_data_stripes(map); 3515 3516 /* 3517 * Due to the rotation, for RAID56 it's better to iterate each stripe 3518 * using their physical offset. 3519 */ 3520 while (physical < physical_end) { 3521 ret = get_raid56_logic_offset(physical, stripe_index, map, 3522 &logical, &stripe_logical); 3523 logical += chunk_logical; 3524 if (ret) { 3525 /* it is parity strip */ 3526 stripe_logical += chunk_logical; 3527 stripe_end = stripe_logical + increment; 3528 ret = scrub_raid56_parity(sctx, map, scrub_dev, 3529 stripe_logical, 3530 stripe_end); 3531 if (ret) 3532 goto out; 3533 goto next; 3534 } 3535 3536 /* 3537 * Now we're at a data stripe, scrub each extents in the range. 3538 * 3539 * At this stage, if we ignore the repair part, inside each data 3540 * stripe it is no different than SINGLE profile. 3541 * We can reuse scrub_simple_mirror() here, as the repair part 3542 * is still based on @mirror_num. 
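 * Each pass below therefore scrubs one stripe_len worth of data from this
 * device at mirror 1; parity stripes were already dispatched to
 * scrub_raid56_parity() above.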
3543 */ 3544 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, 3545 logical, map->stripe_len, 3546 scrub_dev, physical, 1); 3547 if (ret < 0) 3548 goto out; 3549 next: 3550 logical += increment; 3551 physical += map->stripe_len; 3552 spin_lock(&sctx->stat_lock); 3553 if (stop_loop) 3554 sctx->stat.last_physical = 3555 map->stripes[stripe_index].physical + dev_stripe_len; 3556 else 3557 sctx->stat.last_physical = physical; 3558 spin_unlock(&sctx->stat_lock); 3559 if (stop_loop) 3560 break; 3561 } 3562 out: 3563 /* push queued extents */ 3564 scrub_submit(sctx); 3565 mutex_lock(&sctx->wr_lock); 3566 scrub_wr_submit(sctx); 3567 mutex_unlock(&sctx->wr_lock); 3568 3569 blk_finish_plug(&plug); 3570 btrfs_free_path(path); 3571 3572 if (sctx->is_dev_replace && ret >= 0) { 3573 int ret2; 3574 3575 ret2 = sync_write_pointer_for_zoned(sctx, 3576 chunk_logical + offset, 3577 map->stripes[stripe_index].physical, 3578 physical_end); 3579 if (ret2) 3580 ret = ret2; 3581 } 3582 3583 return ret < 0 ? ret : 0; 3584 } 3585 3586 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, 3587 struct btrfs_block_group *bg, 3588 struct btrfs_device *scrub_dev, 3589 u64 dev_offset, 3590 u64 dev_extent_len) 3591 { 3592 struct btrfs_fs_info *fs_info = sctx->fs_info; 3593 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 3594 struct map_lookup *map; 3595 struct extent_map *em; 3596 int i; 3597 int ret = 0; 3598 3599 read_lock(&map_tree->lock); 3600 em = lookup_extent_mapping(map_tree, bg->start, bg->length); 3601 read_unlock(&map_tree->lock); 3602 3603 if (!em) { 3604 /* 3605 * Might have been an unused block group deleted by the cleaner 3606 * kthread or relocation. 3607 */ 3608 spin_lock(&bg->lock); 3609 if (!bg->removed) 3610 ret = -EINVAL; 3611 spin_unlock(&bg->lock); 3612 3613 return ret; 3614 } 3615 if (em->start != bg->start) 3616 goto out; 3617 if (em->len < dev_extent_len) 3618 goto out; 3619 3620 map = em->map_lookup; 3621 for (i = 0; i < map->num_stripes; ++i) { 3622 if (map->stripes[i].dev->bdev == scrub_dev->bdev && 3623 map->stripes[i].physical == dev_offset) { 3624 ret = scrub_stripe(sctx, bg, em, scrub_dev, i); 3625 if (ret) 3626 goto out; 3627 } 3628 } 3629 out: 3630 free_extent_map(em); 3631 3632 return ret; 3633 } 3634 3635 static int finish_extent_writes_for_zoned(struct btrfs_root *root, 3636 struct btrfs_block_group *cache) 3637 { 3638 struct btrfs_fs_info *fs_info = cache->fs_info; 3639 struct btrfs_trans_handle *trans; 3640 3641 if (!btrfs_is_zoned(fs_info)) 3642 return 0; 3643 3644 btrfs_wait_block_group_reservations(cache); 3645 btrfs_wait_nocow_writers(cache); 3646 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length); 3647 3648 trans = btrfs_join_transaction(root); 3649 if (IS_ERR(trans)) 3650 return PTR_ERR(trans); 3651 return btrfs_commit_transaction(trans); 3652 } 3653 3654 static noinline_for_stack 3655 int scrub_enumerate_chunks(struct scrub_ctx *sctx, 3656 struct btrfs_device *scrub_dev, u64 start, u64 end) 3657 { 3658 struct btrfs_dev_extent *dev_extent = NULL; 3659 struct btrfs_path *path; 3660 struct btrfs_fs_info *fs_info = sctx->fs_info; 3661 struct btrfs_root *root = fs_info->dev_root; 3662 u64 chunk_offset; 3663 int ret = 0; 3664 int ro_set; 3665 int slot; 3666 struct extent_buffer *l; 3667 struct btrfs_key key; 3668 struct btrfs_key found_key; 3669 struct btrfs_block_group *cache; 3670 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 3671 3672 path = btrfs_alloc_path(); 3673 if (!path) 3674 return -ENOMEM; 3675 3676 
path->reada = READA_FORWARD; 3677 path->search_commit_root = 1; 3678 path->skip_locking = 1; 3679 3680 key.objectid = scrub_dev->devid; 3681 key.offset = 0ull; 3682 key.type = BTRFS_DEV_EXTENT_KEY; 3683 3684 while (1) { 3685 u64 dev_extent_len; 3686 3687 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3688 if (ret < 0) 3689 break; 3690 if (ret > 0) { 3691 if (path->slots[0] >= 3692 btrfs_header_nritems(path->nodes[0])) { 3693 ret = btrfs_next_leaf(root, path); 3694 if (ret < 0) 3695 break; 3696 if (ret > 0) { 3697 ret = 0; 3698 break; 3699 } 3700 } else { 3701 ret = 0; 3702 } 3703 } 3704 3705 l = path->nodes[0]; 3706 slot = path->slots[0]; 3707 3708 btrfs_item_key_to_cpu(l, &found_key, slot); 3709 3710 if (found_key.objectid != scrub_dev->devid) 3711 break; 3712 3713 if (found_key.type != BTRFS_DEV_EXTENT_KEY) 3714 break; 3715 3716 if (found_key.offset >= end) 3717 break; 3718 3719 if (found_key.offset < key.offset) 3720 break; 3721 3722 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 3723 dev_extent_len = btrfs_dev_extent_length(l, dev_extent); 3724 3725 if (found_key.offset + dev_extent_len <= start) 3726 goto skip; 3727 3728 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 3729 3730 /* 3731 * get a reference on the corresponding block group to prevent 3732 * the chunk from going away while we scrub it 3733 */ 3734 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3735 3736 /* some chunks are removed but not committed to disk yet, 3737 * continue scrubbing */ 3738 if (!cache) 3739 goto skip; 3740 3741 ASSERT(cache->start <= chunk_offset); 3742 /* 3743 * We are using the commit root to search for device extents, so 3744 * that means we could have found a device extent item from a 3745 * block group that was deleted in the current transaction. The 3746 * logical start offset of the deleted block group, stored at 3747 * @chunk_offset, might be part of the logical address range of 3748 * a new block group (which uses different physical extents). 3749 * In this case btrfs_lookup_block_group() has returned the new 3750 * block group, and its start address is less than @chunk_offset. 3751 * 3752 * We skip such new block groups, because it's pointless to 3753 * process them, as we won't find their extents because we search 3754 * for them using the commit root of the extent tree. For a device 3755 * replace it's also fine to skip it, we won't miss copying them 3756 * to the target device because we have the write duplication 3757 * setup through the regular write path (by btrfs_map_block()), 3758 * and we have committed a transaction when we started the device 3759 * replace, right after setting up the device replace state. 3760 */ 3761 if (cache->start < chunk_offset) { 3762 btrfs_put_block_group(cache); 3763 goto skip; 3764 } 3765 3766 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) { 3767 spin_lock(&cache->lock); 3768 if (!cache->to_copy) { 3769 spin_unlock(&cache->lock); 3770 btrfs_put_block_group(cache); 3771 goto skip; 3772 } 3773 spin_unlock(&cache->lock); 3774 } 3775 3776 /* 3777 * Make sure that while we are scrubbing the corresponding block 3778 * group doesn't get its logical address and its device extents 3779 * reused for another block group, which can possibly be of a 3780 * different type and different profile. We do this to prevent 3781 * false error detections and crashes due to bogus attempts to 3782 * repair extents. 
3783 */
3784 spin_lock(&cache->lock);
3785 if (cache->removed) {
3786 spin_unlock(&cache->lock);
3787 btrfs_put_block_group(cache);
3788 goto skip;
3789 }
3790 btrfs_freeze_block_group(cache);
3791 spin_unlock(&cache->lock);
3792
3793 /*
3794 * We need to call btrfs_inc_block_group_ro() with the scrub paused,
3795 * to avoid a deadlock caused by:
3796 * btrfs_inc_block_group_ro()
3797 * -> btrfs_wait_for_commit()
3798 * -> btrfs_commit_transaction()
3799 * -> btrfs_scrub_pause()
3800 */
3801 scrub_pause_on(fs_info);
3802
3803 /*
3804 * Don't do chunk preallocation for scrub.
3805 *
3806 * This is especially important for SYSTEM bgs, or we can hit
3807 * -EFBIG from btrfs_finish_chunk_alloc() like:
3808 * 1. The only SYSTEM bg is marked RO.
3809 * Since SYSTEM bgs are small, that's pretty common.
3810 * 2. A new SYSTEM bg will be allocated
3811 * Because the regular chunk allocation path will allocate a new chunk.
3812 * 3. The new SYSTEM bg is empty and will get cleaned up
3813 * Before cleanup really happens, it's marked RO again.
3814 * 4. The empty SYSTEM bg gets scrubbed
3815 * We go back to 2.
3816 *
3817 * This can easily boost the number of SYSTEM chunks if the cleaner
3818 * thread can't be triggered fast enough, using up all the space
3819 * of btrfs_super_block::sys_chunk_array.
3820 *
3821 * For dev replace, however, we need to try our best to mark the block
3822 * group RO, to prevent a race between:
3823 * - Write duplication
3824 * Contains latest data
3825 * - Scrub copy
3826 * Contains data from commit tree
3827 *
3828 * If the target block group is not marked RO, nocow writes can
3829 * be overwritten by the scrub copy, causing data corruption.
3830 * So for dev-replace, it's not allowed to continue if a block
3831 * group is not RO.
3832 */
3833 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3834 if (!ret && sctx->is_dev_replace) {
3835 ret = finish_extent_writes_for_zoned(root, cache);
3836 if (ret) {
3837 btrfs_dec_block_group_ro(cache);
3838 scrub_pause_off(fs_info);
3839 btrfs_put_block_group(cache);
3840 break;
3841 }
3842 }
3843
3844 if (ret == 0) {
3845 ro_set = 1;
3846 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3847 /*
3848 * btrfs_inc_block_group_ro() returns -ENOSPC when it
3849 * fails to create a new chunk for metadata.
3850 * That is not a problem for scrub, because
3851 * metadata is always COWed, and our scrub has paused
3852 * transaction commits.
3853 */
3854 ro_set = 0;
3855 } else if (ret == -ETXTBSY) {
3856 btrfs_warn(fs_info,
3857 "skipping scrub of block group %llu due to active swapfile",
3858 cache->start);
3859 scrub_pause_off(fs_info);
3860 ret = 0;
3861 goto skip_unfreeze;
3862 } else {
3863 btrfs_warn(fs_info,
3864 "failed setting block group ro: %d", ret);
3865 btrfs_unfreeze_block_group(cache);
3866 btrfs_put_block_group(cache);
3867 scrub_pause_off(fs_info);
3868 break;
3869 }
3870
3871 /*
3872 * Now the target block group is marked RO, wait for nocow writes to
3873 * finish before dev-replace.
3874 * COW is fine, as COW never overwrites extents in the commit tree.
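 * Only NOCOW writes (preallocated extents or inodes with NODATACOW set)
 * rewrite existing extents in place, which is why those are the only
 * writes we have to wait for here.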
3875 */
3876 if (sctx->is_dev_replace) {
3877 btrfs_wait_nocow_writers(cache);
3878 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3879 cache->length);
3880 }
3881
3882 scrub_pause_off(fs_info);
3883 down_write(&dev_replace->rwsem);
3884 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3885 dev_replace->cursor_left = found_key.offset;
3886 dev_replace->item_needs_writeback = 1;
3887 up_write(&dev_replace->rwsem);
3888
3889 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3890 dev_extent_len);
3891
3892 /*
3893 * Flush and submit all pending read and write bios, then
3894 * wait for them.
3895 * Note that in the dev replace case, a read request causes
3896 * write requests that are submitted in the read completion
3897 * worker. Therefore in the current situation, it is required
3898 * that all write requests are flushed, so that all read and
3899 * write requests are really completed when bios_in_flight
3900 * changes to 0.
3901 */
3902 sctx->flush_all_writes = true;
3903 scrub_submit(sctx);
3904 mutex_lock(&sctx->wr_lock);
3905 scrub_wr_submit(sctx);
3906 mutex_unlock(&sctx->wr_lock);
3907
3908 wait_event(sctx->list_wait,
3909 atomic_read(&sctx->bios_in_flight) == 0);
3910
3911 scrub_pause_on(fs_info);
3912
3913 /*
3914 * This must be called before we decrease @scrubs_paused.
3915 * Make sure we don't block transaction commit while
3916 * we are waiting for pending workers to finish.
3917 */
3918 wait_event(sctx->list_wait,
3919 atomic_read(&sctx->workers_pending) == 0);
3920 sctx->flush_all_writes = false;
3921
3922 scrub_pause_off(fs_info);
3923
3924 if (sctx->is_dev_replace &&
3925 !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3926 cache, found_key.offset))
3927 ro_set = 0;
3928
3929 down_write(&dev_replace->rwsem);
3930 dev_replace->cursor_left = dev_replace->cursor_right;
3931 dev_replace->item_needs_writeback = 1;
3932 up_write(&dev_replace->rwsem);
3933
3934 if (ro_set)
3935 btrfs_dec_block_group_ro(cache);
3936
3937 /*
3938 * We might have prevented the cleaner kthread from deleting
3939 * this block group if it was already unused because we raced
3940 * and set it to RO mode first. So add it back to the unused
3941 * list, otherwise it might not ever be deleted unless a manual
3942 * balance is triggered or it becomes used and unused again.
3943 */
3944 spin_lock(&cache->lock);
3945 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3946 cache->used == 0) {
3947 spin_unlock(&cache->lock);
3948 if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3949 btrfs_discard_queue_work(&fs_info->discard_ctl,
3950 cache);
3951 else
3952 btrfs_mark_bg_unused(cache);
3953 } else {
3954 spin_unlock(&cache->lock);
3955 }
3956 skip_unfreeze:
3957 btrfs_unfreeze_block_group(cache);
3958 btrfs_put_block_group(cache);
3959 if (ret)
3960 break;
3961 if (sctx->is_dev_replace &&
3962 atomic64_read(&dev_replace->num_write_errors) > 0) {
3963 ret = -EIO;
3964 break;
3965 }
3966 if (sctx->stat.malloc_errors > 0) {
3967 ret = -ENOMEM;
3968 break;
3969 }
3970 skip:
3971 key.offset = found_key.offset + dev_extent_len;
3972 btrfs_release_path(path);
3973 }
3974
3975 btrfs_free_path(path);
3976
3977 return ret;
3978 }
3979
3980 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3981 struct btrfs_device *scrub_dev)
3982 {
3983 int i;
3984 u64 bytenr;
3985 u64 gen;
3986 int ret;
3987 struct btrfs_fs_info *fs_info = sctx->fs_info;
3988
3989 if (BTRFS_FS_ERROR(fs_info))
3990 return -EROFS;
3991
3992 /* Seed devices of a new filesystem have their own generation.
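 * Use the generation recorded in the seed device itself to verify its
 * super blocks; for a device that belongs to this filesystem, use the
 * last committed transaction id instead (see the assignment of gen
 * right below).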
*/ 3993 if (scrub_dev->fs_devices != fs_info->fs_devices) 3994 gen = scrub_dev->generation; 3995 else 3996 gen = fs_info->last_trans_committed; 3997 3998 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 3999 bytenr = btrfs_sb_offset(i); 4000 if (bytenr + BTRFS_SUPER_INFO_SIZE > 4001 scrub_dev->commit_total_bytes) 4002 break; 4003 if (!btrfs_check_super_location(scrub_dev, bytenr)) 4004 continue; 4005 4006 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 4007 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, 4008 NULL, bytenr); 4009 if (ret) 4010 return ret; 4011 } 4012 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 4013 4014 return 0; 4015 } 4016 4017 static void scrub_workers_put(struct btrfs_fs_info *fs_info) 4018 { 4019 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, 4020 &fs_info->scrub_lock)) { 4021 struct workqueue_struct *scrub_workers = fs_info->scrub_workers; 4022 struct workqueue_struct *scrub_wr_comp = 4023 fs_info->scrub_wr_completion_workers; 4024 struct workqueue_struct *scrub_parity = 4025 fs_info->scrub_parity_workers; 4026 4027 fs_info->scrub_workers = NULL; 4028 fs_info->scrub_wr_completion_workers = NULL; 4029 fs_info->scrub_parity_workers = NULL; 4030 mutex_unlock(&fs_info->scrub_lock); 4031 4032 if (scrub_workers) 4033 destroy_workqueue(scrub_workers); 4034 if (scrub_wr_comp) 4035 destroy_workqueue(scrub_wr_comp); 4036 if (scrub_parity) 4037 destroy_workqueue(scrub_parity); 4038 } 4039 } 4040 4041 /* 4042 * get a reference count on fs_info->scrub_workers. start worker if necessary 4043 */ 4044 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, 4045 int is_dev_replace) 4046 { 4047 struct workqueue_struct *scrub_workers = NULL; 4048 struct workqueue_struct *scrub_wr_comp = NULL; 4049 struct workqueue_struct *scrub_parity = NULL; 4050 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; 4051 int max_active = fs_info->thread_pool_size; 4052 int ret = -ENOMEM; 4053 4054 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) 4055 return 0; 4056 4057 scrub_workers = alloc_workqueue("btrfs-scrub", flags, 4058 is_dev_replace ? 
1 : max_active);
4059 if (!scrub_workers)
4060 goto fail_scrub_workers;
4061
4062 scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4063 if (!scrub_wr_comp)
4064 goto fail_scrub_wr_completion_workers;
4065
4066 scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4067 if (!scrub_parity)
4068 goto fail_scrub_parity_workers;
4069
4070 mutex_lock(&fs_info->scrub_lock);
4071 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4072 ASSERT(fs_info->scrub_workers == NULL &&
4073 fs_info->scrub_wr_completion_workers == NULL &&
4074 fs_info->scrub_parity_workers == NULL);
4075 fs_info->scrub_workers = scrub_workers;
4076 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4077 fs_info->scrub_parity_workers = scrub_parity;
4078 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4079 mutex_unlock(&fs_info->scrub_lock);
4080 return 0;
4081 }
4082 /* Another thread raced in and created the workers for us */
4083 refcount_inc(&fs_info->scrub_workers_refcnt);
4084 mutex_unlock(&fs_info->scrub_lock);
4085
4086 ret = 0;
4087 destroy_workqueue(scrub_parity);
4088 fail_scrub_parity_workers:
4089 destroy_workqueue(scrub_wr_comp);
4090 fail_scrub_wr_completion_workers:
4091 destroy_workqueue(scrub_workers);
4092 fail_scrub_workers:
4093 return ret;
4094 }
4095
4096 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4097 u64 end, struct btrfs_scrub_progress *progress,
4098 int readonly, int is_dev_replace)
4099 {
4100 struct btrfs_dev_lookup_args args = { .devid = devid };
4101 struct scrub_ctx *sctx;
4102 int ret;
4103 struct btrfs_device *dev;
4104 unsigned int nofs_flag;
4105
4106 if (btrfs_fs_closing(fs_info))
4107 return -EAGAIN;
4108
4109 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4110 /*
4111 * In this case scrub is unable to calculate the checksums
4112 * the way it is currently implemented. Do not handle this
4113 * situation at all because it won't ever happen.
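 * (At the time of writing, BTRFS_MAX_METADATA_BLOCKSIZE and
 * BTRFS_STRIPE_LEN are both 64K, so a nodesize larger than
 * BTRFS_STRIPE_LEN cannot actually occur.)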
4114 */
4115 btrfs_err(fs_info,
4116 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4117 fs_info->nodesize,
4118 BTRFS_STRIPE_LEN);
4119 return -EINVAL;
4120 }
4121
4122 if (fs_info->nodesize >
4123 SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
4124 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
4125 /*
4126 * Would exhaust the array bounds of the sectors member in
4127 * struct scrub_block.
4128 */
4129 btrfs_err(fs_info,
4130 "scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
4131 fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
4132 fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
4133 return -EINVAL;
4134 }
4135
4136 /* Allocate outside of device_list_mutex */
4137 sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4138 if (IS_ERR(sctx))
4139 return PTR_ERR(sctx);
4140
4141 ret = scrub_workers_get(fs_info, is_dev_replace);
4142 if (ret)
4143 goto out_free_ctx;
4144
4145 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4146 dev = btrfs_find_device(fs_info->fs_devices, &args);
4147 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4148 !is_dev_replace)) {
4149 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4150 ret = -ENODEV;
4151 goto out;
4152 }
4153
4154 if (!is_dev_replace && !readonly &&
4155 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4156 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4157 btrfs_err_in_rcu(fs_info,
4158 "scrub on devid %llu: filesystem on %s is not writable",
4159 devid, rcu_str_deref(dev->name));
4160 ret = -EROFS;
4161 goto out;
4162 }
4163
4164 mutex_lock(&fs_info->scrub_lock);
4165 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4166 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4167 mutex_unlock(&fs_info->scrub_lock);
4168 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4169 ret = -EIO;
4170 goto out;
4171 }
4172
4173 down_read(&fs_info->dev_replace.rwsem);
4174 if (dev->scrub_ctx ||
4175 (!is_dev_replace &&
4176 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4177 up_read(&fs_info->dev_replace.rwsem);
4178 mutex_unlock(&fs_info->scrub_lock);
4179 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4180 ret = -EINPROGRESS;
4181 goto out;
4182 }
4183 up_read(&fs_info->dev_replace.rwsem);
4184
4185 sctx->readonly = readonly;
4186 dev->scrub_ctx = sctx;
4187 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4188
4189 /*
4190 * By checking @scrub_pause_req here, we can avoid a race between
4191 * transaction commit and scrubbing.
4192 */
4193 __scrub_blocked_if_needed(fs_info);
4194 atomic_inc(&fs_info->scrubs_running);
4195 mutex_unlock(&fs_info->scrub_lock);
4196
4197 /*
4198 * In order to avoid deadlock with reclaim when there is a transaction
4199 * trying to pause scrub, make sure we use GFP_NOFS for all the
4200 * allocations done at scrub_sectors() and scrub_sectors_for_parity()
4201 * invoked by our callees. The pausing request is done when the
4202 * transaction commit starts, and it blocks the transaction until scrub
4203 * is paused (done at specific points in scrub_stripe() or right above,
4204 * before incrementing fs_info->scrubs_running).
4205 */
4206 nofs_flag = memalloc_nofs_save();
4207 if (!is_dev_replace) {
4208 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4209 /*
4210 * Hold the device list mutex so that we don't race with the
4211 * super block writes that a log tree sync can kick off.
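 * scrub_supers() reads and checks every super block copy of the device,
 * so it must not run concurrently with a super block commit (the super
 * block write path takes this same mutex).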
4212 */ 4213 mutex_lock(&fs_info->fs_devices->device_list_mutex); 4214 ret = scrub_supers(sctx, dev); 4215 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4216 } 4217 4218 if (!ret) 4219 ret = scrub_enumerate_chunks(sctx, dev, start, end); 4220 memalloc_nofs_restore(nofs_flag); 4221 4222 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 4223 atomic_dec(&fs_info->scrubs_running); 4224 wake_up(&fs_info->scrub_pause_wait); 4225 4226 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); 4227 4228 if (progress) 4229 memcpy(progress, &sctx->stat, sizeof(*progress)); 4230 4231 if (!is_dev_replace) 4232 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d", 4233 ret ? "not finished" : "finished", devid, ret); 4234 4235 mutex_lock(&fs_info->scrub_lock); 4236 dev->scrub_ctx = NULL; 4237 mutex_unlock(&fs_info->scrub_lock); 4238 4239 scrub_workers_put(fs_info); 4240 scrub_put_ctx(sctx); 4241 4242 return ret; 4243 out: 4244 scrub_workers_put(fs_info); 4245 out_free_ctx: 4246 scrub_free_ctx(sctx); 4247 4248 return ret; 4249 } 4250 4251 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) 4252 { 4253 mutex_lock(&fs_info->scrub_lock); 4254 atomic_inc(&fs_info->scrub_pause_req); 4255 while (atomic_read(&fs_info->scrubs_paused) != 4256 atomic_read(&fs_info->scrubs_running)) { 4257 mutex_unlock(&fs_info->scrub_lock); 4258 wait_event(fs_info->scrub_pause_wait, 4259 atomic_read(&fs_info->scrubs_paused) == 4260 atomic_read(&fs_info->scrubs_running)); 4261 mutex_lock(&fs_info->scrub_lock); 4262 } 4263 mutex_unlock(&fs_info->scrub_lock); 4264 } 4265 4266 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info) 4267 { 4268 atomic_dec(&fs_info->scrub_pause_req); 4269 wake_up(&fs_info->scrub_pause_wait); 4270 } 4271 4272 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 4273 { 4274 mutex_lock(&fs_info->scrub_lock); 4275 if (!atomic_read(&fs_info->scrubs_running)) { 4276 mutex_unlock(&fs_info->scrub_lock); 4277 return -ENOTCONN; 4278 } 4279 4280 atomic_inc(&fs_info->scrub_cancel_req); 4281 while (atomic_read(&fs_info->scrubs_running)) { 4282 mutex_unlock(&fs_info->scrub_lock); 4283 wait_event(fs_info->scrub_pause_wait, 4284 atomic_read(&fs_info->scrubs_running) == 0); 4285 mutex_lock(&fs_info->scrub_lock); 4286 } 4287 atomic_dec(&fs_info->scrub_cancel_req); 4288 mutex_unlock(&fs_info->scrub_lock); 4289 4290 return 0; 4291 } 4292 4293 int btrfs_scrub_cancel_dev(struct btrfs_device *dev) 4294 { 4295 struct btrfs_fs_info *fs_info = dev->fs_info; 4296 struct scrub_ctx *sctx; 4297 4298 mutex_lock(&fs_info->scrub_lock); 4299 sctx = dev->scrub_ctx; 4300 if (!sctx) { 4301 mutex_unlock(&fs_info->scrub_lock); 4302 return -ENOTCONN; 4303 } 4304 atomic_inc(&sctx->cancel_req); 4305 while (dev->scrub_ctx) { 4306 mutex_unlock(&fs_info->scrub_lock); 4307 wait_event(fs_info->scrub_pause_wait, 4308 dev->scrub_ctx == NULL); 4309 mutex_lock(&fs_info->scrub_lock); 4310 } 4311 mutex_unlock(&fs_info->scrub_lock); 4312 4313 return 0; 4314 } 4315 4316 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, 4317 struct btrfs_scrub_progress *progress) 4318 { 4319 struct btrfs_dev_lookup_args args = { .devid = devid }; 4320 struct btrfs_device *dev; 4321 struct scrub_ctx *sctx = NULL; 4322 4323 mutex_lock(&fs_info->fs_devices->device_list_mutex); 4324 dev = btrfs_find_device(fs_info->fs_devices, &args); 4325 if (dev) 4326 sctx = dev->scrub_ctx; 4327 if (sctx) 4328 memcpy(progress, &sctx->stat, sizeof(*progress)); 4329 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4330 
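/*
 * Three outcomes for the caller: the device was not found (-ENODEV), the
 * device exists but has no scrub running on it (-ENOTCONN), or the
 * progress counters were copied successfully (0).
 */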
4331 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV; 4332 } 4333 4334 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, 4335 u64 extent_logical, u32 extent_len, 4336 u64 *extent_physical, 4337 struct btrfs_device **extent_dev, 4338 int *extent_mirror_num) 4339 { 4340 u64 mapped_length; 4341 struct btrfs_io_context *bioc = NULL; 4342 int ret; 4343 4344 mapped_length = extent_len; 4345 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical, 4346 &mapped_length, &bioc, 0); 4347 if (ret || !bioc || mapped_length < extent_len || 4348 !bioc->stripes[0].dev->bdev) { 4349 btrfs_put_bioc(bioc); 4350 return; 4351 } 4352 4353 *extent_physical = bioc->stripes[0].physical; 4354 *extent_mirror_num = bioc->mirror_num; 4355 *extent_dev = bioc->stripes[0].dev; 4356 btrfs_put_bioc(bioc); 4357 } 4358
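/*
 * Illustrative sketch (not part of the scrub code itself): a caller that
 * wants to redirect a read to a known-good copy could use
 * scrub_find_good_copy() roughly like this, assuming @logical and @len
 * describe the extent to read:
 *
 *	u64 physical = 0;
 *	struct btrfs_device *dev = NULL;
 *	int mirror_num = 0;
 *
 *	scrub_find_good_copy(fs_info, logical, len, &physical, &dev,
 *			     &mirror_num);
 *	if (dev)
 *		read @len bytes at @physical from @dev (mirror @mirror_num);
 *
 * Note that the helper leaves the output parameters untouched when the
 * block mapping fails or the first stripe has no usable bdev, so callers
 * must initialize them beforehand.
 */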