1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2011, 2012 STRATO. All rights reserved. 4 */ 5 6 #include <linux/blkdev.h> 7 #include <linux/ratelimit.h> 8 #include <linux/sched/mm.h> 9 #include "ctree.h" 10 #include "volumes.h" 11 #include "disk-io.h" 12 #include "ordered-data.h" 13 #include "transaction.h" 14 #include "backref.h" 15 #include "extent_io.h" 16 #include "dev-replace.h" 17 #include "check-integrity.h" 18 #include "rcu-string.h" 19 #include "raid56.h" 20 21 /* 22 * This is only the first step towards a full-features scrub. It reads all 23 * extent and super block and verifies the checksums. In case a bad checksum 24 * is found or the extent cannot be read, good data will be written back if 25 * any can be found. 26 * 27 * Future enhancements: 28 * - In case an unrepairable extent is encountered, track which files are 29 * affected and report them 30 * - track and record media errors, throw out bad devices 31 * - add a mode to also read unallocated space 32 */ 33 34 struct scrub_block; 35 struct scrub_ctx; 36 37 /* 38 * the following three values only influence the performance. 39 * The last one configures the number of parallel and outstanding I/O 40 * operations. The first two values configure an upper limit for the number 41 * of (dynamically allocated) pages that are added to a bio. 42 */ 43 #define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */ 44 #define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */ 45 #define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */ 46 47 /* 48 * the following value times PAGE_SIZE needs to be large enough to match the 49 * largest node/leaf/sector size that shall be supported. 50 * Values larger than BTRFS_STRIPE_LEN are not supported. 
51 */ 52 #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ 53 54 struct scrub_recover { 55 refcount_t refs; 56 struct btrfs_bio *bbio; 57 u64 map_length; 58 }; 59 60 struct scrub_page { 61 struct scrub_block *sblock; 62 struct page *page; 63 struct btrfs_device *dev; 64 struct list_head list; 65 u64 flags; /* extent flags */ 66 u64 generation; 67 u64 logical; 68 u64 physical; 69 u64 physical_for_dev_replace; 70 atomic_t refs; 71 struct { 72 unsigned int mirror_num:8; 73 unsigned int have_csum:1; 74 unsigned int io_error:1; 75 }; 76 u8 csum[BTRFS_CSUM_SIZE]; 77 78 struct scrub_recover *recover; 79 }; 80 81 struct scrub_bio { 82 int index; 83 struct scrub_ctx *sctx; 84 struct btrfs_device *dev; 85 struct bio *bio; 86 blk_status_t status; 87 u64 logical; 88 u64 physical; 89 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO 90 struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO]; 91 #else 92 struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO]; 93 #endif 94 int page_count; 95 int next_free; 96 struct btrfs_work work; 97 }; 98 99 struct scrub_block { 100 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 101 int page_count; 102 atomic_t outstanding_pages; 103 refcount_t refs; /* free mem on transition to zero */ 104 struct scrub_ctx *sctx; 105 struct scrub_parity *sparity; 106 struct { 107 unsigned int header_error:1; 108 unsigned int checksum_error:1; 109 unsigned int no_io_error_seen:1; 110 unsigned int generation_error:1; /* also sets header_error */ 111 112 /* The following is for the data used to check parity */ 113 /* It is for the data with checksum */ 114 unsigned int data_corrected:1; 115 }; 116 struct btrfs_work work; 117 }; 118 119 /* Used for the chunks with parity stripe such RAID5/6 */ 120 struct scrub_parity { 121 struct scrub_ctx *sctx; 122 123 struct btrfs_device *scrub_dev; 124 125 u64 logic_start; 126 127 u64 logic_end; 128 129 int nsectors; 130 131 u64 stripe_len; 132 133 refcount_t refs; 134 135 struct list_head spages; 136 137 /* 
Work of parity check and repair */ 138 struct btrfs_work work; 139 140 /* Mark the parity blocks which have data */ 141 unsigned long *dbitmap; 142 143 /* 144 * Mark the parity blocks which have data, but errors happen when 145 * read data or check data 146 */ 147 unsigned long *ebitmap; 148 149 unsigned long bitmap[0]; 150 }; 151 152 struct scrub_ctx { 153 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; 154 struct btrfs_fs_info *fs_info; 155 int first_free; 156 int curr; 157 atomic_t bios_in_flight; 158 atomic_t workers_pending; 159 spinlock_t list_lock; 160 wait_queue_head_t list_wait; 161 u16 csum_size; 162 struct list_head csum_list; 163 atomic_t cancel_req; 164 int readonly; 165 int pages_per_rd_bio; 166 167 int is_dev_replace; 168 169 struct scrub_bio *wr_curr_bio; 170 struct mutex wr_lock; 171 int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ 172 struct btrfs_device *wr_tgtdev; 173 bool flush_all_writes; 174 175 /* 176 * statistics 177 */ 178 struct btrfs_scrub_progress stat; 179 spinlock_t stat_lock; 180 181 /* 182 * Use a ref counter to avoid use-after-free issues. Scrub workers 183 * decrement bios_in_flight and workers_pending and then do a wakeup 184 * on the list_wait wait queue. We must ensure the main scrub task 185 * doesn't free the scrub context before or while the workers are 186 * doing the wakeup() call. 
187 */ 188 refcount_t refs; 189 }; 190 191 struct scrub_fixup_nodatasum { 192 struct scrub_ctx *sctx; 193 struct btrfs_device *dev; 194 u64 logical; 195 struct btrfs_root *root; 196 struct btrfs_work work; 197 int mirror_num; 198 }; 199 200 struct scrub_nocow_inode { 201 u64 inum; 202 u64 offset; 203 u64 root; 204 struct list_head list; 205 }; 206 207 struct scrub_copy_nocow_ctx { 208 struct scrub_ctx *sctx; 209 u64 logical; 210 u64 len; 211 int mirror_num; 212 u64 physical_for_dev_replace; 213 struct list_head inodes; 214 struct btrfs_work work; 215 }; 216 217 struct scrub_warning { 218 struct btrfs_path *path; 219 u64 extent_item_size; 220 const char *errstr; 221 u64 physical; 222 u64 logical; 223 struct btrfs_device *dev; 224 }; 225 226 struct full_stripe_lock { 227 struct rb_node node; 228 u64 logical; 229 u64 refs; 230 struct mutex mutex; 231 }; 232 233 static void scrub_pending_bio_inc(struct scrub_ctx *sctx); 234 static void scrub_pending_bio_dec(struct scrub_ctx *sctx); 235 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); 236 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); 237 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); 238 static int scrub_setup_recheck_block(struct scrub_block *original_sblock, 239 struct scrub_block *sblocks_for_recheck); 240 static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 241 struct scrub_block *sblock, 242 int retry_failed_mirror); 243 static void scrub_recheck_block_checksum(struct scrub_block *sblock); 244 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 245 struct scrub_block *sblock_good); 246 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 247 struct scrub_block *sblock_good, 248 int page_num, int force_write); 249 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); 250 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, 251 int page_num); 252 
static int scrub_checksum_data(struct scrub_block *sblock); 253 static int scrub_checksum_tree_block(struct scrub_block *sblock); 254 static int scrub_checksum_super(struct scrub_block *sblock); 255 static void scrub_block_get(struct scrub_block *sblock); 256 static void scrub_block_put(struct scrub_block *sblock); 257 static void scrub_page_get(struct scrub_page *spage); 258 static void scrub_page_put(struct scrub_page *spage); 259 static void scrub_parity_get(struct scrub_parity *sparity); 260 static void scrub_parity_put(struct scrub_parity *sparity); 261 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, 262 struct scrub_page *spage); 263 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 264 u64 physical, struct btrfs_device *dev, u64 flags, 265 u64 gen, int mirror_num, u8 *csum, int force, 266 u64 physical_for_dev_replace); 267 static void scrub_bio_end_io(struct bio *bio); 268 static void scrub_bio_end_io_worker(struct btrfs_work *work); 269 static void scrub_block_complete(struct scrub_block *sblock); 270 static void scrub_remap_extent(struct btrfs_fs_info *fs_info, 271 u64 extent_logical, u64 extent_len, 272 u64 *extent_physical, 273 struct btrfs_device **extent_dev, 274 int *extent_mirror_num); 275 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, 276 struct scrub_page *spage); 277 static void scrub_wr_submit(struct scrub_ctx *sctx); 278 static void scrub_wr_bio_end_io(struct bio *bio); 279 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); 280 static int write_page_nocow(struct scrub_ctx *sctx, 281 u64 physical_for_dev_replace, struct page *page); 282 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, 283 struct scrub_copy_nocow_ctx *ctx); 284 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 285 int mirror_num, u64 physical_for_dev_replace); 286 static void copy_nocow_pages_worker(struct btrfs_work *work); 287 static void __scrub_blocked_if_needed(struct 
btrfs_fs_info *fs_info); 288 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); 289 static void scrub_put_ctx(struct scrub_ctx *sctx); 290 291 static inline int scrub_is_page_on_raid56(struct scrub_page *page) 292 { 293 return page->recover && 294 (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); 295 } 296 297 static void scrub_pending_bio_inc(struct scrub_ctx *sctx) 298 { 299 refcount_inc(&sctx->refs); 300 atomic_inc(&sctx->bios_in_flight); 301 } 302 303 static void scrub_pending_bio_dec(struct scrub_ctx *sctx) 304 { 305 atomic_dec(&sctx->bios_in_flight); 306 wake_up(&sctx->list_wait); 307 scrub_put_ctx(sctx); 308 } 309 310 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) 311 { 312 while (atomic_read(&fs_info->scrub_pause_req)) { 313 mutex_unlock(&fs_info->scrub_lock); 314 wait_event(fs_info->scrub_pause_wait, 315 atomic_read(&fs_info->scrub_pause_req) == 0); 316 mutex_lock(&fs_info->scrub_lock); 317 } 318 } 319 320 static void scrub_pause_on(struct btrfs_fs_info *fs_info) 321 { 322 atomic_inc(&fs_info->scrubs_paused); 323 wake_up(&fs_info->scrub_pause_wait); 324 } 325 326 static void scrub_pause_off(struct btrfs_fs_info *fs_info) 327 { 328 mutex_lock(&fs_info->scrub_lock); 329 __scrub_blocked_if_needed(fs_info); 330 atomic_dec(&fs_info->scrubs_paused); 331 mutex_unlock(&fs_info->scrub_lock); 332 333 wake_up(&fs_info->scrub_pause_wait); 334 } 335 336 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) 337 { 338 scrub_pause_on(fs_info); 339 scrub_pause_off(fs_info); 340 } 341 342 /* 343 * Insert new full stripe lock into full stripe locks tree 344 * 345 * Return pointer to existing or newly inserted full_stripe_lock structure if 346 * everything works well. 
347 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory 348 * 349 * NOTE: caller must hold full_stripe_locks_root->lock before calling this 350 * function 351 */ 352 static struct full_stripe_lock *insert_full_stripe_lock( 353 struct btrfs_full_stripe_locks_tree *locks_root, 354 u64 fstripe_logical) 355 { 356 struct rb_node **p; 357 struct rb_node *parent = NULL; 358 struct full_stripe_lock *entry; 359 struct full_stripe_lock *ret; 360 361 lockdep_assert_held(&locks_root->lock); 362 363 p = &locks_root->root.rb_node; 364 while (*p) { 365 parent = *p; 366 entry = rb_entry(parent, struct full_stripe_lock, node); 367 if (fstripe_logical < entry->logical) { 368 p = &(*p)->rb_left; 369 } else if (fstripe_logical > entry->logical) { 370 p = &(*p)->rb_right; 371 } else { 372 entry->refs++; 373 return entry; 374 } 375 } 376 377 /* Insert new lock */ 378 ret = kmalloc(sizeof(*ret), GFP_KERNEL); 379 if (!ret) 380 return ERR_PTR(-ENOMEM); 381 ret->logical = fstripe_logical; 382 ret->refs = 1; 383 mutex_init(&ret->mutex); 384 385 rb_link_node(&ret->node, parent, p); 386 rb_insert_color(&ret->node, &locks_root->root); 387 return ret; 388 } 389 390 /* 391 * Search for a full stripe lock of a block group 392 * 393 * Return pointer to existing full stripe lock if found 394 * Return NULL if not found 395 */ 396 static struct full_stripe_lock *search_full_stripe_lock( 397 struct btrfs_full_stripe_locks_tree *locks_root, 398 u64 fstripe_logical) 399 { 400 struct rb_node *node; 401 struct full_stripe_lock *entry; 402 403 lockdep_assert_held(&locks_root->lock); 404 405 node = locks_root->root.rb_node; 406 while (node) { 407 entry = rb_entry(node, struct full_stripe_lock, node); 408 if (fstripe_logical < entry->logical) 409 node = node->rb_left; 410 else if (fstripe_logical > entry->logical) 411 node = node->rb_right; 412 else 413 return entry; 414 } 415 return NULL; 416 } 417 418 /* 419 * Helper to get full stripe logical from a normal bytenr. 
420 * 421 * Caller must ensure @cache is a RAID56 block group. 422 */ 423 static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache, 424 u64 bytenr) 425 { 426 u64 ret; 427 428 /* 429 * Due to chunk item size limit, full stripe length should not be 430 * larger than U32_MAX. Just a sanity check here. 431 */ 432 WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX); 433 434 /* 435 * round_down() can only handle power of 2, while RAID56 full 436 * stripe length can be 64KiB * n, so we need to manually round down. 437 */ 438 ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) * 439 cache->full_stripe_len + cache->key.objectid; 440 return ret; 441 } 442 443 /* 444 * Lock a full stripe to avoid concurrency of recovery and read 445 * 446 * It's only used for profiles with parities (RAID5/6), for other profiles it 447 * does nothing. 448 * 449 * Return 0 if we locked full stripe covering @bytenr, with a mutex held. 450 * So caller must call unlock_full_stripe() at the same context. 451 * 452 * Return <0 if encounters error. 
453 */ 454 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, 455 bool *locked_ret) 456 { 457 struct btrfs_block_group_cache *bg_cache; 458 struct btrfs_full_stripe_locks_tree *locks_root; 459 struct full_stripe_lock *existing; 460 u64 fstripe_start; 461 int ret = 0; 462 463 *locked_ret = false; 464 bg_cache = btrfs_lookup_block_group(fs_info, bytenr); 465 if (!bg_cache) { 466 ASSERT(0); 467 return -ENOENT; 468 } 469 470 /* Profiles not based on parity don't need full stripe lock */ 471 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) 472 goto out; 473 locks_root = &bg_cache->full_stripe_locks_root; 474 475 fstripe_start = get_full_stripe_logical(bg_cache, bytenr); 476 477 /* Now insert the full stripe lock */ 478 mutex_lock(&locks_root->lock); 479 existing = insert_full_stripe_lock(locks_root, fstripe_start); 480 mutex_unlock(&locks_root->lock); 481 if (IS_ERR(existing)) { 482 ret = PTR_ERR(existing); 483 goto out; 484 } 485 mutex_lock(&existing->mutex); 486 *locked_ret = true; 487 out: 488 btrfs_put_block_group(bg_cache); 489 return ret; 490 } 491 492 /* 493 * Unlock a full stripe. 494 * 495 * NOTE: Caller must ensure it's the same context calling corresponding 496 * lock_full_stripe(). 497 * 498 * Return 0 if we unlock full stripe without problem. 
499 * Return <0 for error 500 */ 501 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, 502 bool locked) 503 { 504 struct btrfs_block_group_cache *bg_cache; 505 struct btrfs_full_stripe_locks_tree *locks_root; 506 struct full_stripe_lock *fstripe_lock; 507 u64 fstripe_start; 508 bool freeit = false; 509 int ret = 0; 510 511 /* If we didn't acquire full stripe lock, no need to continue */ 512 if (!locked) 513 return 0; 514 515 bg_cache = btrfs_lookup_block_group(fs_info, bytenr); 516 if (!bg_cache) { 517 ASSERT(0); 518 return -ENOENT; 519 } 520 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) 521 goto out; 522 523 locks_root = &bg_cache->full_stripe_locks_root; 524 fstripe_start = get_full_stripe_logical(bg_cache, bytenr); 525 526 mutex_lock(&locks_root->lock); 527 fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start); 528 /* Unpaired unlock_full_stripe() detected */ 529 if (!fstripe_lock) { 530 WARN_ON(1); 531 ret = -ENOENT; 532 mutex_unlock(&locks_root->lock); 533 goto out; 534 } 535 536 if (fstripe_lock->refs == 0) { 537 WARN_ON(1); 538 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow", 539 fstripe_lock->logical); 540 } else { 541 fstripe_lock->refs--; 542 } 543 544 if (fstripe_lock->refs == 0) { 545 rb_erase(&fstripe_lock->node, &locks_root->root); 546 freeit = true; 547 } 548 mutex_unlock(&locks_root->lock); 549 550 mutex_unlock(&fstripe_lock->mutex); 551 if (freeit) 552 kfree(fstripe_lock); 553 out: 554 btrfs_put_block_group(bg_cache); 555 return ret; 556 } 557 558 /* 559 * used for workers that require transaction commits (i.e., for the 560 * NOCOW case) 561 */ 562 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) 563 { 564 struct btrfs_fs_info *fs_info = sctx->fs_info; 565 566 refcount_inc(&sctx->refs); 567 /* 568 * increment scrubs_running to prevent cancel requests from 569 * completing as long as a worker is running. 
we must also 570 * increment scrubs_paused to prevent deadlocking on pause 571 * requests used for transactions commits (as the worker uses a 572 * transaction context). it is safe to regard the worker 573 * as paused for all matters practical. effectively, we only 574 * avoid cancellation requests from completing. 575 */ 576 mutex_lock(&fs_info->scrub_lock); 577 atomic_inc(&fs_info->scrubs_running); 578 atomic_inc(&fs_info->scrubs_paused); 579 mutex_unlock(&fs_info->scrub_lock); 580 581 /* 582 * check if @scrubs_running=@scrubs_paused condition 583 * inside wait_event() is not an atomic operation. 584 * which means we may inc/dec @scrub_running/paused 585 * at any time. Let's wake up @scrub_pause_wait as 586 * much as we can to let commit transaction blocked less. 587 */ 588 wake_up(&fs_info->scrub_pause_wait); 589 590 atomic_inc(&sctx->workers_pending); 591 } 592 593 /* used for workers that require transaction commits */ 594 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) 595 { 596 struct btrfs_fs_info *fs_info = sctx->fs_info; 597 598 /* 599 * see scrub_pending_trans_workers_inc() why we're pretending 600 * to be paused in the scrub counters 601 */ 602 mutex_lock(&fs_info->scrub_lock); 603 atomic_dec(&fs_info->scrubs_running); 604 atomic_dec(&fs_info->scrubs_paused); 605 mutex_unlock(&fs_info->scrub_lock); 606 atomic_dec(&sctx->workers_pending); 607 wake_up(&fs_info->scrub_pause_wait); 608 wake_up(&sctx->list_wait); 609 scrub_put_ctx(sctx); 610 } 611 612 static void scrub_free_csums(struct scrub_ctx *sctx) 613 { 614 while (!list_empty(&sctx->csum_list)) { 615 struct btrfs_ordered_sum *sum; 616 sum = list_first_entry(&sctx->csum_list, 617 struct btrfs_ordered_sum, list); 618 list_del(&sum->list); 619 kfree(sum); 620 } 621 } 622 623 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) 624 { 625 int i; 626 627 if (!sctx) 628 return; 629 630 /* this can happen when scrub is cancelled */ 631 if (sctx->curr != -1) { 632 struct 
scrub_bio *sbio = sctx->bios[sctx->curr]; 633 634 for (i = 0; i < sbio->page_count; i++) { 635 WARN_ON(!sbio->pagev[i]->page); 636 scrub_block_put(sbio->pagev[i]->sblock); 637 } 638 bio_put(sbio->bio); 639 } 640 641 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { 642 struct scrub_bio *sbio = sctx->bios[i]; 643 644 if (!sbio) 645 break; 646 kfree(sbio); 647 } 648 649 kfree(sctx->wr_curr_bio); 650 scrub_free_csums(sctx); 651 kfree(sctx); 652 } 653 654 static void scrub_put_ctx(struct scrub_ctx *sctx) 655 { 656 if (refcount_dec_and_test(&sctx->refs)) 657 scrub_free_ctx(sctx); 658 } 659 660 static noinline_for_stack 661 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) 662 { 663 struct scrub_ctx *sctx; 664 int i; 665 struct btrfs_fs_info *fs_info = dev->fs_info; 666 667 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); 668 if (!sctx) 669 goto nomem; 670 refcount_set(&sctx->refs, 1); 671 sctx->is_dev_replace = is_dev_replace; 672 sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; 673 sctx->curr = -1; 674 sctx->fs_info = dev->fs_info; 675 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { 676 struct scrub_bio *sbio; 677 678 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL); 679 if (!sbio) 680 goto nomem; 681 sctx->bios[i] = sbio; 682 683 sbio->index = i; 684 sbio->sctx = sctx; 685 sbio->page_count = 0; 686 btrfs_init_work(&sbio->work, btrfs_scrub_helper, 687 scrub_bio_end_io_worker, NULL, NULL); 688 689 if (i != SCRUB_BIOS_PER_SCTX - 1) 690 sctx->bios[i]->next_free = i + 1; 691 else 692 sctx->bios[i]->next_free = -1; 693 } 694 sctx->first_free = 0; 695 atomic_set(&sctx->bios_in_flight, 0); 696 atomic_set(&sctx->workers_pending, 0); 697 atomic_set(&sctx->cancel_req, 0); 698 sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy); 699 INIT_LIST_HEAD(&sctx->csum_list); 700 701 spin_lock_init(&sctx->list_lock); 702 spin_lock_init(&sctx->stat_lock); 703 init_waitqueue_head(&sctx->list_wait); 704 705 WARN_ON(sctx->wr_curr_bio != NULL); 706 mutex_init(&sctx->wr_lock); 
707 sctx->wr_curr_bio = NULL; 708 if (is_dev_replace) { 709 WARN_ON(!fs_info->dev_replace.tgtdev); 710 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; 711 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; 712 sctx->flush_all_writes = false; 713 } 714 715 return sctx; 716 717 nomem: 718 scrub_free_ctx(sctx); 719 return ERR_PTR(-ENOMEM); 720 } 721 722 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, 723 void *warn_ctx) 724 { 725 u64 isize; 726 u32 nlink; 727 int ret; 728 int i; 729 unsigned nofs_flag; 730 struct extent_buffer *eb; 731 struct btrfs_inode_item *inode_item; 732 struct scrub_warning *swarn = warn_ctx; 733 struct btrfs_fs_info *fs_info = swarn->dev->fs_info; 734 struct inode_fs_paths *ipath = NULL; 735 struct btrfs_root *local_root; 736 struct btrfs_key root_key; 737 struct btrfs_key key; 738 739 root_key.objectid = root; 740 root_key.type = BTRFS_ROOT_ITEM_KEY; 741 root_key.offset = (u64)-1; 742 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); 743 if (IS_ERR(local_root)) { 744 ret = PTR_ERR(local_root); 745 goto err; 746 } 747 748 /* 749 * this makes the path point to (inum INODE_ITEM ioff) 750 */ 751 key.objectid = inum; 752 key.type = BTRFS_INODE_ITEM_KEY; 753 key.offset = 0; 754 755 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); 756 if (ret) { 757 btrfs_release_path(swarn->path); 758 goto err; 759 } 760 761 eb = swarn->path->nodes[0]; 762 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], 763 struct btrfs_inode_item); 764 isize = btrfs_inode_size(eb, inode_item); 765 nlink = btrfs_inode_nlink(eb, inode_item); 766 btrfs_release_path(swarn->path); 767 768 /* 769 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub 770 * uses GFP_NOFS in this context, so we keep it consistent but it does 771 * not seem to be strictly necessary. 
772 */ 773 nofs_flag = memalloc_nofs_save(); 774 ipath = init_ipath(4096, local_root, swarn->path); 775 memalloc_nofs_restore(nofs_flag); 776 if (IS_ERR(ipath)) { 777 ret = PTR_ERR(ipath); 778 ipath = NULL; 779 goto err; 780 } 781 ret = paths_from_inode(inum, ipath); 782 783 if (ret < 0) 784 goto err; 785 786 /* 787 * we deliberately ignore the bit ipath might have been too small to 788 * hold all of the paths here 789 */ 790 for (i = 0; i < ipath->fspath->elem_cnt; ++i) 791 btrfs_warn_in_rcu(fs_info, 792 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", 793 swarn->errstr, swarn->logical, 794 rcu_str_deref(swarn->dev->name), 795 swarn->physical, 796 root, inum, offset, 797 min(isize - offset, (u64)PAGE_SIZE), nlink, 798 (char *)(unsigned long)ipath->fspath->val[i]); 799 800 free_ipath(ipath); 801 return 0; 802 803 err: 804 btrfs_warn_in_rcu(fs_info, 805 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", 806 swarn->errstr, swarn->logical, 807 rcu_str_deref(swarn->dev->name), 808 swarn->physical, 809 root, inum, offset, ret); 810 811 free_ipath(ipath); 812 return 0; 813 } 814 815 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) 816 { 817 struct btrfs_device *dev; 818 struct btrfs_fs_info *fs_info; 819 struct btrfs_path *path; 820 struct btrfs_key found_key; 821 struct extent_buffer *eb; 822 struct btrfs_extent_item *ei; 823 struct scrub_warning swarn; 824 unsigned long ptr = 0; 825 u64 extent_item_pos; 826 u64 flags = 0; 827 u64 ref_root; 828 u32 item_size; 829 u8 ref_level = 0; 830 int ret; 831 832 WARN_ON(sblock->page_count < 1); 833 dev = sblock->pagev[0]->dev; 834 fs_info = sblock->sctx->fs_info; 835 836 path = btrfs_alloc_path(); 837 if (!path) 838 return; 839 840 swarn.physical = sblock->pagev[0]->physical; 841 swarn.logical = sblock->pagev[0]->logical; 842 swarn.errstr = errstr; 843 swarn.dev = 
NULL; 844 845 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, 846 &flags); 847 if (ret < 0) 848 goto out; 849 850 extent_item_pos = swarn.logical - found_key.objectid; 851 swarn.extent_item_size = found_key.offset; 852 853 eb = path->nodes[0]; 854 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 855 item_size = btrfs_item_size_nr(eb, path->slots[0]); 856 857 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 858 do { 859 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, 860 item_size, &ref_root, 861 &ref_level); 862 btrfs_warn_in_rcu(fs_info, 863 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", 864 errstr, swarn.logical, 865 rcu_str_deref(dev->name), 866 swarn.physical, 867 ref_level ? "node" : "leaf", 868 ret < 0 ? -1 : ref_level, 869 ret < 0 ? -1 : ref_root); 870 } while (ret != 1); 871 btrfs_release_path(path); 872 } else { 873 btrfs_release_path(path); 874 swarn.path = path; 875 swarn.dev = dev; 876 iterate_extent_inodes(fs_info, found_key.objectid, 877 extent_item_pos, 1, 878 scrub_print_warning_inode, &swarn, false); 879 } 880 881 out: 882 btrfs_free_path(path); 883 } 884 885 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) 886 { 887 struct page *page = NULL; 888 unsigned long index; 889 struct scrub_fixup_nodatasum *fixup = fixup_ctx; 890 int ret; 891 int corrected = 0; 892 struct btrfs_key key; 893 struct inode *inode = NULL; 894 struct btrfs_fs_info *fs_info; 895 u64 end = offset + PAGE_SIZE - 1; 896 struct btrfs_root *local_root; 897 int srcu_index; 898 899 key.objectid = root; 900 key.type = BTRFS_ROOT_ITEM_KEY; 901 key.offset = (u64)-1; 902 903 fs_info = fixup->root->fs_info; 904 srcu_index = srcu_read_lock(&fs_info->subvol_srcu); 905 906 local_root = btrfs_read_fs_root_no_name(fs_info, &key); 907 if (IS_ERR(local_root)) { 908 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); 909 return PTR_ERR(local_root); 910 } 911 912 key.type = 
BTRFS_INODE_ITEM_KEY; 913 key.objectid = inum; 914 key.offset = 0; 915 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); 916 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); 917 if (IS_ERR(inode)) 918 return PTR_ERR(inode); 919 920 index = offset >> PAGE_SHIFT; 921 922 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 923 if (!page) { 924 ret = -ENOMEM; 925 goto out; 926 } 927 928 if (PageUptodate(page)) { 929 if (PageDirty(page)) { 930 /* 931 * we need to write the data to the defect sector. the 932 * data that was in that sector is not in memory, 933 * because the page was modified. we must not write the 934 * modified page to that sector. 935 * 936 * TODO: what could be done here: wait for the delalloc 937 * runner to write out that page (might involve 938 * COW) and see whether the sector is still 939 * referenced afterwards. 940 * 941 * For the meantime, we'll treat this error 942 * incorrectable, although there is a chance that a 943 * later scrub will find the bad sector again and that 944 * there's no dirty page in memory, then. 945 */ 946 ret = -EIO; 947 goto out; 948 } 949 ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE, 950 fixup->logical, page, 951 offset - page_offset(page), 952 fixup->mirror_num); 953 unlock_page(page); 954 corrected = !ret; 955 } else { 956 /* 957 * we need to get good data first. the general readpage path 958 * will call repair_io_failure for us, we just have to make 959 * sure we read the bad mirror. 
960 */ 961 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, 962 EXTENT_DAMAGED); 963 if (ret) { 964 /* set_extent_bits should give proper error */ 965 WARN_ON(ret > 0); 966 if (ret > 0) 967 ret = -EFAULT; 968 goto out; 969 } 970 971 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, 972 btrfs_get_extent, 973 fixup->mirror_num); 974 wait_on_page_locked(page); 975 976 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, 977 end, EXTENT_DAMAGED, 0, NULL); 978 if (!corrected) 979 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, 980 EXTENT_DAMAGED); 981 } 982 983 out: 984 if (page) 985 put_page(page); 986 987 iput(inode); 988 989 if (ret < 0) 990 return ret; 991 992 if (ret == 0 && corrected) { 993 /* 994 * we only need to call readpage for one of the inodes belonging 995 * to this extent. so make iterate_extent_inodes stop 996 */ 997 return 1; 998 } 999 1000 return -EIO; 1001 } 1002 1003 static void scrub_fixup_nodatasum(struct btrfs_work *work) 1004 { 1005 struct btrfs_fs_info *fs_info; 1006 int ret; 1007 struct scrub_fixup_nodatasum *fixup; 1008 struct scrub_ctx *sctx; 1009 struct btrfs_trans_handle *trans = NULL; 1010 struct btrfs_path *path; 1011 int uncorrectable = 0; 1012 1013 fixup = container_of(work, struct scrub_fixup_nodatasum, work); 1014 sctx = fixup->sctx; 1015 fs_info = fixup->root->fs_info; 1016 1017 path = btrfs_alloc_path(); 1018 if (!path) { 1019 spin_lock(&sctx->stat_lock); 1020 ++sctx->stat.malloc_errors; 1021 spin_unlock(&sctx->stat_lock); 1022 uncorrectable = 1; 1023 goto out; 1024 } 1025 1026 trans = btrfs_join_transaction(fixup->root); 1027 if (IS_ERR(trans)) { 1028 uncorrectable = 1; 1029 goto out; 1030 } 1031 1032 /* 1033 * the idea is to trigger a regular read through the standard path. we 1034 * read a page from the (failed) logical address by specifying the 1035 * corresponding copynum of the failed sector. thus, that readpage is 1036 * expected to fail. 
1037 * that is the point where on-the-fly error correction will kick in 1038 * (once it's finished) and rewrite the failed sector if a good copy 1039 * can be found. 1040 */ 1041 ret = iterate_inodes_from_logical(fixup->logical, fs_info, path, 1042 scrub_fixup_readpage, fixup, false); 1043 if (ret < 0) { 1044 uncorrectable = 1; 1045 goto out; 1046 } 1047 WARN_ON(ret != 1); 1048 1049 spin_lock(&sctx->stat_lock); 1050 ++sctx->stat.corrected_errors; 1051 spin_unlock(&sctx->stat_lock); 1052 1053 out: 1054 if (trans && !IS_ERR(trans)) 1055 btrfs_end_transaction(trans); 1056 if (uncorrectable) { 1057 spin_lock(&sctx->stat_lock); 1058 ++sctx->stat.uncorrectable_errors; 1059 spin_unlock(&sctx->stat_lock); 1060 btrfs_dev_replace_stats_inc( 1061 &fs_info->dev_replace.num_uncorrectable_read_errors); 1062 btrfs_err_rl_in_rcu(fs_info, 1063 "unable to fixup (nodatasum) error at logical %llu on dev %s", 1064 fixup->logical, rcu_str_deref(fixup->dev->name)); 1065 } 1066 1067 btrfs_free_path(path); 1068 kfree(fixup); 1069 1070 scrub_pending_trans_workers_dec(sctx); 1071 } 1072 1073 static inline void scrub_get_recover(struct scrub_recover *recover) 1074 { 1075 refcount_inc(&recover->refs); 1076 } 1077 1078 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, 1079 struct scrub_recover *recover) 1080 { 1081 if (refcount_dec_and_test(&recover->refs)) { 1082 btrfs_bio_counter_dec(fs_info); 1083 btrfs_put_bbio(recover->bbio); 1084 kfree(recover); 1085 } 1086 } 1087 1088 /* 1089 * scrub_handle_errored_block gets called when either verification of the 1090 * pages failed or the bio failed to read, e.g. with EIO. In the latter 1091 * case, this function handles all pages in the bio, even though only one 1092 * may be bad. 1093 * The goal of this function is to repair the errored block by using the 1094 * contents of one of the mirrors. 
/*
 * scrub_handle_errored_block gets called when either verification of the
 * pages failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all pages in the bio, even though only one
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the mirrors.
 *
 * @sblock_to_check: the block that failed; must contain at least one page.
 *
 * Return: 0 after a super block error was merely counted, a negative errno
 * if lock_full_stripe() or unlock_full_stripe() failed, otherwise 0. Whether
 * the repair itself succeeded is reported through the sctx->stat counters
 * and sblock_to_check->data_corrected, not through the return value.
 */
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
	struct scrub_ctx *sctx = sblock_to_check->sctx;
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	u64 logical;
	unsigned int failed_mirror_index;
	unsigned int is_metadata;
	unsigned int have_csum;
	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
	struct scrub_block *sblock_bad;
	int ret;
	int mirror_index;
	int page_num;
	int success;
	bool full_stripe_locked;
	/* one rate limit state shared by all warnings printed from here */
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	BUG_ON(sblock_to_check->page_count < 1);
	fs_info = sctx->fs_info;
	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sctx->stat_lock);
		++sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);
		return 0;
	}
	/* all per-block properties are taken from the first page */
	logical = sblock_to_check->pagev[0]->logical;
	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
	/* mirror_num is 1-based, the recheck array is 0-based */
	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
	is_metadata = !(sblock_to_check->pagev[0]->flags &
			BTRFS_EXTENT_FLAG_DATA);
	have_csum = sblock_to_check->pagev[0]->have_csum;
	dev = sblock_to_check->pagev[0]->dev;

	/*
	 * For RAID5/6, a race can happen with a different device scrub thread.
	 * For data corruption, Parity and Data threads will both try
	 * to recover the data.
	 * The race can lead to a doubly counted csum error, or even to an
	 * unrecoverable error.
	 */
	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
	if (ret < 0) {
		spin_lock(&sctx->stat_lock);
		if (ret == -ENOMEM)
			sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		return ret;
	}

	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
		/*
		 * Jumps into the middle of the nodatasum block further down.
		 * No recheck array exists on this path, so it is set to NULL
		 * for the cleanup at the out label.
		 */
		sblocks_for_recheck = NULL;
		goto nodatasum_case;
	}

	/*
	 * read all mirrors one after the other. This includes
	 * re-reading the extent or metadata block that failed (that was
	 * the cause that this fixup code is called) another time,
	 * page by page this time in order to know which pages
	 * caused I/O errors and which ones are good (for all mirrors).
	 * It is the goal to handle the situation when more than one
	 * mirror contains I/O errors, but the errors do not
	 * overlap, i.e. the data can be repaired by selecting the
	 * pages from those mirrors without I/O error on the
	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
	 * would be that mirror #1 has an I/O error on the first page,
	 * the second page is good, and mirror #2 has an I/O error on
	 * the second page, but the first page is good.
	 * Then the first page of the first mirror can be repaired by
	 * taking the first page of the second mirror, and the
	 * second page of the second mirror can be repaired by
	 * copying the contents of the 2nd page of the 1st mirror.
	 * One more note: if the pages of one mirror contain I/O
	 * errors, the checksum cannot be verified. In order to get
	 * the best data for repairing, the first attempt is to find
	 * a mirror without I/O errors and with a validated checksum.
	 * Only if this is not possible, the pages are picked from
	 * mirrors with I/O errors without considering the checksum.
	 * If the latter is the case, at the end, the checksum of the
	 * repaired area is verified in order to correctly maintain
	 * the statistics.
	 */

	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
				      sizeof(*sblocks_for_recheck), GFP_NOFS);
	if (!sblocks_for_recheck) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}

	/* setup the context, map the logical blocks and alloc the pages */
	ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
	if (ret) {
		/*
		 * Pages that were set up before the failure are released by
		 * the loop at the out label.
		 */
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}
	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
	sblock_bad = sblocks_for_recheck + failed_mirror_index;

	/* build and submit the bios for the failed mirror, check checksums */
	scrub_recheck_block(fs_info, sblock_bad, 1);

	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
	    sblock_bad->no_io_error_seen) {
		/*
		 * the error disappeared after reading page by page, or
		 * the area was part of a huge bio and other parts of the
		 * bio caused I/O errors, or the block layer merged several
		 * read requests into one and the error is caused by a
		 * different bio (usually one of the two latter cases is
		 * the cause)
		 */
		spin_lock(&sctx->stat_lock);
		sctx->stat.unverified_errors++;
		sblock_to_check->data_corrected = 1;
		spin_unlock(&sctx->stat_lock);

		if (sctx->is_dev_replace)
			scrub_write_block_to_dev_replace(sblock_bad);
		goto out;
	}

	/*
	 * Account the error exactly once. An I/O error takes precedence over
	 * a checksum error, which takes precedence over a header error.
	 */
	if (!sblock_bad->no_io_error_seen) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("i/o error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	} else if (sblock_bad->checksum_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.csum_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev,
					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
	} else if (sblock_bad->header_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.verify_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum/header error",
					    sblock_to_check);
		if (sblock_bad->generation_error)
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_GENERATION_ERRS);
		else
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_CORRUPTION_ERRS);
	}

	/* a read-only scrub only reports, it never writes a repair */
	if (sctx->readonly) {
		ASSERT(!sctx->is_dev_replace);
		goto out;
	}

	if (!is_metadata && !have_csum) {
		struct scrub_fixup_nodatasum *fixup_nodatasum;

		WARN_ON(sctx->is_dev_replace);

nodatasum_case:

		/*
		 * !is_metadata and !have_csum, this means that the data
		 * might not be COWed, that it might be modified
		 * concurrently. The general strategy to work on the
		 * commit root does not help in the case when COW is not
		 * used.
		 */
		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
		if (!fixup_nodatasum)
			goto did_not_correct_error;
		fixup_nodatasum->sctx = sctx;
		fixup_nodatasum->dev = dev;
		fixup_nodatasum->logical = logical;
		fixup_nodatasum->root = fs_info->extent_root;
		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
		/* the queued worker frees the item and drops this count */
		scrub_pending_trans_workers_inc(sctx);
		btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
				scrub_fixup_nodatasum, NULL, NULL);
		btrfs_queue_work(fs_info->scrub_workers,
				 &fixup_nodatasum->work);
		goto out;
	}

	/*
	 * now build and submit the bios for the other mirrors, check
	 * checksums.
	 * First try to pick the mirror which is completely without I/O
	 * errors and also does not have a checksum error.
	 * If one is found, and if a checksum is present, the full block
	 * that is known to contain an error is rewritten. Afterwards
	 * the block is known to be corrected.
	 * If a mirror is found which is completely correct, and no
	 * checksum is present, only those pages are rewritten that had
	 * an I/O error in the block to be repaired, since it cannot be
	 * determined, which copy of the other pages is better (and it
	 * could happen otherwise that a correct page would be
	 * overwritten by a bad one).
	 */
	for (mirror_index = 0; ;mirror_index++) {
		struct scrub_block *sblock_other;

		if (mirror_index == failed_mirror_index)
			continue;

		/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
		if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
			if (mirror_index >= BTRFS_MAX_MIRRORS)
				break;
			if (!sblocks_for_recheck[mirror_index].page_count)
				break;

			sblock_other = sblocks_for_recheck + mirror_index;
		} else {
			struct scrub_recover *r = sblock_bad->pagev[0]->recover;
			int max_allowed = r->bbio->num_stripes -
						r->bbio->num_tgtdevs;

			if (mirror_index >= max_allowed)
				break;
			if (!sblocks_for_recheck[1].page_count)
				break;

			/*
			 * For raid56 slot 1 of the recheck array is reused for
			 * every attempt; only the mirror number stored in its
			 * first page changes.
			 */
			ASSERT(failed_mirror_index == 0);
			sblock_other = sblocks_for_recheck + 1;
			sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
		}

		/* build and submit the bios, check checksums */
		scrub_recheck_block(fs_info, sblock_other, 0);

		if (!sblock_other->header_error &&
		    !sblock_other->checksum_error &&
		    sblock_other->no_io_error_seen) {
			if (sctx->is_dev_replace) {
				scrub_write_block_to_dev_replace(sblock_other);
				goto corrected_error;
			} else {
				ret = scrub_repair_block_from_good_copy(
						sblock_bad, sblock_other);
				if (!ret)
					goto corrected_error;
			}
		}
	}

	/*
	 * No fully good mirror was found. Without I/O errors in the bad block
	 * there is nothing to pick page by page, except for dev-replace, which
	 * still has to write something to the target.
	 */
	if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
		goto did_not_correct_error;

	/*
	 * In case of I/O errors in the area that is supposed to be
	 * repaired, continue by picking good copies of those pages.
	 * Select the good pages from mirrors to rewrite bad pages from
	 * the area to fix. Afterwards verify the checksum of the block
	 * that is supposed to be repaired. This verification step is
	 * only done for the purpose of statistic counting and for the
	 * final scrub report, whether errors remain.
	 * A perfect algorithm could make use of the checksum and try
	 * all possible combinations of pages from the different mirrors
	 * until the checksum verification succeeds. For example, when
	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
	 * of mirror #2 is readable but the final checksum test fails,
	 * then the 2nd page of mirror #3 could be tried, whether now
	 * the final checksum succeeds. But this would be a rare
	 * exception and is therefore not implemented. At least it is
	 * avoided that the good copy is overwritten.
	 * A more useful improvement would be to pick the sectors
	 * without I/O error based on sector sizes (512 bytes on legacy
	 * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
	 * mirror could be repaired by taking 512 byte of a different
	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
	 * area are unreadable.
	 */
	success = 1;
	for (page_num = 0; page_num < sblock_bad->page_count;
	     page_num++) {
		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
		struct scrub_block *sblock_other = NULL;

		/* skip no-io-error page in scrub */
		if (!page_bad->io_error && !sctx->is_dev_replace)
			continue;

		if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
			/*
			 * In case of dev replace, if raid56 rebuild process
			 * didn't work out correct data, then copy the content
			 * in sblock_bad to make sure target device is identical
			 * to source device, instead of writing garbage data in
			 * sblock_for_recheck array to target device.
			 */
			sblock_other = NULL;
		} else if (page_bad->io_error) {
			/* try to find no-io-error page in mirrors */
			for (mirror_index = 0;
			     mirror_index < BTRFS_MAX_MIRRORS &&
			     sblocks_for_recheck[mirror_index].page_count > 0;
			     mirror_index++) {
				if (!sblocks_for_recheck[mirror_index].
				    pagev[page_num]->io_error) {
					sblock_other = sblocks_for_recheck +
						       mirror_index;
					break;
				}
			}
			if (!sblock_other)
				success = 0;
		}

		if (sctx->is_dev_replace) {
			/*
			 * did not find a mirror to fetch the page
			 * from. scrub_write_page_to_dev_replace()
			 * handles this case (page->io_error), by
			 * filling the block with zeros before
			 * submitting the write request
			 */
			if (!sblock_other)
				sblock_other = sblock_bad;

			if (scrub_write_page_to_dev_replace(sblock_other,
							    page_num) != 0) {
				btrfs_dev_replace_stats_inc(
					&fs_info->dev_replace.num_write_errors);
				success = 0;
			}
		} else if (sblock_other) {
			ret = scrub_repair_page_from_good_copy(sblock_bad,
							       sblock_other,
							       page_num, 0);
			if (0 == ret)
				page_bad->io_error = 0;
			else
				success = 0;
		}
	}

	if (success && !sctx->is_dev_replace) {
		if (is_metadata || have_csum) {
			/*
			 * need to verify the checksum now that all
			 * sectors on disk are repaired (the write
			 * request for data to be repaired is on its way).
			 * Just be lazy and use scrub_recheck_block()
			 * which re-reads the data before the checksum
			 * is verified, but most likely the data comes out
			 * of the page cache.
			 */
			scrub_recheck_block(fs_info, sblock_bad, 1);
			if (!sblock_bad->header_error &&
			    !sblock_bad->checksum_error &&
			    sblock_bad->no_io_error_seen)
				goto corrected_error;
			else
				goto did_not_correct_error;
		} else {
			/* also entered by goto from the mirror loop above */
corrected_error:
			spin_lock(&sctx->stat_lock);
			sctx->stat.corrected_errors++;
			sblock_to_check->data_corrected = 1;
			spin_unlock(&sctx->stat_lock);
			btrfs_err_rl_in_rcu(fs_info,
				"fixed up error at logical %llu on dev %s",
				logical, rcu_str_deref(dev->name));
		}
	} else {
		/*
		 * Also entered by goto, including from the nodatasum path
		 * where sblocks_for_recheck may be NULL.
		 */
did_not_correct_error:
		spin_lock(&sctx->stat_lock);
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_err_rl_in_rcu(fs_info,
			"unable to fixup (regular) error at logical %llu on dev %s",
			logical, rcu_str_deref(dev->name));
	}

out:
	if (sblocks_for_recheck) {
		/*
		 * Release every page that was set up for the recheck together
		 * with the recover reference it holds, then the array itself.
		 */
		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
		     mirror_index++) {
			struct scrub_block *sblock = sblocks_for_recheck +
						     mirror_index;
			struct scrub_recover *recover;
			int page_index;

			for (page_index = 0; page_index < sblock->page_count;
			     page_index++) {
				sblock->pagev[page_index]->sblock = NULL;
				recover = sblock->pagev[page_index]->recover;
				if (recover) {
					scrub_put_recover(fs_info, recover);
					sblock->pagev[page_index]->recover =
									NULL;
				}
				scrub_page_put(sblock->pagev[page_index]);
			}
		}
		kfree(sblocks_for_recheck);
	}

	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
	if (ret < 0)
		return ret;
	return 0;
}

/*
 * Number of mirrors scrub sets up for a mapping: 2 for RAID5, 3 for RAID6,
 * and the number of stripes of @bbio for every other profile.
 */
static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
{
	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
		return 2;
	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
		return 3;
	else
		return (int)bbio->num_stripes;
}
u64 map_type, 1533 u64 *raid_map, 1534 u64 mapped_length, 1535 int nstripes, int mirror, 1536 int *stripe_index, 1537 u64 *stripe_offset) 1538 { 1539 int i; 1540 1541 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 1542 /* RAID5/6 */ 1543 for (i = 0; i < nstripes; i++) { 1544 if (raid_map[i] == RAID6_Q_STRIPE || 1545 raid_map[i] == RAID5_P_STRIPE) 1546 continue; 1547 1548 if (logical >= raid_map[i] && 1549 logical < raid_map[i] + mapped_length) 1550 break; 1551 } 1552 1553 *stripe_index = i; 1554 *stripe_offset = logical - raid_map[i]; 1555 } else { 1556 /* The other RAID type */ 1557 *stripe_index = mirror; 1558 *stripe_offset = 0; 1559 } 1560 } 1561 1562 static int scrub_setup_recheck_block(struct scrub_block *original_sblock, 1563 struct scrub_block *sblocks_for_recheck) 1564 { 1565 struct scrub_ctx *sctx = original_sblock->sctx; 1566 struct btrfs_fs_info *fs_info = sctx->fs_info; 1567 u64 length = original_sblock->page_count * PAGE_SIZE; 1568 u64 logical = original_sblock->pagev[0]->logical; 1569 u64 generation = original_sblock->pagev[0]->generation; 1570 u64 flags = original_sblock->pagev[0]->flags; 1571 u64 have_csum = original_sblock->pagev[0]->have_csum; 1572 struct scrub_recover *recover; 1573 struct btrfs_bio *bbio; 1574 u64 sublen; 1575 u64 mapped_length; 1576 u64 stripe_offset; 1577 int stripe_index; 1578 int page_index = 0; 1579 int mirror_index; 1580 int nmirrors; 1581 int ret; 1582 1583 /* 1584 * note: the two members refs and outstanding_pages 1585 * are not used (and not set) in the blocks that are used for 1586 * the recheck procedure 1587 */ 1588 1589 while (length > 0) { 1590 sublen = min_t(u64, length, PAGE_SIZE); 1591 mapped_length = sublen; 1592 bbio = NULL; 1593 1594 /* 1595 * with a length of PAGE_SIZE, each returned stripe 1596 * represents one mirror 1597 */ 1598 btrfs_bio_counter_inc_blocked(fs_info); 1599 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 1600 logical, &mapped_length, &bbio); 1601 if (ret || !bbio || 
mapped_length < sublen) { 1602 btrfs_put_bbio(bbio); 1603 btrfs_bio_counter_dec(fs_info); 1604 return -EIO; 1605 } 1606 1607 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); 1608 if (!recover) { 1609 btrfs_put_bbio(bbio); 1610 btrfs_bio_counter_dec(fs_info); 1611 return -ENOMEM; 1612 } 1613 1614 refcount_set(&recover->refs, 1); 1615 recover->bbio = bbio; 1616 recover->map_length = mapped_length; 1617 1618 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK); 1619 1620 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); 1621 1622 for (mirror_index = 0; mirror_index < nmirrors; 1623 mirror_index++) { 1624 struct scrub_block *sblock; 1625 struct scrub_page *page; 1626 1627 sblock = sblocks_for_recheck + mirror_index; 1628 sblock->sctx = sctx; 1629 1630 page = kzalloc(sizeof(*page), GFP_NOFS); 1631 if (!page) { 1632 leave_nomem: 1633 spin_lock(&sctx->stat_lock); 1634 sctx->stat.malloc_errors++; 1635 spin_unlock(&sctx->stat_lock); 1636 scrub_put_recover(fs_info, recover); 1637 return -ENOMEM; 1638 } 1639 scrub_page_get(page); 1640 sblock->pagev[page_index] = page; 1641 page->sblock = sblock; 1642 page->flags = flags; 1643 page->generation = generation; 1644 page->logical = logical; 1645 page->have_csum = have_csum; 1646 if (have_csum) 1647 memcpy(page->csum, 1648 original_sblock->pagev[0]->csum, 1649 sctx->csum_size); 1650 1651 scrub_stripe_index_and_offset(logical, 1652 bbio->map_type, 1653 bbio->raid_map, 1654 mapped_length, 1655 bbio->num_stripes - 1656 bbio->num_tgtdevs, 1657 mirror_index, 1658 &stripe_index, 1659 &stripe_offset); 1660 page->physical = bbio->stripes[stripe_index].physical + 1661 stripe_offset; 1662 page->dev = bbio->stripes[stripe_index].dev; 1663 1664 BUG_ON(page_index >= original_sblock->page_count); 1665 page->physical_for_dev_replace = 1666 original_sblock->pagev[page_index]-> 1667 physical_for_dev_replace; 1668 /* for missing devices, dev->bdev is NULL */ 1669 page->mirror_num = mirror_index + 1; 1670 sblock->page_count++; 1671 
page->page = alloc_page(GFP_NOFS); 1672 if (!page->page) 1673 goto leave_nomem; 1674 1675 scrub_get_recover(recover); 1676 page->recover = recover; 1677 } 1678 scrub_put_recover(fs_info, recover); 1679 length -= sublen; 1680 logical += sublen; 1681 page_index++; 1682 } 1683 1684 return 0; 1685 } 1686 1687 static void scrub_bio_wait_endio(struct bio *bio) 1688 { 1689 complete(bio->bi_private); 1690 } 1691 1692 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, 1693 struct bio *bio, 1694 struct scrub_page *page) 1695 { 1696 DECLARE_COMPLETION_ONSTACK(done); 1697 int ret; 1698 int mirror_num; 1699 1700 bio->bi_iter.bi_sector = page->logical >> 9; 1701 bio->bi_private = &done; 1702 bio->bi_end_io = scrub_bio_wait_endio; 1703 1704 mirror_num = page->sblock->pagev[0]->mirror_num; 1705 ret = raid56_parity_recover(fs_info, bio, page->recover->bbio, 1706 page->recover->map_length, 1707 mirror_num, 0); 1708 if (ret) 1709 return ret; 1710 1711 wait_for_completion_io(&done); 1712 return blk_status_to_errno(bio->bi_status); 1713 } 1714 1715 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, 1716 struct scrub_block *sblock) 1717 { 1718 struct scrub_page *first_page = sblock->pagev[0]; 1719 struct bio *bio; 1720 int page_num; 1721 1722 /* All pages in sblock belong to the same stripe on the same device. 
*/ 1723 ASSERT(first_page->dev); 1724 if (!first_page->dev->bdev) 1725 goto out; 1726 1727 bio = btrfs_io_bio_alloc(BIO_MAX_PAGES); 1728 bio_set_dev(bio, first_page->dev->bdev); 1729 1730 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1731 struct scrub_page *page = sblock->pagev[page_num]; 1732 1733 WARN_ON(!page->page); 1734 bio_add_page(bio, page->page, PAGE_SIZE, 0); 1735 } 1736 1737 if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) { 1738 bio_put(bio); 1739 goto out; 1740 } 1741 1742 bio_put(bio); 1743 1744 scrub_recheck_block_checksum(sblock); 1745 1746 return; 1747 out: 1748 for (page_num = 0; page_num < sblock->page_count; page_num++) 1749 sblock->pagev[page_num]->io_error = 1; 1750 1751 sblock->no_io_error_seen = 0; 1752 } 1753 1754 /* 1755 * this function will check the on disk data for checksum errors, header 1756 * errors and read I/O errors. If any I/O errors happen, the exact pages 1757 * which are errored are marked as being bad. The goal is to enable scrub 1758 * to take those pages that are not errored from all the mirrors so that 1759 * the pages that are errored in the just handled mirror can be repaired. 
1760 */ 1761 static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 1762 struct scrub_block *sblock, 1763 int retry_failed_mirror) 1764 { 1765 int page_num; 1766 1767 sblock->no_io_error_seen = 1; 1768 1769 /* short cut for raid56 */ 1770 if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0])) 1771 return scrub_recheck_block_on_raid56(fs_info, sblock); 1772 1773 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1774 struct bio *bio; 1775 struct scrub_page *page = sblock->pagev[page_num]; 1776 1777 if (page->dev->bdev == NULL) { 1778 page->io_error = 1; 1779 sblock->no_io_error_seen = 0; 1780 continue; 1781 } 1782 1783 WARN_ON(!page->page); 1784 bio = btrfs_io_bio_alloc(1); 1785 bio_set_dev(bio, page->dev->bdev); 1786 1787 bio_add_page(bio, page->page, PAGE_SIZE, 0); 1788 bio->bi_iter.bi_sector = page->physical >> 9; 1789 bio->bi_opf = REQ_OP_READ; 1790 1791 if (btrfsic_submit_bio_wait(bio)) { 1792 page->io_error = 1; 1793 sblock->no_io_error_seen = 0; 1794 } 1795 1796 bio_put(bio); 1797 } 1798 1799 if (sblock->no_io_error_seen) 1800 scrub_recheck_block_checksum(sblock); 1801 } 1802 1803 static inline int scrub_check_fsid(u8 fsid[], 1804 struct scrub_page *spage) 1805 { 1806 struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices; 1807 int ret; 1808 1809 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 1810 return !ret; 1811 } 1812 1813 static void scrub_recheck_block_checksum(struct scrub_block *sblock) 1814 { 1815 sblock->header_error = 0; 1816 sblock->checksum_error = 0; 1817 sblock->generation_error = 0; 1818 1819 if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA) 1820 scrub_checksum_data(sblock); 1821 else 1822 scrub_checksum_tree_block(sblock); 1823 } 1824 1825 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 1826 struct scrub_block *sblock_good) 1827 { 1828 int page_num; 1829 int ret = 0; 1830 1831 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1832 int 
ret_sub; 1833 1834 ret_sub = scrub_repair_page_from_good_copy(sblock_bad, 1835 sblock_good, 1836 page_num, 1); 1837 if (ret_sub) 1838 ret = ret_sub; 1839 } 1840 1841 return ret; 1842 } 1843 1844 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 1845 struct scrub_block *sblock_good, 1846 int page_num, int force_write) 1847 { 1848 struct scrub_page *page_bad = sblock_bad->pagev[page_num]; 1849 struct scrub_page *page_good = sblock_good->pagev[page_num]; 1850 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; 1851 1852 BUG_ON(page_bad->page == NULL); 1853 BUG_ON(page_good->page == NULL); 1854 if (force_write || sblock_bad->header_error || 1855 sblock_bad->checksum_error || page_bad->io_error) { 1856 struct bio *bio; 1857 int ret; 1858 1859 if (!page_bad->dev->bdev) { 1860 btrfs_warn_rl(fs_info, 1861 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); 1862 return -EIO; 1863 } 1864 1865 bio = btrfs_io_bio_alloc(1); 1866 bio_set_dev(bio, page_bad->dev->bdev); 1867 bio->bi_iter.bi_sector = page_bad->physical >> 9; 1868 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 1869 1870 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); 1871 if (PAGE_SIZE != ret) { 1872 bio_put(bio); 1873 return -EIO; 1874 } 1875 1876 if (btrfsic_submit_bio_wait(bio)) { 1877 btrfs_dev_stat_inc_and_print(page_bad->dev, 1878 BTRFS_DEV_STAT_WRITE_ERRS); 1879 btrfs_dev_replace_stats_inc( 1880 &fs_info->dev_replace.num_write_errors); 1881 bio_put(bio); 1882 return -EIO; 1883 } 1884 bio_put(bio); 1885 } 1886 1887 return 0; 1888 } 1889 1890 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) 1891 { 1892 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; 1893 int page_num; 1894 1895 /* 1896 * This block is used for the check of the parity on the source device, 1897 * so the data needn't be written into the destination device. 
1898 */ 1899 if (sblock->sparity) 1900 return; 1901 1902 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1903 int ret; 1904 1905 ret = scrub_write_page_to_dev_replace(sblock, page_num); 1906 if (ret) 1907 btrfs_dev_replace_stats_inc( 1908 &fs_info->dev_replace.num_write_errors); 1909 } 1910 } 1911 1912 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, 1913 int page_num) 1914 { 1915 struct scrub_page *spage = sblock->pagev[page_num]; 1916 1917 BUG_ON(spage->page == NULL); 1918 if (spage->io_error) { 1919 void *mapped_buffer = kmap_atomic(spage->page); 1920 1921 clear_page(mapped_buffer); 1922 flush_dcache_page(spage->page); 1923 kunmap_atomic(mapped_buffer); 1924 } 1925 return scrub_add_page_to_wr_bio(sblock->sctx, spage); 1926 } 1927 1928 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, 1929 struct scrub_page *spage) 1930 { 1931 struct scrub_bio *sbio; 1932 int ret; 1933 1934 mutex_lock(&sctx->wr_lock); 1935 again: 1936 if (!sctx->wr_curr_bio) { 1937 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio), 1938 GFP_KERNEL); 1939 if (!sctx->wr_curr_bio) { 1940 mutex_unlock(&sctx->wr_lock); 1941 return -ENOMEM; 1942 } 1943 sctx->wr_curr_bio->sctx = sctx; 1944 sctx->wr_curr_bio->page_count = 0; 1945 } 1946 sbio = sctx->wr_curr_bio; 1947 if (sbio->page_count == 0) { 1948 struct bio *bio; 1949 1950 sbio->physical = spage->physical_for_dev_replace; 1951 sbio->logical = spage->logical; 1952 sbio->dev = sctx->wr_tgtdev; 1953 bio = sbio->bio; 1954 if (!bio) { 1955 bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio); 1956 sbio->bio = bio; 1957 } 1958 1959 bio->bi_private = sbio; 1960 bio->bi_end_io = scrub_wr_bio_end_io; 1961 bio_set_dev(bio, sbio->dev->bdev); 1962 bio->bi_iter.bi_sector = sbio->physical >> 9; 1963 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 1964 sbio->status = 0; 1965 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1966 spage->physical_for_dev_replace || 1967 sbio->logical + sbio->page_count * PAGE_SIZE != 
1968 spage->logical) { 1969 scrub_wr_submit(sctx); 1970 goto again; 1971 } 1972 1973 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); 1974 if (ret != PAGE_SIZE) { 1975 if (sbio->page_count < 1) { 1976 bio_put(sbio->bio); 1977 sbio->bio = NULL; 1978 mutex_unlock(&sctx->wr_lock); 1979 return -EIO; 1980 } 1981 scrub_wr_submit(sctx); 1982 goto again; 1983 } 1984 1985 sbio->pagev[sbio->page_count] = spage; 1986 scrub_page_get(spage); 1987 sbio->page_count++; 1988 if (sbio->page_count == sctx->pages_per_wr_bio) 1989 scrub_wr_submit(sctx); 1990 mutex_unlock(&sctx->wr_lock); 1991 1992 return 0; 1993 } 1994 1995 static void scrub_wr_submit(struct scrub_ctx *sctx) 1996 { 1997 struct scrub_bio *sbio; 1998 1999 if (!sctx->wr_curr_bio) 2000 return; 2001 2002 sbio = sctx->wr_curr_bio; 2003 sctx->wr_curr_bio = NULL; 2004 WARN_ON(!sbio->bio->bi_disk); 2005 scrub_pending_bio_inc(sctx); 2006 /* process all writes in a single worker thread. Then the block layer 2007 * orders the requests before sending them to the driver which 2008 * doubled the write performance on spinning disks when measured 2009 * with Linux 3.5 */ 2010 btrfsic_submit_bio(sbio->bio); 2011 } 2012 2013 static void scrub_wr_bio_end_io(struct bio *bio) 2014 { 2015 struct scrub_bio *sbio = bio->bi_private; 2016 struct btrfs_fs_info *fs_info = sbio->dev->fs_info; 2017 2018 sbio->status = bio->bi_status; 2019 sbio->bio = bio; 2020 2021 btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, 2022 scrub_wr_bio_end_io_worker, NULL, NULL); 2023 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); 2024 } 2025 2026 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) 2027 { 2028 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2029 struct scrub_ctx *sctx = sbio->sctx; 2030 int i; 2031 2032 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); 2033 if (sbio->status) { 2034 struct btrfs_dev_replace *dev_replace = 2035 &sbio->sctx->fs_info->dev_replace; 2036 2037 for (i = 0; i < 
sbio->page_count; i++) { 2038 struct scrub_page *spage = sbio->pagev[i]; 2039 2040 spage->io_error = 1; 2041 btrfs_dev_replace_stats_inc(&dev_replace-> 2042 num_write_errors); 2043 } 2044 } 2045 2046 for (i = 0; i < sbio->page_count; i++) 2047 scrub_page_put(sbio->pagev[i]); 2048 2049 bio_put(sbio->bio); 2050 kfree(sbio); 2051 scrub_pending_bio_dec(sctx); 2052 } 2053 2054 static int scrub_checksum(struct scrub_block *sblock) 2055 { 2056 u64 flags; 2057 int ret; 2058 2059 /* 2060 * No need to initialize these stats currently, 2061 * because this function only use return value 2062 * instead of these stats value. 2063 * 2064 * Todo: 2065 * always use stats 2066 */ 2067 sblock->header_error = 0; 2068 sblock->generation_error = 0; 2069 sblock->checksum_error = 0; 2070 2071 WARN_ON(sblock->page_count < 1); 2072 flags = sblock->pagev[0]->flags; 2073 ret = 0; 2074 if (flags & BTRFS_EXTENT_FLAG_DATA) 2075 ret = scrub_checksum_data(sblock); 2076 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 2077 ret = scrub_checksum_tree_block(sblock); 2078 else if (flags & BTRFS_EXTENT_FLAG_SUPER) 2079 (void)scrub_checksum_super(sblock); 2080 else 2081 WARN_ON(1); 2082 if (ret) 2083 scrub_handle_errored_block(sblock); 2084 2085 return ret; 2086 } 2087 2088 static int scrub_checksum_data(struct scrub_block *sblock) 2089 { 2090 struct scrub_ctx *sctx = sblock->sctx; 2091 u8 csum[BTRFS_CSUM_SIZE]; 2092 u8 *on_disk_csum; 2093 struct page *page; 2094 void *buffer; 2095 u32 crc = ~(u32)0; 2096 u64 len; 2097 int index; 2098 2099 BUG_ON(sblock->page_count < 1); 2100 if (!sblock->pagev[0]->have_csum) 2101 return 0; 2102 2103 on_disk_csum = sblock->pagev[0]->csum; 2104 page = sblock->pagev[0]->page; 2105 buffer = kmap_atomic(page); 2106 2107 len = sctx->fs_info->sectorsize; 2108 index = 0; 2109 for (;;) { 2110 u64 l = min_t(u64, len, PAGE_SIZE); 2111 2112 crc = btrfs_csum_data(buffer, crc, l); 2113 kunmap_atomic(buffer); 2114 len -= l; 2115 if (len == 0) 2116 break; 2117 index++; 2118 BUG_ON(index 
>= sblock->page_count);
		BUG_ON(!sblock->pagev[index]->page);
		page = sblock->pagev[index]->page;
		buffer = kmap_atomic(page);
	}

	btrfs_csum_final(crc, csum);
	/* only the first sctx->csum_size bytes of the checksum are compared */
	if (memcmp(csum, on_disk_csum, sctx->csum_size))
		sblock->checksum_error = 1;

	return sblock->checksum_error;
}

/*
 * Validate a metadata block held in @sblock.
 *
 * The header's bytenr, generation, fsid and chunk tree uuid are compared
 * with the expected values, and everything behind the csum field
 * (nodesize - BTRFS_CSUM_SIZE bytes) is checksummed and compared with the
 * checksum stored in the header.
 *
 * Sets header_error, generation_error and checksum_error in @sblock as
 * appropriate and returns non-zero if header_error or checksum_error is set.
 */
static int scrub_checksum_tree_block(struct scrub_block *sblock)
{
	struct scrub_ctx *sctx = sblock->sctx;
	struct btrfs_header *h;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	struct page *page;
	void *mapped_buffer;
	u64 mapped_size;
	void *p;
	u32 crc = ~(u32)0;
	u64 len;
	int index;

	BUG_ON(sblock->page_count < 1);
	page = sblock->pagev[0]->page;
	mapped_buffer = kmap_atomic(page);
	h = (struct btrfs_header *)mapped_buffer;
	/* keep a copy of the stored checksum, the mapping is dropped below */
	memcpy(on_disk_csum, h->csum, sctx->csum_size);

	/*
	 * we don't use the getter functions here, as we
	 * a) don't have an extent buffer and
	 * b) the page is already kmapped
	 */
	if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
		sblock->header_error = 1;

	if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
		sblock->header_error = 1;
		sblock->generation_error = 1;
	}

	if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
		sblock->header_error = 1;

	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
		   BTRFS_UUID_SIZE))
		sblock->header_error = 1;

	/* the first BTRFS_CSUM_SIZE bytes of the first page are skipped */
	len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
	index = 0;
	for (;;) {
		u64 l = min_t(u64, len, mapped_size);

		crc = btrfs_csum_data(p, crc, l);
		kunmap_atomic(mapped_buffer);
		len -= l;
		if (len == 0)
			break;
		index++;
		BUG_ON(index >= sblock->page_count);
BUG_ON(!sblock->pagev[index]->page);
		page = sblock->pagev[index]->page;
		mapped_buffer = kmap_atomic(page);
		/* pages after the first one are checksummed in full */
		mapped_size = PAGE_SIZE;
		p = mapped_buffer;
	}

	btrfs_csum_final(crc, calculated_csum);
	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
		sblock->checksum_error = 1;

	return sblock->header_error || sblock->checksum_error;
}

/*
 * Validate a super block copy held in @sblock.
 *
 * bytenr, generation and fsid are compared with the expected values and the
 * BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE bytes behind the csum field are
 * checksummed. Failures are only accounted (sctx->stat.super_errors and the
 * device statistics); nothing is repaired here.
 *
 * Returns the number of failed checks (0 if the super block is fine).
 */
static int scrub_checksum_super(struct scrub_block *sblock)
{
	struct btrfs_super_block *s;
	struct scrub_ctx *sctx = sblock->sctx;
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	struct page *page;
	void *mapped_buffer;
	u64 mapped_size;
	void *p;
	u32 crc = ~(u32)0;
	int fail_gen = 0;	/* generation mismatches */
	int fail_cor = 0;	/* bytenr, fsid or checksum mismatches */
	u64 len;
	int index;

	BUG_ON(sblock->page_count < 1);
	page = sblock->pagev[0]->page;
	mapped_buffer = kmap_atomic(page);
	s = (struct btrfs_super_block *)mapped_buffer;
	/* keep a copy of the stored checksum, the mapping is dropped below */
	memcpy(on_disk_csum, s->csum, sctx->csum_size);

	if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
		++fail_cor;

	if (sblock->pagev[0]->generation != btrfs_super_generation(s))
		++fail_gen;

	if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
		++fail_cor;

	/* the first BTRFS_CSUM_SIZE bytes of the first page are skipped */
	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
	index = 0;
	for (;;) {
		u64 l = min_t(u64, len, mapped_size);

		crc = btrfs_csum_data(p, crc, l);
		kunmap_atomic(mapped_buffer);
		len -= l;
		if (len == 0)
			break;
		index++;
		BUG_ON(index >= sblock->page_count);
		BUG_ON(!sblock->pagev[index]->page);
		page = sblock->pagev[index]->page;
		mapped_buffer = kmap_atomic(page);
		mapped_size = PAGE_SIZE;
		p = mapped_buffer;
	}

	btrfs_csum_final(crc, calculated_csum);
	if (memcmp(calculated_csum, on_disk_csum,
sctx->csum_size))
		++fail_cor;

	if (fail_cor + fail_gen) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sctx->stat_lock);
		++sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);
		/* a corruption takes precedence over a generation mismatch */
		if (fail_cor)
			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
				BTRFS_DEV_STAT_CORRUPTION_ERRS);
		else
			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
				BTRFS_DEV_STAT_GENERATION_ERRS);
	}

	return fail_cor + fail_gen;
}

/* Take a reference on @sblock. */
static void scrub_block_get(struct scrub_block *sblock)
{
	refcount_inc(&sblock->refs);
}

/*
 * Drop a reference on @sblock. When the last reference goes away, the
 * reference on the attached scrub_parity (if any) is dropped, one reference
 * on each of the block's pages is dropped and the block itself is freed.
 */
static void scrub_block_put(struct scrub_block *sblock)
{
	if (refcount_dec_and_test(&sblock->refs)) {
		int i;

		if (sblock->sparity)
			scrub_parity_put(sblock->sparity);

		for (i = 0; i < sblock->page_count; i++)
			scrub_page_put(sblock->pagev[i]);
		kfree(sblock);
	}
}

/* Take a reference on @spage. */
static void scrub_page_get(struct scrub_page *spage)
{
	atomic_inc(&spage->refs);
}

/*
 * Drop a reference on @spage. The last reference frees the backing page
 * (if one was allocated) and the scrub_page itself.
 */
static void scrub_page_put(struct scrub_page *spage)
{
	if (atomic_dec_and_test(&spage->refs)) {
		if (spage->page)
			__free_page(spage->page);
		kfree(spage);
	}
}

/*
 * Submit the read bio that is currently being filled (sctx->curr), if any.
 * sctx->curr is reset to -1 and the bio is accounted as pending before it is
 * handed to btrfsic_submit_bio().
 */
static void scrub_submit(struct scrub_ctx *sctx)
{
	struct scrub_bio *sbio;

	if (sctx->curr == -1)
		return;

	sbio = sctx->bios[sctx->curr];
	sctx->curr = -1;
	scrub_pending_bio_inc(sctx);
	btrfsic_submit_bio(sbio->bio);
}

/*
 * Append @spage to the read bio that is currently being assembled.
 *
 * If no scrub_bio is in use, one is taken from the free list, waiting on
 * sctx->list_wait until one is available. If the current bio is not
 * physically and logically contiguous with @spage, targets a different
 * device, or cannot take another page, it is submitted and the attempt is
 * repeated with a fresh one. A bio that reaches sctx->pages_per_rd_bio pages
 * is submitted right away.
 *
 * On success the block of @spage gains one reference and one outstanding
 * page. Returns 0 on success, -EIO if the page cannot even be added to an
 * empty bio.
 */
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage)
{
	struct scrub_block *sblock = spage->sblock;
	struct scrub_bio *sbio;
	int ret;

again:
	/*
	 * grab a fresh bio or wait for one to become available
	 */
	while (sctx->curr == -1) {
		spin_lock(&sctx->list_lock);
sctx->curr = sctx->first_free;
		if (sctx->curr != -1) {
			/* unlink the scrub_bio from the free list and reset it */
			sctx->first_free = sctx->bios[sctx->curr]->next_free;
			sctx->bios[sctx->curr]->next_free = -1;
			sctx->bios[sctx->curr]->page_count = 0;
			spin_unlock(&sctx->list_lock);
		} else {
			spin_unlock(&sctx->list_lock);
			wait_event(sctx->list_wait, sctx->first_free != -1);
		}
	}
	sbio = sctx->bios[sctx->curr];
	if (sbio->page_count == 0) {
		struct bio *bio;

		/* first page: it defines start address and device of the bio */
		sbio->physical = spage->physical;
		sbio->logical = spage->logical;
		sbio->dev = spage->dev;
		bio = sbio->bio;
		if (!bio) {
			bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
			sbio->bio = bio;
		}

		bio->bi_private = sbio;
		bio->bi_end_io = scrub_bio_end_io;
		bio_set_dev(bio, sbio->dev->bdev);
		/* bi_sector is in units of 512 bytes */
		bio->bi_iter.bi_sector = sbio->physical >> 9;
		bio_set_op_attrs(bio, REQ_OP_READ, 0);
		sbio->status = 0;
	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
		   spage->physical ||
		   sbio->logical + sbio->page_count * PAGE_SIZE !=
		   spage->logical ||
		   sbio->dev != spage->dev) {
		/* not contiguous or other device: send what we have so far */
		scrub_submit(sctx);
		goto again;
	}

	sbio->pagev[sbio->page_count] = spage;
	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
	if (ret != PAGE_SIZE) {
		if (sbio->page_count < 1) {
			/* even an empty bio refuses the page, give up */
			bio_put(sbio->bio);
			sbio->bio = NULL;
			return -EIO;
		}
		/* bio did not take the page: submit it and retry with a new one */
		scrub_submit(sctx);
		goto again;
	}

	scrub_block_get(sblock); /* one for the page added to the bio */
	atomic_inc(&sblock->outstanding_pages);
	sbio->page_count++;
	if (sbio->page_count == sctx->pages_per_rd_bio)
		scrub_submit(sctx);

	return 0;
}

/*
 * Completion handler of the rebuild bio set up in
 * scrub_missing_raid56_pages(): records an I/O error in the scrub_block,
 * drops the bio and defers the remaining work to sblock->work on the scrub
 * workers.
 */
static void scrub_missing_raid56_end_io(struct bio *bio)
{
	struct scrub_block *sblock = bio->bi_private;
	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;

	if (bio->bi_status)
		sblock->no_io_error_seen = 0;

	bio_put(bio);

btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
}

/*
 * Work function run after the RAID56 rebuild of a block from a missing
 * device has completed (queued by scrub_missing_raid56_end_io()).
 *
 * If no I/O error was seen the block's checksum is rechecked. An I/O error
 * is accounted as read error, a header/checksum error as uncorrectable
 * error; a block without errors is written to the dev-replace target.
 * Finally the block reference and the pending-bio count taken in
 * scrub_missing_raid56_pages() are dropped.
 */
static void scrub_missing_raid56_worker(struct btrfs_work *work)
{
	struct scrub_block *sblock = container_of(work, struct scrub_block, work);
	struct scrub_ctx *sctx = sblock->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u64 logical;
	struct btrfs_device *dev;

	/* fetched up front, they are only needed for the error messages */
	logical = sblock->pagev[0]->logical;
	dev = sblock->pagev[0]->dev;

	if (sblock->no_io_error_seen)
		scrub_recheck_block_checksum(sblock);

	if (!sblock->no_io_error_seen) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_err_rl_in_rcu(fs_info,
			"IO error rebuilding logical %llu for dev %s",
			logical, rcu_str_deref(dev->name));
	} else if (sblock->header_error || sblock->checksum_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_err_rl_in_rcu(fs_info,
			"failed to rebuild valid logical %llu for dev %s",
			logical, rcu_str_deref(dev->name));
	} else {
		scrub_write_block_to_dev_replace(sblock);
	}

	scrub_block_put(sblock);

	if (sctx->is_dev_replace && sctx->flush_all_writes) {
		mutex_lock(&sctx->wr_lock);
		scrub_wr_submit(sctx);
		mutex_unlock(&sctx->wr_lock);
	}

	scrub_pending_bio_dec(sctx);
}

/*
 * Rebuild the pages of @sblock, which belong to a missing device, by means
 * of a RAID56 "missing" rbio.
 *
 * The block is mapped with BTRFS_MAP_GET_READ_MIRRORS, its pages are
 * attached to the rbio and the rbio is submitted; completion is handled by
 * scrub_missing_raid56_end_io() and scrub_missing_raid56_worker(). Every
 * failure on the way is accounted in sctx->stat.malloc_errors.
 */
static void scrub_missing_raid56_pages(struct scrub_block *sblock)
{
	struct scrub_ctx *sctx = sblock->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u64 length = sblock->page_count * PAGE_SIZE;
	u64 logical = sblock->pagev[0]->logical;
	struct btrfs_bio *bbio = NULL;
	struct bio *bio;
	struct btrfs_raid_bio *rbio;
	int ret;
	int i;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			&length, &bbio);
	if (ret || !bbio ||
!bbio->raid_map)
		goto bbio_out;

	if (WARN_ON(!sctx->is_dev_replace ||
		    !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
		/*
		 * We shouldn't be scrubbing a missing device. Even for dev
		 * replace, we should only get here for RAID 5/6. We either
		 * managed to mount something with no mirrors remaining or
		 * there's a bug in scrub_remap_extent()/btrfs_map_block().
		 */
		goto bbio_out;
	}

	/* the bio carries no pages itself, it only transports the completion */
	bio = btrfs_io_bio_alloc(0);
	bio->bi_iter.bi_sector = logical >> 9;
	bio->bi_private = sblock;
	bio->bi_end_io = scrub_missing_raid56_end_io;

	rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
	if (!rbio)
		goto rbio_out;

	for (i = 0; i < sblock->page_count; i++) {
		struct scrub_page *spage = sblock->pagev[i];

		raid56_add_scrub_pages(rbio, spage->page, spage->logical);
	}

	btrfs_init_work(&sblock->work, btrfs_scrub_helper,
			scrub_missing_raid56_worker, NULL, NULL);
	/* both are released again in scrub_missing_raid56_worker() */
	scrub_block_get(sblock);
	scrub_pending_bio_inc(sctx);
	raid56_submit_missing_rbio(rbio);
	return;

rbio_out:
	bio_put(bio);
bbio_out:
	btrfs_bio_counter_dec(fs_info);
	btrfs_put_bbio(bbio);
	spin_lock(&sctx->stat_lock);
	sctx->stat.malloc_errors++;
	spin_unlock(&sctx->stat_lock);
}

/*
 * Allocate a scrub_block with one scrub_page per PAGE_SIZE piece of
 * [@logical, @logical + @len) and queue the pages for reading. If @dev is
 * flagged as missing, the block is handed to scrub_missing_raid56_pages()
 * instead.
 *
 * @csum:  if not NULL, copied into every page and have_csum is set
 * @force: if non-zero, the current read bio is submitted right after the
 *         pages have been queued
 *
 * Returns 0 on success, -ENOMEM if an allocation fails, or the error
 * returned by scrub_add_page_to_rd_bio().
 */
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum, int force,
		       u64 physical_for_dev_replace)
{
	struct scrub_block *sblock;
	int index;

	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
	if (!sblock) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		return -ENOMEM;
	}

	/* one ref inside this function, plus one for each page added to
	 * a bio later on */
	refcount_set(&sblock->refs, 1);
	sblock->sctx = sctx;
sblock->no_io_error_seen = 1;

	/* set up one scrub_page per PAGE_SIZE piece of the range */
	for (index = 0; len > 0; index++) {
		struct scrub_page *spage;
		u64 l = min_t(u64, len, PAGE_SIZE);

		spage = kzalloc(sizeof(*spage), GFP_KERNEL);
		if (!spage) {
leave_nomem:
			/* also the target of a failed alloc_page() below */
			spin_lock(&sctx->stat_lock);
			sctx->stat.malloc_errors++;
			spin_unlock(&sctx->stat_lock);
			scrub_block_put(sblock);
			return -ENOMEM;
		}
		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
		scrub_page_get(spage);
		sblock->pagev[index] = spage;
		spage->sblock = sblock;
		spage->dev = dev;
		spage->flags = flags;
		spage->generation = gen;
		spage->logical = logical;
		spage->physical = physical;
		spage->physical_for_dev_replace = physical_for_dev_replace;
		spage->mirror_num = mirror_num;
		if (csum) {
			spage->have_csum = 1;
			memcpy(spage->csum, csum, sctx->csum_size);
		} else {
			spage->have_csum = 0;
		}
		/*
		 * Counted before the page allocation so that the
		 * scrub_block_put() in the leave_nomem path releases this
		 * scrub_page as well.
		 */
		sblock->page_count++;
		spage->page = alloc_page(GFP_KERNEL);
		if (!spage->page)
			goto leave_nomem;
		len -= l;
		logical += l;
		physical += l;
		physical_for_dev_replace += l;
	}

	WARN_ON(sblock->page_count == 0);
	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
		/*
		 * This case should only be hit for RAID 5/6 device replace. See
		 * the comment in scrub_missing_raid56_pages() for details.
*/
		scrub_missing_raid56_pages(sblock);
	} else {
		for (index = 0; index < sblock->page_count; index++) {
			struct scrub_page *spage = sblock->pagev[index];
			int ret;

			ret = scrub_add_page_to_rd_bio(sctx, spage);
			if (ret) {
				scrub_block_put(sblock);
				return ret;
			}
		}

		if (force)
			scrub_submit(sctx);
	}

	/* last one frees, either here or in bio completion for last page */
	scrub_block_put(sblock);
	return 0;
}

/*
 * Completion handler of the scrub read bios (installed in
 * scrub_add_page_to_rd_bio()): stores the bio status in the scrub_bio and
 * queues sbio->work on the scrub workers.
 */
static void scrub_bio_end_io(struct bio *bio)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;

	sbio->status = bio->bi_status;
	sbio->bio = bio;

	btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
}

/*
 * Work function for the scrub_bio that embeds @work.
 *
 * An I/O error of the bio is propagated to all of its pages and their
 * blocks. Every block whose last outstanding page has completed is passed to
 * scrub_block_complete(), and the per-page block reference is dropped. The
 * scrub_bio is then put back on the free list and the pending-bio count is
 * decremented.
 */
static void scrub_bio_end_io_worker(struct btrfs_work *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_ctx *sctx = sbio->sctx;
	int i;

	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
	if (sbio->status) {
		for (i = 0; i < sbio->page_count; i++) {
			struct scrub_page *spage = sbio->pagev[i];

			spage->io_error = 1;
			spage->sblock->no_io_error_seen = 0;
		}
	}

	/* now complete the scrub_block items that have all pages completed */
	for (i = 0; i < sbio->page_count; i++) {
		struct scrub_page *spage = sbio->pagev[i];
		struct scrub_block *sblock = spage->sblock;

		if (atomic_dec_and_test(&sblock->outstanding_pages))
			scrub_block_complete(sblock);
		scrub_block_put(sblock);
	}

	bio_put(sbio->bio);
	sbio->bio = NULL;
	/* push the scrub_bio onto the head of the free list */
	spin_lock(&sctx->list_lock);
	sbio->next_free = sctx->first_free;
	sctx->first_free = sbio->index;
	spin_unlock(&sctx->list_lock);

	if (sctx->is_dev_replace && sctx->flush_all_writes) {
		mutex_lock(&sctx->wr_lock);
		scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
	}

	scrub_pending_bio_dec(sctx);
}

/*
 * Set the bits of @bitmap (one bit per sector, sparity->nsectors bits in
 * total) that correspond to the range [@start, @start + @len).
 *
 * A range of at least one stripe length sets all bits. Otherwise the offset
 * of @start within its stripe (relative to sparity->logic_start) selects the
 * first bit; a range that runs past the last bit wraps around to bit 0.
 */
static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
				       unsigned long *bitmap,
				       u64 start, u64 len)
{
	u64 offset;
	u64 nsectors64;
	u32 nsectors;
	int sectorsize = sparity->sctx->fs_info->sectorsize;

	if (len >= sparity->stripe_len) {
		bitmap_set(bitmap, 0, sparity->nsectors);
		return;
	}

	start -= sparity->logic_start;
	/* offset: byte offset inside the stripe, then converted to sectors */
	start = div64_u64_rem(start, sparity->stripe_len, &offset);
	offset = div_u64(offset, sectorsize);
	nsectors64 = div_u64(len, sectorsize);

	ASSERT(nsectors64 < UINT_MAX);
	nsectors = (u32)nsectors64;

	if (offset + nsectors <= sparity->nsectors) {
		bitmap_set(bitmap, offset, nsectors);
		return;
	}

	/* the range wraps: mark up to the last bit, the rest from bit 0 */
	bitmap_set(bitmap, offset, sparity->nsectors - offset);
	bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
}

/* Mark the sectors of [@start, @start + @len) in the error bitmap. */
static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
						   u64 start, u64 len)
{
	__scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
}

/* Mark the sectors of [@start, @start + @len) in the data bitmap. */
static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
						  u64 start, u64 len)
{
	__scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
}

/*
 * Called once all pages of @sblock have been read (see
 * scrub_bio_end_io_worker()).
 *
 * A block with an I/O error goes straight to scrub_handle_errored_block();
 * otherwise it is verified with scrub_checksum() and, if it is fine and a
 * dev replace is running, written to the replace target. For blocks that
 * belong to a parity scrub, a corrupted block that was not corrected is
 * marked in the error bitmap of its scrub_parity.
 */
static void scrub_block_complete(struct scrub_block *sblock)
{
	int corrupted = 0;

	if (!sblock->no_io_error_seen) {
		corrupted = 1;
		scrub_handle_errored_block(sblock);
	} else {
		/*
		 * In the dev replace case: if the block has a checksum
		 * error it is written via the repair mechanism, otherwise
		 * it is written here.
*/
		corrupted = scrub_checksum(sblock);
		if (!corrupted && sblock->sctx->is_dev_replace)
			scrub_write_block_to_dev_replace(sblock);
	}

	if (sblock->sparity && corrupted && !sblock->data_corrected) {
		/* byte range covered by the block: first page .. last page */
		u64 start = sblock->pagev[0]->logical;
		u64 end = sblock->pagev[sblock->page_count - 1]->logical +
			  PAGE_SIZE;

		scrub_parity_mark_sectors_error(sblock->sparity,
						start, end - start);
	}
}

/*
 * Look up the checksum for the sector at @logical in sctx->csum_list.
 *
 * Entries at the head of the list that end at or before @logical are
 * discarded (and counted in stat.csum_discards). On a hit the checksum is
 * copied to @csum; if it belonged to the last sector covered by the entry,
 * the entry is removed from the list and freed.
 *
 * Returns 1 if a checksum was found, 0 otherwise.
 */
static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
{
	struct btrfs_ordered_sum *sum = NULL;
	unsigned long index;
	unsigned long num_sectors;

	while (!list_empty(&sctx->csum_list)) {
		sum = list_first_entry(&sctx->csum_list,
				       struct btrfs_ordered_sum, list);
		/* first entry starts behind @logical: nothing covers it */
		if (sum->bytenr > logical)
			return 0;
		if (sum->bytenr + sum->len > logical)
			break;

		++sctx->stat.csum_discards;
		list_del(&sum->list);
		kfree(sum);
		sum = NULL;
	}
	if (!sum)
		return 0;

	/* sector index of @logical within the entry */
	index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
	ASSERT(index < UINT_MAX);

	num_sectors = sum->len / sctx->fs_info->sectorsize;
	memcpy(csum, sum->sums + index, sctx->csum_size);
	if (index == num_sectors - 1) {
		list_del(&sum->list);
		kfree(sum);
	}
	return 1;
}

/*
 * scrub extent tries to collect up to 64 kB for each bio
 *
 * The range [@logical, @logical + @len) is accounted in the data or tree
 * statistics according to @flags and then passed to scrub_pages() in pieces
 * of one block: map->stripe_len for RAID56 chunks, otherwise sectorsize for
 * data and nodesize for tree blocks. For data, the checksum of each piece is
 * looked up first and handed on if one exists.
 *
 * Returns 0 on success or the first error returned for a piece.
 */
static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
			u64 logical, u64 len,
			u64 physical, struct btrfs_device *dev, u64 flags,
			u64 gen, int mirror_num, u64 physical_for_dev_replace)
{
	int ret;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 blocksize;

	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			blocksize = map->stripe_len;
		else
			blocksize = sctx->fs_info->sectorsize;
		spin_lock(&sctx->stat_lock);
		sctx->stat.data_extents_scrubbed++;
sctx->stat.data_bytes_scrubbed += len;
		spin_unlock(&sctx->stat_lock);
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			blocksize = map->stripe_len;
		else
			blocksize = sctx->fs_info->nodesize;
		spin_lock(&sctx->stat_lock);
		sctx->stat.tree_extents_scrubbed++;
		sctx->stat.tree_bytes_scrubbed += len;
		spin_unlock(&sctx->stat_lock);
	} else {
		/* neither data nor tree block: unexpected, warn but go on */
		blocksize = sctx->fs_info->sectorsize;
		WARN_ON(1);
	}

	/* walk the range one block at a time */
	while (len) {
		u64 l = min_t(u64, len, blocksize);
		int have_csum = 0;

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sctx, logical, csum);
			if (have_csum == 0)
				++sctx->stat.no_csum;
			/*
			 * The constant 0 keeps this nocow copy path disabled;
			 * data without checksum goes through scrub_pages()
			 * like everything else.
			 */
			if (0 && sctx->is_dev_replace && !have_csum) {
				ret = copy_nocow_pages(sctx, logical, l,
						       mirror_num,
						      physical_for_dev_replace);
				goto behind_scrub_pages;
			}
		}
		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
				  mirror_num, have_csum ?
csum : NULL, 0,
				  physical_for_dev_replace);
behind_scrub_pages:
		if (ret)
			return ret;
		len -= l;
		logical += l;
		physical += l;
		physical_for_dev_replace += l;
	}
	return 0;
}

/*
 * Counterpart of scrub_pages() for data that is read while scrubbing a
 * parity stripe.
 *
 * A scrub_block holding a reference on @sparity is allocated with one
 * scrub_page per PAGE_SIZE piece of [@logical, @logical + @len). Each page is
 * referenced twice: once by the block and once by the sparity->spages list it
 * is linked into. The pages are then queued for reading.
 *
 * @csum: if not NULL, copied into every page and have_csum is set
 *
 * Returns 0 on success, -ENOMEM if an allocation fails, or the error
 * returned by scrub_add_page_to_rd_bio().
 */
static int scrub_pages_for_parity(struct scrub_parity *sparity,
				  u64 logical, u64 len,
				  u64 physical, struct btrfs_device *dev,
				  u64 flags, u64 gen, int mirror_num, u8 *csum)
{
	struct scrub_ctx *sctx = sparity->sctx;
	struct scrub_block *sblock;
	int index;

	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
	if (!sblock) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		return -ENOMEM;
	}

	/* one ref inside this function, plus one for each page added to
	 * a bio later on */
	refcount_set(&sblock->refs, 1);
	sblock->sctx = sctx;
	sblock->no_io_error_seen = 1;
	sblock->sparity = sparity;
	/* released again when the block is freed, see scrub_block_put() */
	scrub_parity_get(sparity);

	for (index = 0; len > 0; index++) {
		struct scrub_page *spage;
		u64 l = min_t(u64, len, PAGE_SIZE);

		spage = kzalloc(sizeof(*spage), GFP_KERNEL);
		if (!spage) {
leave_nomem:
			/* also the target of a failed alloc_page() below */
			spin_lock(&sctx->stat_lock);
			sctx->stat.malloc_errors++;
			spin_unlock(&sctx->stat_lock);
			scrub_block_put(sblock);
			return -ENOMEM;
		}
		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
		/* For scrub block */
		scrub_page_get(spage);
		sblock->pagev[index] = spage;
		/* For scrub parity */
		scrub_page_get(spage);
		list_add_tail(&spage->list, &sparity->spages);
		spage->sblock = sblock;
		spage->dev = dev;
		spage->flags = flags;
		spage->generation = gen;
		spage->logical = logical;
		spage->physical = physical;
		spage->mirror_num = mirror_num;
		if (csum) {
			spage->have_csum = 1;
			memcpy(spage->csum, csum, sctx->csum_size);
		} else {
			spage->have_csum = 0;
		}
		sblock->page_count++;
spage->page = alloc_page(GFP_KERNEL);
		if (!spage->page)
			goto leave_nomem;
		len -= l;
		logical += l;
		physical += l;
	}

	WARN_ON(sblock->page_count == 0);
	for (index = 0; index < sblock->page_count; index++) {
		struct scrub_page *spage = sblock->pagev[index];
		int ret;

		ret = scrub_add_page_to_rd_bio(sctx, spage);
		if (ret) {
			scrub_block_put(sblock);
			return ret;
		}
	}

	/* last one frees, either here or in bio completion for last page */
	scrub_block_put(sblock);
	return 0;
}

/*
 * Read the extent range [@logical, @logical + @len) as part of a parity
 * scrub.
 *
 * If @dev is flagged as missing, nothing is read and the whole range is
 * marked in the error bitmap of @sparity. Otherwise the range is passed to
 * scrub_pages_for_parity() in pieces of sparity->stripe_len; data pieces for
 * which no checksum is found are skipped.
 *
 * Returns 0 on success or the first error of scrub_pages_for_parity().
 */
static int scrub_extent_for_parity(struct scrub_parity *sparity,
				   u64 logical, u64 len,
				   u64 physical, struct btrfs_device *dev,
				   u64 flags, u64 gen, int mirror_num)
{
	struct scrub_ctx *sctx = sparity->sctx;
	int ret;
	u8 csum[BTRFS_CSUM_SIZE];
	u32 blocksize;

	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
		scrub_parity_mark_sectors_error(sparity, logical, len);
		return 0;
	}

	if (flags & BTRFS_EXTENT_FLAG_DATA) {
		blocksize = sparity->stripe_len;
	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		blocksize = sparity->stripe_len;
	} else {
		/* neither data nor tree block: unexpected, warn but go on */
		blocksize = sctx->fs_info->sectorsize;
		WARN_ON(1);
	}

	while (len) {
		u64 l = min_t(u64, len, blocksize);
		int have_csum = 0;

		if (flags & BTRFS_EXTENT_FLAG_DATA) {
			/* push csums to sbio */
			have_csum = scrub_find_csum(sctx, logical, csum);
			if (have_csum == 0)
				goto skip;
		}
		ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
					     flags, gen, mirror_num,
					     have_csum ? csum : NULL);
		if (ret)
			return ret;
skip:
		len -= l;
		logical += l;
		physical += l;
	}
	return 0;
}

/*
 * Given a physical address, this will calculate its
 * logical offset.
If this is a parity stripe, it will return
 * the leftmost data stripe's logical offset.
 *
 * @num is the index of the stripe in @map the physical address belongs to.
 * @stripe_start, if not NULL, receives the offset computed before the
 * per-data-stripe adjustment.
 *
 * return 0 if it is a data stripe, 1 means parity stripe.
 */
static int get_raid56_logic_offset(u64 physical, int num,
				   struct map_lookup *map, u64 *offset,
				   u64 *stripe_start)
{
	int i;
	int j = 0;
	u64 stripe_nr;
	u64 last_offset;
	u32 stripe_index;
	u32 rot;

	last_offset = (physical - map->stripes[num].physical) *
		      nr_data_stripes(map);
	if (stripe_start)
		*stripe_start = last_offset;

	*offset = last_offset;
	/* try each data stripe position until one lands on stripe @num */
	for (i = 0; i < nr_data_stripes(map); i++) {
		*offset = last_offset + i * map->stripe_len;

		stripe_nr = div64_u64(*offset, map->stripe_len);
		stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));

		/* Work out the disk rotation on this stripe-set */
		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
		/* calculate which stripe this data locates */
		rot += i;
		stripe_index = rot % map->num_stripes;
		if (stripe_index == num)
			return 0;
		/* count the data stripes located before stripe @num */
		if (stripe_index < num)
			j++;
	}
	*offset = last_offset + j * map->stripe_len;
	return 1;
}

/*
 * Final teardown of a scrub_parity: every bit set in the error bitmap is
 * accounted as both a read error and an uncorrectable error, the references
 * on the pages linked into sparity->spages are dropped and @sparity is freed.
 */
static void scrub_free_parity(struct scrub_parity *sparity)
{
	struct scrub_ctx *sctx = sparity->sctx;
	struct scrub_page *curr, *next;
	int nbits;

	nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
	if (nbits) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors += nbits;
		sctx->stat.uncorrectable_errors += nbits;
		spin_unlock(&sctx->stat_lock);
	}

	list_for_each_entry_safe(curr, next, &sparity->spages, list) {
		list_del_init(&curr->list);
		scrub_page_put(curr);
	}

	kfree(sparity);
}

/*
 * Work function queued by scrub_parity_bio_endio(): frees the scrub_parity
 * and drops the pending-bio count taken when the parity rbio was submitted.
 */
static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
{
	struct scrub_parity *sparity = container_of(work, struct scrub_parity,
						    work);
	struct
scrub_ctx *sctx = sparity->sctx;

	/* sctx was fetched above because sparity is freed here */
	scrub_free_parity(sparity);
	scrub_pending_bio_dec(sctx);
}

/*
 * Completion handler of the parity scrub bio set up in
 * scrub_parity_check_and_repair(). On an I/O error all sectors of the data
 * bitmap are added to the error bitmap. The teardown is deferred to
 * scrub_parity_bio_endio_worker() on the scrub parity workers.
 */
static void scrub_parity_bio_endio(struct bio *bio)
{
	struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
	struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;

	if (bio->bi_status)
		bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
			  sparity->nsectors);

	bio_put(bio);

	btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
			scrub_parity_bio_endio_worker, NULL, NULL);
	btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
}

/*
 * Called when the last reference on @sparity is dropped (see
 * scrub_parity_put()).
 *
 * Sectors with errors are removed from the data bitmap. If bitmap_andnot()
 * returns 0 for the result, @sparity is freed without further I/O. Otherwise
 * [logic_start, logic_end) is mapped for writing and a parity scrub rbio
 * covering the remaining data sectors is submitted; @sparity is then freed
 * from the bio completion path.
 *
 * If the mapping or the rbio allocation fails, the remaining data sectors
 * are added to the error bitmap, malloc_errors is incremented and @sparity
 * is freed right away.
 */
static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
{
	struct scrub_ctx *sctx = sparity->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct bio *bio;
	struct btrfs_raid_bio *rbio;
	struct btrfs_bio *bbio = NULL;
	u64 length;
	int ret;

	/* dbitmap &= ~ebitmap */
	if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
			   sparity->nsectors))
		goto out;

	length = sparity->logic_end - sparity->logic_start;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
			       &length, &bbio);
	if (ret || !bbio || !bbio->raid_map)
		goto bbio_out;

	/* the bio carries no pages itself, it only transports the completion */
	bio = btrfs_io_bio_alloc(0);
	bio->bi_iter.bi_sector = sparity->logic_start >> 9;
	bio->bi_private = sparity;
	bio->bi_end_io = scrub_parity_bio_endio;

	rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
					      length, sparity->scrub_dev,
					      sparity->dbitmap,
					      sparity->nsectors);
	if (!rbio)
		goto rbio_out;

	/* dropped again in scrub_parity_bio_endio_worker() */
	scrub_pending_bio_inc(sctx);
	raid56_parity_submit_scrub_rbio(rbio);
	return;

rbio_out:
	bio_put(bio);
bbio_out:
	btrfs_bio_counter_dec(fs_info);
	btrfs_put_bbio(bbio);
	bitmap_or(sparity->ebitmap, sparity->ebitmap,
sparity->dbitmap,
		  sparity->nsectors);
	spin_lock(&sctx->stat_lock);
	sctx->stat.malloc_errors++;
	spin_unlock(&sctx->stat_lock);
out:
	scrub_free_parity(sparity);
}

/*
 * Size in bytes of a bitmap with @nsectors bits, rounded up to a whole
 * number of longs.
 */
static inline int scrub_calc_parity_bitmap_len(int nsectors)
{
	return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
}

/* Take a reference on @sparity. */
static void scrub_parity_get(struct scrub_parity *sparity)
{
	refcount_inc(&sparity->refs);
}

/*
 * Drop a reference on @sparity. Dropping the last one triggers
 * scrub_parity_check_and_repair(), which also takes care of freeing it.
 */
static void scrub_parity_put(struct scrub_parity *sparity)
{
	if (!refcount_dec_and_test(&sparity->refs))
		return;

	scrub_parity_check_and_repair(sparity);
}

/*
 * Scrub the data belonging to one parity stripe covering the logical range
 * [@logic_start, @logic_end).
 *
 * A scrub_parity with a data and an error bitmap (one bit per sector of a
 * stripe) is allocated. The extent tree is walked for extent and metadata
 * items overlapping the range; for each of them the sectors are marked in
 * the data bitmap and the data is read via scrub_extent_for_parity(). When
 * the last reference on the scrub_parity is dropped, the parity check itself
 * is started, see scrub_parity_put().
 *
 * @sdev: stored as sparity->scrub_dev and passed on to the parity scrub rbio
 * @path: caller-provided path used for the extent tree searches; it is
 *        released before returning
 *
 * Returns 0 on success or a negative errno.
 */
static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
						  struct map_lookup *map,
						  struct btrfs_device *sdev,
						  struct btrfs_path *path,
						  u64 logic_start,
						  u64 logic_end)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_root *csum_root = fs_info->csum_root;
	struct btrfs_extent_item *extent;
	struct btrfs_bio *bbio = NULL;
	u64 flags;
	int ret;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	u64 generation;
	u64 extent_logical;
	u64 extent_physical;
	u64 extent_len;
	u64 mapped_length;
	struct btrfs_device *extent_dev;
	struct scrub_parity *sparity;
	int nsectors;
	int bitmap_len;
	int extent_mirror_num;
	int stop_loop = 0;

	nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
	bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
	/* both bitmaps are allocated together with the structure */
	sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
			  GFP_NOFS);
	if (!sparity) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		return -ENOMEM;
	}

	sparity->stripe_len = map->stripe_len;
	sparity->nsectors = nsectors;
	sparity->sctx = sctx;
sparity->scrub_dev = sdev;
	sparity->logic_start = logic_start;
	sparity->logic_end = logic_end;
	/* initial reference, dropped at the end of this function */
	refcount_set(&sparity->refs, 1);
	INIT_LIST_HEAD(&sparity->spages);
	/* data bitmap first, error bitmap directly behind it */
	sparity->dbitmap = sparity->bitmap;
	sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;

	ret = 0;
	while (logic_start < logic_end) {
		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
			key.type = BTRFS_METADATA_ITEM_KEY;
		else
			key.type = BTRFS_EXTENT_ITEM_KEY;
		key.objectid = logic_start;
		key.offset = (u64)-1;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;

		if (ret > 0) {
			/*
			 * No exact match: step back to the previous extent
			 * item. If there is none (ret > 0 again), repeat the
			 * search to get the path back.
			 */
			ret = btrfs_previous_extent_item(root, path, 0);
			if (ret < 0)
				goto out;
			if (ret > 0) {
				btrfs_release_path(path);
				ret = btrfs_search_slot(NULL, root, &key,
							path, 0, 0);
				if (ret < 0)
					goto out;
			}
		}

		stop_loop = 0;
		while (1) {
			u64 bytes;

			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				/* end of this leaf, move on to the next one */
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;

				/* no more leaves */
				stop_loop = 1;
				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
			    key.type != BTRFS_METADATA_ITEM_KEY)
				goto next;

			/* metadata items have no length in the key offset */
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				bytes = fs_info->nodesize;
			else
				bytes = key.offset;

			/* extent ends before the current position */
			if (key.objectid + bytes <= logic_start)
				goto next;

			/* extent starts behind the range, we are done */
			if (key.objectid >= logic_end) {
				stop_loop = 1;
				break;
			}

			/* advance to the stripe the extent starts in */
			while (key.objectid >= logic_start + map->stripe_len)
				logic_start += map->stripe_len;

			extent = btrfs_item_ptr(l, slot,
						struct btrfs_extent_item);
			flags = btrfs_extent_flags(l, extent);
			generation = btrfs_extent_generation(l, extent);

			if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
			    (key.objectid
< logic_start || 3238 key.objectid + bytes > 3239 logic_start + map->stripe_len)) { 3240 btrfs_err(fs_info, 3241 "scrub: tree block %llu spanning stripes, ignored. logical=%llu", 3242 key.objectid, logic_start); 3243 spin_lock(&sctx->stat_lock); 3244 sctx->stat.uncorrectable_errors++; 3245 spin_unlock(&sctx->stat_lock); 3246 goto next; 3247 } 3248 again: 3249 extent_logical = key.objectid; 3250 extent_len = bytes; 3251 3252 if (extent_logical < logic_start) { 3253 extent_len -= logic_start - extent_logical; 3254 extent_logical = logic_start; 3255 } 3256 3257 if (extent_logical + extent_len > 3258 logic_start + map->stripe_len) 3259 extent_len = logic_start + map->stripe_len - 3260 extent_logical; 3261 3262 scrub_parity_mark_sectors_data(sparity, extent_logical, 3263 extent_len); 3264 3265 mapped_length = extent_len; 3266 bbio = NULL; 3267 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, 3268 extent_logical, &mapped_length, &bbio, 3269 0); 3270 if (!ret) { 3271 if (!bbio || mapped_length < extent_len) 3272 ret = -EIO; 3273 } 3274 if (ret) { 3275 btrfs_put_bbio(bbio); 3276 goto out; 3277 } 3278 extent_physical = bbio->stripes[0].physical; 3279 extent_mirror_num = bbio->mirror_num; 3280 extent_dev = bbio->stripes[0].dev; 3281 btrfs_put_bbio(bbio); 3282 3283 ret = btrfs_lookup_csums_range(csum_root, 3284 extent_logical, 3285 extent_logical + extent_len - 1, 3286 &sctx->csum_list, 1); 3287 if (ret) 3288 goto out; 3289 3290 ret = scrub_extent_for_parity(sparity, extent_logical, 3291 extent_len, 3292 extent_physical, 3293 extent_dev, flags, 3294 generation, 3295 extent_mirror_num); 3296 3297 scrub_free_csums(sctx); 3298 3299 if (ret) 3300 goto out; 3301 3302 if (extent_logical + extent_len < 3303 key.objectid + bytes) { 3304 logic_start += map->stripe_len; 3305 3306 if (logic_start >= logic_end) { 3307 stop_loop = 1; 3308 break; 3309 } 3310 3311 if (logic_start < key.objectid + bytes) { 3312 cond_resched(); 3313 goto again; 3314 } 3315 } 3316 next: 3317 path->slots[0]++; 
3318 } 3319 3320 btrfs_release_path(path); 3321 3322 if (stop_loop) 3323 break; 3324 3325 logic_start += map->stripe_len; 3326 } 3327 out: 3328 if (ret < 0) 3329 scrub_parity_mark_sectors_error(sparity, logic_start, 3330 logic_end - logic_start); 3331 scrub_parity_put(sparity); 3332 scrub_submit(sctx); 3333 mutex_lock(&sctx->wr_lock); 3334 scrub_wr_submit(sctx); 3335 mutex_unlock(&sctx->wr_lock); 3336 3337 btrfs_release_path(path); 3338 return ret < 0 ? ret : 0; 3339 } 3340 3341 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 3342 struct map_lookup *map, 3343 struct btrfs_device *scrub_dev, 3344 int num, u64 base, u64 length, 3345 int is_dev_replace) 3346 { 3347 struct btrfs_path *path, *ppath; 3348 struct btrfs_fs_info *fs_info = sctx->fs_info; 3349 struct btrfs_root *root = fs_info->extent_root; 3350 struct btrfs_root *csum_root = fs_info->csum_root; 3351 struct btrfs_extent_item *extent; 3352 struct blk_plug plug; 3353 u64 flags; 3354 int ret; 3355 int slot; 3356 u64 nstripes; 3357 struct extent_buffer *l; 3358 u64 physical; 3359 u64 logical; 3360 u64 logic_end; 3361 u64 physical_end; 3362 u64 generation; 3363 int mirror_num; 3364 struct reada_control *reada1; 3365 struct reada_control *reada2; 3366 struct btrfs_key key; 3367 struct btrfs_key key_end; 3368 u64 increment = map->stripe_len; 3369 u64 offset; 3370 u64 extent_logical; 3371 u64 extent_physical; 3372 u64 extent_len; 3373 u64 stripe_logical; 3374 u64 stripe_end; 3375 struct btrfs_device *extent_dev; 3376 int extent_mirror_num; 3377 int stop_loop = 0; 3378 3379 physical = map->stripes[num].physical; 3380 offset = 0; 3381 nstripes = div64_u64(length, map->stripe_len); 3382 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3383 offset = map->stripe_len * num; 3384 increment = map->stripe_len * map->num_stripes; 3385 mirror_num = 1; 3386 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3387 int factor = map->num_stripes / map->sub_stripes; 3388 offset = map->stripe_len * (num / 
map->sub_stripes); 3389 increment = map->stripe_len * factor; 3390 mirror_num = num % map->sub_stripes + 1; 3391 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3392 increment = map->stripe_len; 3393 mirror_num = num % map->num_stripes + 1; 3394 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3395 increment = map->stripe_len; 3396 mirror_num = num % map->num_stripes + 1; 3397 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3398 get_raid56_logic_offset(physical, num, map, &offset, NULL); 3399 increment = map->stripe_len * nr_data_stripes(map); 3400 mirror_num = 1; 3401 } else { 3402 increment = map->stripe_len; 3403 mirror_num = 1; 3404 } 3405 3406 path = btrfs_alloc_path(); 3407 if (!path) 3408 return -ENOMEM; 3409 3410 ppath = btrfs_alloc_path(); 3411 if (!ppath) { 3412 btrfs_free_path(path); 3413 return -ENOMEM; 3414 } 3415 3416 /* 3417 * work on commit root. The related disk blocks are static as 3418 * long as COW is applied. This means, it is save to rewrite 3419 * them to repair disk errors without any race conditions 3420 */ 3421 path->search_commit_root = 1; 3422 path->skip_locking = 1; 3423 3424 ppath->search_commit_root = 1; 3425 ppath->skip_locking = 1; 3426 /* 3427 * trigger the readahead for extent tree csum tree and wait for 3428 * completion. 
During readahead, the scrub is officially paused 3429 * to not hold off transaction commits 3430 */ 3431 logical = base + offset; 3432 physical_end = physical + nstripes * map->stripe_len; 3433 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3434 get_raid56_logic_offset(physical_end, num, 3435 map, &logic_end, NULL); 3436 logic_end += base; 3437 } else { 3438 logic_end = logical + increment * nstripes; 3439 } 3440 wait_event(sctx->list_wait, 3441 atomic_read(&sctx->bios_in_flight) == 0); 3442 scrub_blocked_if_needed(fs_info); 3443 3444 /* FIXME it might be better to start readahead at commit root */ 3445 key.objectid = logical; 3446 key.type = BTRFS_EXTENT_ITEM_KEY; 3447 key.offset = (u64)0; 3448 key_end.objectid = logic_end; 3449 key_end.type = BTRFS_METADATA_ITEM_KEY; 3450 key_end.offset = (u64)-1; 3451 reada1 = btrfs_reada_add(root, &key, &key_end); 3452 3453 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 3454 key.type = BTRFS_EXTENT_CSUM_KEY; 3455 key.offset = logical; 3456 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 3457 key_end.type = BTRFS_EXTENT_CSUM_KEY; 3458 key_end.offset = logic_end; 3459 reada2 = btrfs_reada_add(csum_root, &key, &key_end); 3460 3461 if (!IS_ERR(reada1)) 3462 btrfs_reada_wait(reada1); 3463 if (!IS_ERR(reada2)) 3464 btrfs_reada_wait(reada2); 3465 3466 3467 /* 3468 * collect all data csums for the stripe to avoid seeking during 3469 * the scrub. This might currently (crc32) end up to be about 1MB 3470 */ 3471 blk_start_plug(&plug); 3472 3473 /* 3474 * now find all extents for each stripe and scrub them 3475 */ 3476 ret = 0; 3477 while (physical < physical_end) { 3478 /* 3479 * canceled? 
3480 */ 3481 if (atomic_read(&fs_info->scrub_cancel_req) || 3482 atomic_read(&sctx->cancel_req)) { 3483 ret = -ECANCELED; 3484 goto out; 3485 } 3486 /* 3487 * check to see if we have to pause 3488 */ 3489 if (atomic_read(&fs_info->scrub_pause_req)) { 3490 /* push queued extents */ 3491 sctx->flush_all_writes = true; 3492 scrub_submit(sctx); 3493 mutex_lock(&sctx->wr_lock); 3494 scrub_wr_submit(sctx); 3495 mutex_unlock(&sctx->wr_lock); 3496 wait_event(sctx->list_wait, 3497 atomic_read(&sctx->bios_in_flight) == 0); 3498 sctx->flush_all_writes = false; 3499 scrub_blocked_if_needed(fs_info); 3500 } 3501 3502 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3503 ret = get_raid56_logic_offset(physical, num, map, 3504 &logical, 3505 &stripe_logical); 3506 logical += base; 3507 if (ret) { 3508 /* it is parity strip */ 3509 stripe_logical += base; 3510 stripe_end = stripe_logical + increment; 3511 ret = scrub_raid56_parity(sctx, map, scrub_dev, 3512 ppath, stripe_logical, 3513 stripe_end); 3514 if (ret) 3515 goto out; 3516 goto skip; 3517 } 3518 } 3519 3520 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 3521 key.type = BTRFS_METADATA_ITEM_KEY; 3522 else 3523 key.type = BTRFS_EXTENT_ITEM_KEY; 3524 key.objectid = logical; 3525 key.offset = (u64)-1; 3526 3527 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3528 if (ret < 0) 3529 goto out; 3530 3531 if (ret > 0) { 3532 ret = btrfs_previous_extent_item(root, path, 0); 3533 if (ret < 0) 3534 goto out; 3535 if (ret > 0) { 3536 /* there's no smaller item, so stick with the 3537 * larger one */ 3538 btrfs_release_path(path); 3539 ret = btrfs_search_slot(NULL, root, &key, 3540 path, 0, 0); 3541 if (ret < 0) 3542 goto out; 3543 } 3544 } 3545 3546 stop_loop = 0; 3547 while (1) { 3548 u64 bytes; 3549 3550 l = path->nodes[0]; 3551 slot = path->slots[0]; 3552 if (slot >= btrfs_header_nritems(l)) { 3553 ret = btrfs_next_leaf(root, path); 3554 if (ret == 0) 3555 continue; 3556 if (ret < 0) 3557 goto out; 3558 3559 stop_loop = 1; 
3560 break; 3561 } 3562 btrfs_item_key_to_cpu(l, &key, slot); 3563 3564 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3565 key.type != BTRFS_METADATA_ITEM_KEY) 3566 goto next; 3567 3568 if (key.type == BTRFS_METADATA_ITEM_KEY) 3569 bytes = fs_info->nodesize; 3570 else 3571 bytes = key.offset; 3572 3573 if (key.objectid + bytes <= logical) 3574 goto next; 3575 3576 if (key.objectid >= logical + map->stripe_len) { 3577 /* out of this device extent */ 3578 if (key.objectid >= logic_end) 3579 stop_loop = 1; 3580 break; 3581 } 3582 3583 extent = btrfs_item_ptr(l, slot, 3584 struct btrfs_extent_item); 3585 flags = btrfs_extent_flags(l, extent); 3586 generation = btrfs_extent_generation(l, extent); 3587 3588 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && 3589 (key.objectid < logical || 3590 key.objectid + bytes > 3591 logical + map->stripe_len)) { 3592 btrfs_err(fs_info, 3593 "scrub: tree block %llu spanning stripes, ignored. logical=%llu", 3594 key.objectid, logical); 3595 spin_lock(&sctx->stat_lock); 3596 sctx->stat.uncorrectable_errors++; 3597 spin_unlock(&sctx->stat_lock); 3598 goto next; 3599 } 3600 3601 again: 3602 extent_logical = key.objectid; 3603 extent_len = bytes; 3604 3605 /* 3606 * trim extent to this stripe 3607 */ 3608 if (extent_logical < logical) { 3609 extent_len -= logical - extent_logical; 3610 extent_logical = logical; 3611 } 3612 if (extent_logical + extent_len > 3613 logical + map->stripe_len) { 3614 extent_len = logical + map->stripe_len - 3615 extent_logical; 3616 } 3617 3618 extent_physical = extent_logical - logical + physical; 3619 extent_dev = scrub_dev; 3620 extent_mirror_num = mirror_num; 3621 if (is_dev_replace) 3622 scrub_remap_extent(fs_info, extent_logical, 3623 extent_len, &extent_physical, 3624 &extent_dev, 3625 &extent_mirror_num); 3626 3627 ret = btrfs_lookup_csums_range(csum_root, 3628 extent_logical, 3629 extent_logical + 3630 extent_len - 1, 3631 &sctx->csum_list, 1); 3632 if (ret) 3633 goto out; 3634 3635 ret = scrub_extent(sctx, map, 
extent_logical, extent_len, 3636 extent_physical, extent_dev, flags, 3637 generation, extent_mirror_num, 3638 extent_logical - logical + physical); 3639 3640 scrub_free_csums(sctx); 3641 3642 if (ret) 3643 goto out; 3644 3645 if (extent_logical + extent_len < 3646 key.objectid + bytes) { 3647 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3648 /* 3649 * loop until we find next data stripe 3650 * or we have finished all stripes. 3651 */ 3652 loop: 3653 physical += map->stripe_len; 3654 ret = get_raid56_logic_offset(physical, 3655 num, map, &logical, 3656 &stripe_logical); 3657 logical += base; 3658 3659 if (ret && physical < physical_end) { 3660 stripe_logical += base; 3661 stripe_end = stripe_logical + 3662 increment; 3663 ret = scrub_raid56_parity(sctx, 3664 map, scrub_dev, ppath, 3665 stripe_logical, 3666 stripe_end); 3667 if (ret) 3668 goto out; 3669 goto loop; 3670 } 3671 } else { 3672 physical += map->stripe_len; 3673 logical += increment; 3674 } 3675 if (logical < key.objectid + bytes) { 3676 cond_resched(); 3677 goto again; 3678 } 3679 3680 if (physical >= physical_end) { 3681 stop_loop = 1; 3682 break; 3683 } 3684 } 3685 next: 3686 path->slots[0]++; 3687 } 3688 btrfs_release_path(path); 3689 skip: 3690 logical += increment; 3691 physical += map->stripe_len; 3692 spin_lock(&sctx->stat_lock); 3693 if (stop_loop) 3694 sctx->stat.last_physical = map->stripes[num].physical + 3695 length; 3696 else 3697 sctx->stat.last_physical = physical; 3698 spin_unlock(&sctx->stat_lock); 3699 if (stop_loop) 3700 break; 3701 } 3702 out: 3703 /* push queued extents */ 3704 scrub_submit(sctx); 3705 mutex_lock(&sctx->wr_lock); 3706 scrub_wr_submit(sctx); 3707 mutex_unlock(&sctx->wr_lock); 3708 3709 blk_finish_plug(&plug); 3710 btrfs_free_path(path); 3711 btrfs_free_path(ppath); 3712 return ret < 0 ? 
ret : 0; 3713 } 3714 3715 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, 3716 struct btrfs_device *scrub_dev, 3717 u64 chunk_offset, u64 length, 3718 u64 dev_offset, 3719 struct btrfs_block_group_cache *cache, 3720 int is_dev_replace) 3721 { 3722 struct btrfs_fs_info *fs_info = sctx->fs_info; 3723 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 3724 struct map_lookup *map; 3725 struct extent_map *em; 3726 int i; 3727 int ret = 0; 3728 3729 read_lock(&map_tree->map_tree.lock); 3730 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 3731 read_unlock(&map_tree->map_tree.lock); 3732 3733 if (!em) { 3734 /* 3735 * Might have been an unused block group deleted by the cleaner 3736 * kthread or relocation. 3737 */ 3738 spin_lock(&cache->lock); 3739 if (!cache->removed) 3740 ret = -EINVAL; 3741 spin_unlock(&cache->lock); 3742 3743 return ret; 3744 } 3745 3746 map = em->map_lookup; 3747 if (em->start != chunk_offset) 3748 goto out; 3749 3750 if (em->len < length) 3751 goto out; 3752 3753 for (i = 0; i < map->num_stripes; ++i) { 3754 if (map->stripes[i].dev->bdev == scrub_dev->bdev && 3755 map->stripes[i].physical == dev_offset) { 3756 ret = scrub_stripe(sctx, map, scrub_dev, i, 3757 chunk_offset, length, 3758 is_dev_replace); 3759 if (ret) 3760 goto out; 3761 } 3762 } 3763 out: 3764 free_extent_map(em); 3765 3766 return ret; 3767 } 3768 3769 static noinline_for_stack 3770 int scrub_enumerate_chunks(struct scrub_ctx *sctx, 3771 struct btrfs_device *scrub_dev, u64 start, u64 end, 3772 int is_dev_replace) 3773 { 3774 struct btrfs_dev_extent *dev_extent = NULL; 3775 struct btrfs_path *path; 3776 struct btrfs_fs_info *fs_info = sctx->fs_info; 3777 struct btrfs_root *root = fs_info->dev_root; 3778 u64 length; 3779 u64 chunk_offset; 3780 int ret = 0; 3781 int ro_set; 3782 int slot; 3783 struct extent_buffer *l; 3784 struct btrfs_key key; 3785 struct btrfs_key found_key; 3786 struct btrfs_block_group_cache *cache; 3787 struct 
btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 3788 3789 path = btrfs_alloc_path(); 3790 if (!path) 3791 return -ENOMEM; 3792 3793 path->reada = READA_FORWARD; 3794 path->search_commit_root = 1; 3795 path->skip_locking = 1; 3796 3797 key.objectid = scrub_dev->devid; 3798 key.offset = 0ull; 3799 key.type = BTRFS_DEV_EXTENT_KEY; 3800 3801 while (1) { 3802 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3803 if (ret < 0) 3804 break; 3805 if (ret > 0) { 3806 if (path->slots[0] >= 3807 btrfs_header_nritems(path->nodes[0])) { 3808 ret = btrfs_next_leaf(root, path); 3809 if (ret < 0) 3810 break; 3811 if (ret > 0) { 3812 ret = 0; 3813 break; 3814 } 3815 } else { 3816 ret = 0; 3817 } 3818 } 3819 3820 l = path->nodes[0]; 3821 slot = path->slots[0]; 3822 3823 btrfs_item_key_to_cpu(l, &found_key, slot); 3824 3825 if (found_key.objectid != scrub_dev->devid) 3826 break; 3827 3828 if (found_key.type != BTRFS_DEV_EXTENT_KEY) 3829 break; 3830 3831 if (found_key.offset >= end) 3832 break; 3833 3834 if (found_key.offset < key.offset) 3835 break; 3836 3837 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 3838 length = btrfs_dev_extent_length(l, dev_extent); 3839 3840 if (found_key.offset + length <= start) 3841 goto skip; 3842 3843 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 3844 3845 /* 3846 * get a reference on the corresponding block group to prevent 3847 * the chunk from going away while we scrub it 3848 */ 3849 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3850 3851 /* some chunks are removed but not committed to disk yet, 3852 * continue scrubbing */ 3853 if (!cache) 3854 goto skip; 3855 3856 /* 3857 * we need call btrfs_inc_block_group_ro() with scrubs_paused, 3858 * to avoid deadlock caused by: 3859 * btrfs_inc_block_group_ro() 3860 * -> btrfs_wait_for_commit() 3861 * -> btrfs_commit_transaction() 3862 * -> btrfs_scrub_pause() 3863 */ 3864 scrub_pause_on(fs_info); 3865 ret = btrfs_inc_block_group_ro(fs_info, cache); 
3866 if (!ret && is_dev_replace) { 3867 /* 3868 * If we are doing a device replace wait for any tasks 3869 * that started dellaloc right before we set the block 3870 * group to RO mode, as they might have just allocated 3871 * an extent from it or decided they could do a nocow 3872 * write. And if any such tasks did that, wait for their 3873 * ordered extents to complete and then commit the 3874 * current transaction, so that we can later see the new 3875 * extent items in the extent tree - the ordered extents 3876 * create delayed data references (for cow writes) when 3877 * they complete, which will be run and insert the 3878 * corresponding extent items into the extent tree when 3879 * we commit the transaction they used when running 3880 * inode.c:btrfs_finish_ordered_io(). We later use 3881 * the commit root of the extent tree to find extents 3882 * to copy from the srcdev into the tgtdev, and we don't 3883 * want to miss any new extents. 3884 */ 3885 btrfs_wait_block_group_reservations(cache); 3886 btrfs_wait_nocow_writers(cache); 3887 ret = btrfs_wait_ordered_roots(fs_info, U64_MAX, 3888 cache->key.objectid, 3889 cache->key.offset); 3890 if (ret > 0) { 3891 struct btrfs_trans_handle *trans; 3892 3893 trans = btrfs_join_transaction(root); 3894 if (IS_ERR(trans)) 3895 ret = PTR_ERR(trans); 3896 else 3897 ret = btrfs_commit_transaction(trans); 3898 if (ret) { 3899 scrub_pause_off(fs_info); 3900 btrfs_put_block_group(cache); 3901 break; 3902 } 3903 } 3904 } 3905 scrub_pause_off(fs_info); 3906 3907 if (ret == 0) { 3908 ro_set = 1; 3909 } else if (ret == -ENOSPC) { 3910 /* 3911 * btrfs_inc_block_group_ro return -ENOSPC when it 3912 * failed in creating new chunk for metadata. 3913 * It is not a problem for scrub/replace, because 3914 * metadata are always cowed, and our scrub paused 3915 * commit_transactions. 
3916 */ 3917 ro_set = 0; 3918 } else { 3919 btrfs_warn(fs_info, 3920 "failed setting block group ro: %d", ret); 3921 btrfs_put_block_group(cache); 3922 break; 3923 } 3924 3925 btrfs_dev_replace_write_lock(&fs_info->dev_replace); 3926 dev_replace->cursor_right = found_key.offset + length; 3927 dev_replace->cursor_left = found_key.offset; 3928 dev_replace->item_needs_writeback = 1; 3929 btrfs_dev_replace_write_unlock(&fs_info->dev_replace); 3930 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, 3931 found_key.offset, cache, is_dev_replace); 3932 3933 /* 3934 * flush, submit all pending read and write bios, afterwards 3935 * wait for them. 3936 * Note that in the dev replace case, a read request causes 3937 * write requests that are submitted in the read completion 3938 * worker. Therefore in the current situation, it is required 3939 * that all write requests are flushed, so that all read and 3940 * write requests are really completed when bios_in_flight 3941 * changes to 0. 3942 */ 3943 sctx->flush_all_writes = true; 3944 scrub_submit(sctx); 3945 mutex_lock(&sctx->wr_lock); 3946 scrub_wr_submit(sctx); 3947 mutex_unlock(&sctx->wr_lock); 3948 3949 wait_event(sctx->list_wait, 3950 atomic_read(&sctx->bios_in_flight) == 0); 3951 3952 scrub_pause_on(fs_info); 3953 3954 /* 3955 * must be called before we decrease @scrub_paused. 3956 * make sure we don't block transaction commit while 3957 * we are waiting pending workers finished. 
3958 */ 3959 wait_event(sctx->list_wait, 3960 atomic_read(&sctx->workers_pending) == 0); 3961 sctx->flush_all_writes = false; 3962 3963 scrub_pause_off(fs_info); 3964 3965 btrfs_dev_replace_write_lock(&fs_info->dev_replace); 3966 dev_replace->cursor_left = dev_replace->cursor_right; 3967 dev_replace->item_needs_writeback = 1; 3968 btrfs_dev_replace_write_unlock(&fs_info->dev_replace); 3969 3970 if (ro_set) 3971 btrfs_dec_block_group_ro(cache); 3972 3973 /* 3974 * We might have prevented the cleaner kthread from deleting 3975 * this block group if it was already unused because we raced 3976 * and set it to RO mode first. So add it back to the unused 3977 * list, otherwise it might not ever be deleted unless a manual 3978 * balance is triggered or it becomes used and unused again. 3979 */ 3980 spin_lock(&cache->lock); 3981 if (!cache->removed && !cache->ro && cache->reserved == 0 && 3982 btrfs_block_group_used(&cache->item) == 0) { 3983 spin_unlock(&cache->lock); 3984 spin_lock(&fs_info->unused_bgs_lock); 3985 if (list_empty(&cache->bg_list)) { 3986 btrfs_get_block_group(cache); 3987 trace_btrfs_add_unused_block_group(cache); 3988 list_add_tail(&cache->bg_list, 3989 &fs_info->unused_bgs); 3990 } 3991 spin_unlock(&fs_info->unused_bgs_lock); 3992 } else { 3993 spin_unlock(&cache->lock); 3994 } 3995 3996 btrfs_put_block_group(cache); 3997 if (ret) 3998 break; 3999 if (is_dev_replace && 4000 atomic64_read(&dev_replace->num_write_errors) > 0) { 4001 ret = -EIO; 4002 break; 4003 } 4004 if (sctx->stat.malloc_errors > 0) { 4005 ret = -ENOMEM; 4006 break; 4007 } 4008 skip: 4009 key.offset = found_key.offset + length; 4010 btrfs_release_path(path); 4011 } 4012 4013 btrfs_free_path(path); 4014 4015 return ret; 4016 } 4017 4018 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, 4019 struct btrfs_device *scrub_dev) 4020 { 4021 int i; 4022 u64 bytenr; 4023 u64 gen; 4024 int ret; 4025 struct btrfs_fs_info *fs_info = sctx->fs_info; 4026 4027 if 
(test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 4028 return -EIO; 4029 4030 /* Seed devices of a new filesystem has their own generation. */ 4031 if (scrub_dev->fs_devices != fs_info->fs_devices) 4032 gen = scrub_dev->generation; 4033 else 4034 gen = fs_info->last_trans_committed; 4035 4036 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 4037 bytenr = btrfs_sb_offset(i); 4038 if (bytenr + BTRFS_SUPER_INFO_SIZE > 4039 scrub_dev->commit_total_bytes) 4040 break; 4041 4042 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 4043 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, 4044 NULL, 1, bytenr); 4045 if (ret) 4046 return ret; 4047 } 4048 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 4049 4050 return 0; 4051 } 4052 4053 /* 4054 * get a reference count on fs_info->scrub_workers. start worker if necessary 4055 */ 4056 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, 4057 int is_dev_replace) 4058 { 4059 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; 4060 int max_active = fs_info->thread_pool_size; 4061 4062 if (fs_info->scrub_workers_refcnt == 0) { 4063 fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", 4064 flags, is_dev_replace ? 
1 : max_active, 4); 4065 if (!fs_info->scrub_workers) 4066 goto fail_scrub_workers; 4067 4068 fs_info->scrub_wr_completion_workers = 4069 btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, 4070 max_active, 2); 4071 if (!fs_info->scrub_wr_completion_workers) 4072 goto fail_scrub_wr_completion_workers; 4073 4074 fs_info->scrub_nocow_workers = 4075 btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0); 4076 if (!fs_info->scrub_nocow_workers) 4077 goto fail_scrub_nocow_workers; 4078 fs_info->scrub_parity_workers = 4079 btrfs_alloc_workqueue(fs_info, "scrubparity", flags, 4080 max_active, 2); 4081 if (!fs_info->scrub_parity_workers) 4082 goto fail_scrub_parity_workers; 4083 } 4084 ++fs_info->scrub_workers_refcnt; 4085 return 0; 4086 4087 fail_scrub_parity_workers: 4088 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); 4089 fail_scrub_nocow_workers: 4090 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); 4091 fail_scrub_wr_completion_workers: 4092 btrfs_destroy_workqueue(fs_info->scrub_workers); 4093 fail_scrub_workers: 4094 return -ENOMEM; 4095 } 4096 4097 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) 4098 { 4099 if (--fs_info->scrub_workers_refcnt == 0) { 4100 btrfs_destroy_workqueue(fs_info->scrub_workers); 4101 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); 4102 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); 4103 btrfs_destroy_workqueue(fs_info->scrub_parity_workers); 4104 } 4105 WARN_ON(fs_info->scrub_workers_refcnt < 0); 4106 } 4107 4108 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, 4109 u64 end, struct btrfs_scrub_progress *progress, 4110 int readonly, int is_dev_replace) 4111 { 4112 struct scrub_ctx *sctx; 4113 int ret; 4114 struct btrfs_device *dev; 4115 struct rcu_string *name; 4116 4117 if (btrfs_fs_closing(fs_info)) 4118 return -EINVAL; 4119 4120 if (fs_info->nodesize > BTRFS_STRIPE_LEN) { 4121 /* 4122 * in this case scrub is unable to calculate the checksum 
4123 * the way scrub is implemented. Do not handle this 4124 * situation at all because it won't ever happen. 4125 */ 4126 btrfs_err(fs_info, 4127 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails", 4128 fs_info->nodesize, 4129 BTRFS_STRIPE_LEN); 4130 return -EINVAL; 4131 } 4132 4133 if (fs_info->sectorsize != PAGE_SIZE) { 4134 /* not supported for data w/o checksums */ 4135 btrfs_err_rl(fs_info, 4136 "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails", 4137 fs_info->sectorsize, PAGE_SIZE); 4138 return -EINVAL; 4139 } 4140 4141 if (fs_info->nodesize > 4142 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || 4143 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { 4144 /* 4145 * would exhaust the array bounds of pagev member in 4146 * struct scrub_block 4147 */ 4148 btrfs_err(fs_info, 4149 "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", 4150 fs_info->nodesize, 4151 SCRUB_MAX_PAGES_PER_BLOCK, 4152 fs_info->sectorsize, 4153 SCRUB_MAX_PAGES_PER_BLOCK); 4154 return -EINVAL; 4155 } 4156 4157 4158 mutex_lock(&fs_info->fs_devices->device_list_mutex); 4159 dev = btrfs_find_device(fs_info, devid, NULL, NULL); 4160 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && 4161 !is_dev_replace)) { 4162 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4163 return -ENODEV; 4164 } 4165 4166 if (!is_dev_replace && !readonly && 4167 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { 4168 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4169 rcu_read_lock(); 4170 name = rcu_dereference(dev->name); 4171 btrfs_err(fs_info, "scrub: device %s is not writable", 4172 name->str); 4173 rcu_read_unlock(); 4174 return -EROFS; 4175 } 4176 4177 mutex_lock(&fs_info->scrub_lock); 4178 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || 4179 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) { 4180 mutex_unlock(&fs_info->scrub_lock); 4181 
mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4182 return -EIO; 4183 } 4184 4185 btrfs_dev_replace_read_lock(&fs_info->dev_replace); 4186 if (dev->scrub_ctx || 4187 (!is_dev_replace && 4188 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { 4189 btrfs_dev_replace_read_unlock(&fs_info->dev_replace); 4190 mutex_unlock(&fs_info->scrub_lock); 4191 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4192 return -EINPROGRESS; 4193 } 4194 btrfs_dev_replace_read_unlock(&fs_info->dev_replace); 4195 4196 ret = scrub_workers_get(fs_info, is_dev_replace); 4197 if (ret) { 4198 mutex_unlock(&fs_info->scrub_lock); 4199 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4200 return ret; 4201 } 4202 4203 sctx = scrub_setup_ctx(dev, is_dev_replace); 4204 if (IS_ERR(sctx)) { 4205 mutex_unlock(&fs_info->scrub_lock); 4206 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4207 scrub_workers_put(fs_info); 4208 return PTR_ERR(sctx); 4209 } 4210 sctx->readonly = readonly; 4211 dev->scrub_ctx = sctx; 4212 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4213 4214 /* 4215 * checking @scrub_pause_req here, we can avoid 4216 * race between committing transaction and scrubbing. 4217 */ 4218 __scrub_blocked_if_needed(fs_info); 4219 atomic_inc(&fs_info->scrubs_running); 4220 mutex_unlock(&fs_info->scrub_lock); 4221 4222 if (!is_dev_replace) { 4223 /* 4224 * by holding device list mutex, we can 4225 * kick off writing super in log tree sync. 
4226 */ 4227 mutex_lock(&fs_info->fs_devices->device_list_mutex); 4228 ret = scrub_supers(sctx, dev); 4229 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4230 } 4231 4232 if (!ret) 4233 ret = scrub_enumerate_chunks(sctx, dev, start, end, 4234 is_dev_replace); 4235 4236 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 4237 atomic_dec(&fs_info->scrubs_running); 4238 wake_up(&fs_info->scrub_pause_wait); 4239 4240 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); 4241 4242 if (progress) 4243 memcpy(progress, &sctx->stat, sizeof(*progress)); 4244 4245 mutex_lock(&fs_info->scrub_lock); 4246 dev->scrub_ctx = NULL; 4247 scrub_workers_put(fs_info); 4248 mutex_unlock(&fs_info->scrub_lock); 4249 4250 scrub_put_ctx(sctx); 4251 4252 return ret; 4253 } 4254 4255 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) 4256 { 4257 mutex_lock(&fs_info->scrub_lock); 4258 atomic_inc(&fs_info->scrub_pause_req); 4259 while (atomic_read(&fs_info->scrubs_paused) != 4260 atomic_read(&fs_info->scrubs_running)) { 4261 mutex_unlock(&fs_info->scrub_lock); 4262 wait_event(fs_info->scrub_pause_wait, 4263 atomic_read(&fs_info->scrubs_paused) == 4264 atomic_read(&fs_info->scrubs_running)); 4265 mutex_lock(&fs_info->scrub_lock); 4266 } 4267 mutex_unlock(&fs_info->scrub_lock); 4268 } 4269 4270 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info) 4271 { 4272 atomic_dec(&fs_info->scrub_pause_req); 4273 wake_up(&fs_info->scrub_pause_wait); 4274 } 4275 4276 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 4277 { 4278 mutex_lock(&fs_info->scrub_lock); 4279 if (!atomic_read(&fs_info->scrubs_running)) { 4280 mutex_unlock(&fs_info->scrub_lock); 4281 return -ENOTCONN; 4282 } 4283 4284 atomic_inc(&fs_info->scrub_cancel_req); 4285 while (atomic_read(&fs_info->scrubs_running)) { 4286 mutex_unlock(&fs_info->scrub_lock); 4287 wait_event(fs_info->scrub_pause_wait, 4288 atomic_read(&fs_info->scrubs_running) == 0); 4289 mutex_lock(&fs_info->scrub_lock); 4290 
} 4291 atomic_dec(&fs_info->scrub_cancel_req); 4292 mutex_unlock(&fs_info->scrub_lock); 4293 4294 return 0; 4295 } 4296 4297 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, 4298 struct btrfs_device *dev) 4299 { 4300 struct scrub_ctx *sctx; 4301 4302 mutex_lock(&fs_info->scrub_lock); 4303 sctx = dev->scrub_ctx; 4304 if (!sctx) { 4305 mutex_unlock(&fs_info->scrub_lock); 4306 return -ENOTCONN; 4307 } 4308 atomic_inc(&sctx->cancel_req); 4309 while (dev->scrub_ctx) { 4310 mutex_unlock(&fs_info->scrub_lock); 4311 wait_event(fs_info->scrub_pause_wait, 4312 dev->scrub_ctx == NULL); 4313 mutex_lock(&fs_info->scrub_lock); 4314 } 4315 mutex_unlock(&fs_info->scrub_lock); 4316 4317 return 0; 4318 } 4319 4320 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, 4321 struct btrfs_scrub_progress *progress) 4322 { 4323 struct btrfs_device *dev; 4324 struct scrub_ctx *sctx = NULL; 4325 4326 mutex_lock(&fs_info->fs_devices->device_list_mutex); 4327 dev = btrfs_find_device(fs_info, devid, NULL, NULL); 4328 if (dev) 4329 sctx = dev->scrub_ctx; 4330 if (sctx) 4331 memcpy(progress, &sctx->stat, sizeof(*progress)); 4332 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4333 4334 return dev ? (sctx ? 
0 : -ENOTCONN) : -ENODEV; 4335 } 4336 4337 static void scrub_remap_extent(struct btrfs_fs_info *fs_info, 4338 u64 extent_logical, u64 extent_len, 4339 u64 *extent_physical, 4340 struct btrfs_device **extent_dev, 4341 int *extent_mirror_num) 4342 { 4343 u64 mapped_length; 4344 struct btrfs_bio *bbio = NULL; 4345 int ret; 4346 4347 mapped_length = extent_len; 4348 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical, 4349 &mapped_length, &bbio, 0); 4350 if (ret || !bbio || mapped_length < extent_len || 4351 !bbio->stripes[0].dev->bdev) { 4352 btrfs_put_bbio(bbio); 4353 return; 4354 } 4355 4356 *extent_physical = bbio->stripes[0].physical; 4357 *extent_mirror_num = bbio->mirror_num; 4358 *extent_dev = bbio->stripes[0].dev; 4359 btrfs_put_bbio(bbio); 4360 } 4361 4362 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 4363 int mirror_num, u64 physical_for_dev_replace) 4364 { 4365 struct scrub_copy_nocow_ctx *nocow_ctx; 4366 struct btrfs_fs_info *fs_info = sctx->fs_info; 4367 4368 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS); 4369 if (!nocow_ctx) { 4370 spin_lock(&sctx->stat_lock); 4371 sctx->stat.malloc_errors++; 4372 spin_unlock(&sctx->stat_lock); 4373 return -ENOMEM; 4374 } 4375 4376 scrub_pending_trans_workers_inc(sctx); 4377 4378 nocow_ctx->sctx = sctx; 4379 nocow_ctx->logical = logical; 4380 nocow_ctx->len = len; 4381 nocow_ctx->mirror_num = mirror_num; 4382 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 4383 btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper, 4384 copy_nocow_pages_worker, NULL, NULL); 4385 INIT_LIST_HEAD(&nocow_ctx->inodes); 4386 btrfs_queue_work(fs_info->scrub_nocow_workers, 4387 &nocow_ctx->work); 4388 4389 return 0; 4390 } 4391 4392 static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx) 4393 { 4394 struct scrub_copy_nocow_ctx *nocow_ctx = ctx; 4395 struct scrub_nocow_inode *nocow_inode; 4396 4397 nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS); 4398 if 
(!nocow_inode) 4399 return -ENOMEM; 4400 nocow_inode->inum = inum; 4401 nocow_inode->offset = offset; 4402 nocow_inode->root = root; 4403 list_add_tail(&nocow_inode->list, &nocow_ctx->inodes); 4404 return 0; 4405 } 4406 4407 #define COPY_COMPLETE 1 4408 4409 static void copy_nocow_pages_worker(struct btrfs_work *work) 4410 { 4411 struct scrub_copy_nocow_ctx *nocow_ctx = 4412 container_of(work, struct scrub_copy_nocow_ctx, work); 4413 struct scrub_ctx *sctx = nocow_ctx->sctx; 4414 struct btrfs_fs_info *fs_info = sctx->fs_info; 4415 struct btrfs_root *root = fs_info->extent_root; 4416 u64 logical = nocow_ctx->logical; 4417 u64 len = nocow_ctx->len; 4418 int mirror_num = nocow_ctx->mirror_num; 4419 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 4420 int ret; 4421 struct btrfs_trans_handle *trans = NULL; 4422 struct btrfs_path *path; 4423 int not_written = 0; 4424 4425 path = btrfs_alloc_path(); 4426 if (!path) { 4427 spin_lock(&sctx->stat_lock); 4428 sctx->stat.malloc_errors++; 4429 spin_unlock(&sctx->stat_lock); 4430 not_written = 1; 4431 goto out; 4432 } 4433 4434 trans = btrfs_join_transaction(root); 4435 if (IS_ERR(trans)) { 4436 not_written = 1; 4437 goto out; 4438 } 4439 4440 ret = iterate_inodes_from_logical(logical, fs_info, path, 4441 record_inode_for_nocow, nocow_ctx, false); 4442 if (ret != 0 && ret != -ENOENT) { 4443 btrfs_warn(fs_info, 4444 "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d", 4445 logical, physical_for_dev_replace, len, mirror_num, 4446 ret); 4447 not_written = 1; 4448 goto out; 4449 } 4450 4451 btrfs_end_transaction(trans); 4452 trans = NULL; 4453 while (!list_empty(&nocow_ctx->inodes)) { 4454 struct scrub_nocow_inode *entry; 4455 entry = list_first_entry(&nocow_ctx->inodes, 4456 struct scrub_nocow_inode, 4457 list); 4458 list_del_init(&entry->list); 4459 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset, 4460 entry->root, nocow_ctx); 4461 kfree(entry); 4462 if (ret == 
COPY_COMPLETE) { 4463 ret = 0; 4464 break; 4465 } else if (ret) { 4466 break; 4467 } 4468 } 4469 out: 4470 while (!list_empty(&nocow_ctx->inodes)) { 4471 struct scrub_nocow_inode *entry; 4472 entry = list_first_entry(&nocow_ctx->inodes, 4473 struct scrub_nocow_inode, 4474 list); 4475 list_del_init(&entry->list); 4476 kfree(entry); 4477 } 4478 if (trans && !IS_ERR(trans)) 4479 btrfs_end_transaction(trans); 4480 if (not_written) 4481 btrfs_dev_replace_stats_inc(&fs_info->dev_replace. 4482 num_uncorrectable_read_errors); 4483 4484 btrfs_free_path(path); 4485 kfree(nocow_ctx); 4486 4487 scrub_pending_trans_workers_dec(sctx); 4488 } 4489 4490 static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len, 4491 u64 logical) 4492 { 4493 struct extent_state *cached_state = NULL; 4494 struct btrfs_ordered_extent *ordered; 4495 struct extent_io_tree *io_tree; 4496 struct extent_map *em; 4497 u64 lockstart = start, lockend = start + len - 1; 4498 int ret = 0; 4499 4500 io_tree = &inode->io_tree; 4501 4502 lock_extent_bits(io_tree, lockstart, lockend, &cached_state); 4503 ordered = btrfs_lookup_ordered_range(inode, lockstart, len); 4504 if (ordered) { 4505 btrfs_put_ordered_extent(ordered); 4506 ret = 1; 4507 goto out_unlock; 4508 } 4509 4510 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 4511 if (IS_ERR(em)) { 4512 ret = PTR_ERR(em); 4513 goto out_unlock; 4514 } 4515 4516 /* 4517 * This extent does not actually cover the logical extent anymore, 4518 * move on to the next inode. 
4519 */ 4520 if (em->block_start > logical || 4521 em->block_start + em->block_len < logical + len || 4522 test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 4523 free_extent_map(em); 4524 ret = 1; 4525 goto out_unlock; 4526 } 4527 free_extent_map(em); 4528 4529 out_unlock: 4530 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state); 4531 return ret; 4532 } 4533 4534 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, 4535 struct scrub_copy_nocow_ctx *nocow_ctx) 4536 { 4537 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info; 4538 struct btrfs_key key; 4539 struct inode *inode; 4540 struct page *page; 4541 struct btrfs_root *local_root; 4542 struct extent_io_tree *io_tree; 4543 u64 physical_for_dev_replace; 4544 u64 nocow_ctx_logical; 4545 u64 len = nocow_ctx->len; 4546 unsigned long index; 4547 int srcu_index; 4548 int ret = 0; 4549 int err = 0; 4550 4551 key.objectid = root; 4552 key.type = BTRFS_ROOT_ITEM_KEY; 4553 key.offset = (u64)-1; 4554 4555 srcu_index = srcu_read_lock(&fs_info->subvol_srcu); 4556 4557 local_root = btrfs_read_fs_root_no_name(fs_info, &key); 4558 if (IS_ERR(local_root)) { 4559 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); 4560 return PTR_ERR(local_root); 4561 } 4562 4563 key.type = BTRFS_INODE_ITEM_KEY; 4564 key.objectid = inum; 4565 key.offset = 0; 4566 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); 4567 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); 4568 if (IS_ERR(inode)) 4569 return PTR_ERR(inode); 4570 4571 /* Avoid truncate/dio/punch hole.. */ 4572 inode_lock(inode); 4573 inode_dio_wait(inode); 4574 4575 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 4576 io_tree = &BTRFS_I(inode)->io_tree; 4577 nocow_ctx_logical = nocow_ctx->logical; 4578 4579 ret = check_extent_to_block(BTRFS_I(inode), offset, len, 4580 nocow_ctx_logical); 4581 if (ret) { 4582 ret = ret > 0 ? 
0 : ret; 4583 goto out; 4584 } 4585 4586 while (len >= PAGE_SIZE) { 4587 index = offset >> PAGE_SHIFT; 4588 again: 4589 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 4590 if (!page) { 4591 btrfs_err(fs_info, "find_or_create_page() failed"); 4592 ret = -ENOMEM; 4593 goto out; 4594 } 4595 4596 if (PageUptodate(page)) { 4597 if (PageDirty(page)) 4598 goto next_page; 4599 } else { 4600 ClearPageError(page); 4601 err = extent_read_full_page(io_tree, page, 4602 btrfs_get_extent, 4603 nocow_ctx->mirror_num); 4604 if (err) { 4605 ret = err; 4606 goto next_page; 4607 } 4608 4609 lock_page(page); 4610 /* 4611 * If the page has been remove from the page cache, 4612 * the data on it is meaningless, because it may be 4613 * old one, the new data may be written into the new 4614 * page in the page cache. 4615 */ 4616 if (page->mapping != inode->i_mapping) { 4617 unlock_page(page); 4618 put_page(page); 4619 goto again; 4620 } 4621 if (!PageUptodate(page)) { 4622 ret = -EIO; 4623 goto next_page; 4624 } 4625 } 4626 4627 ret = check_extent_to_block(BTRFS_I(inode), offset, len, 4628 nocow_ctx_logical); 4629 if (ret) { 4630 ret = ret > 0 ? 
0 : ret; 4631 goto next_page; 4632 } 4633 4634 err = write_page_nocow(nocow_ctx->sctx, 4635 physical_for_dev_replace, page); 4636 if (err) 4637 ret = err; 4638 next_page: 4639 unlock_page(page); 4640 put_page(page); 4641 4642 if (ret) 4643 break; 4644 4645 offset += PAGE_SIZE; 4646 physical_for_dev_replace += PAGE_SIZE; 4647 nocow_ctx_logical += PAGE_SIZE; 4648 len -= PAGE_SIZE; 4649 } 4650 ret = COPY_COMPLETE; 4651 out: 4652 inode_unlock(inode); 4653 iput(inode); 4654 return ret; 4655 } 4656 4657 static int write_page_nocow(struct scrub_ctx *sctx, 4658 u64 physical_for_dev_replace, struct page *page) 4659 { 4660 struct bio *bio; 4661 struct btrfs_device *dev; 4662 4663 dev = sctx->wr_tgtdev; 4664 if (!dev) 4665 return -EIO; 4666 if (!dev->bdev) { 4667 btrfs_warn_rl(dev->fs_info, 4668 "scrub write_page_nocow(bdev == NULL) is unexpected"); 4669 return -EIO; 4670 } 4671 bio = btrfs_io_bio_alloc(1); 4672 bio->bi_iter.bi_size = 0; 4673 bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; 4674 bio_set_dev(bio, dev->bdev); 4675 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; 4676 /* bio_add_page won't fail on a freshly allocated bio */ 4677 bio_add_page(bio, page, PAGE_SIZE, 0); 4678 4679 if (btrfsic_submit_bio_wait(bio)) { 4680 bio_put(bio); 4681 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 4682 return -EIO; 4683 } 4684 4685 bio_put(bio); 4686 return 0; 4687 } 4688