/*
 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_block;
struct scrub_ctx;

/*
 * The following three values only influence performance.
 * The last one configures the number of parallel and outstanding I/O
 * operations. The first two values configure an upper limit for the number
 * of (dynamically allocated) pages that are added to a bio.
 */
#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */

/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
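/*
 * Worked example of how these limits combine (illustrative only, assuming
 * the common PAGE_SIZE of 4KiB):
 *
 *	pages per bio:     32 * 4KiB   = 128KiB per read/write bio
 *	bios per context:  64 * 128KiB = 8MiB in flight per device
 *	pages per block:   16 * 4KiB   = 64KiB, enough for the largest
 *	                                 supported nodesize
 *
 * On architectures with larger pages the byte values scale accordingly; the
 * comments above quote the 4KiB case.
 */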
struct scrub_recover {
	refcount_t		refs;
	struct btrfs_bio	*bbio;
	u64			map_length;
};

struct scrub_page {
	struct scrub_block	*sblock;
	struct page		*page;
	struct btrfs_device	*dev;
	struct list_head	list;
	u64			flags;  /* extent flags */
	u64			generation;
	u64			logical;
	u64			physical;
	u64			physical_for_dev_replace;
	atomic_t		refs;
	struct {
		unsigned int	mirror_num:8;
		unsigned int	have_csum:1;
		unsigned int	io_error:1;
	};
	u8			csum[BTRFS_CSUM_SIZE];

	struct scrub_recover	*recover;
};

struct scrub_bio {
	int			index;
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	struct bio		*bio;
	blk_status_t		status;
	u64			logical;
	u64			physical;
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
#else
	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
#endif
	int			page_count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_block {
	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
	int			page_count;
	atomic_t		outstanding_pages;
	refcount_t		refs; /* free mem on transition to zero */
	struct scrub_ctx	*sctx;
	struct scrub_parity	*sparity;
	struct {
		unsigned int	header_error:1;
		unsigned int	checksum_error:1;
		unsigned int	no_io_error_seen:1;
		unsigned int	generation_error:1; /* also sets header_error */

		/* The following is for the data used to check parity */
		/* It is for the data with checksum */
		unsigned int	data_corrected:1;
	};
	struct btrfs_work	work;
};

/* Used for the chunks with parity stripes, such as RAID5/6 */
struct scrub_parity {
	struct scrub_ctx	*sctx;

	struct btrfs_device	*scrub_dev;

	u64			logic_start;

	u64			logic_end;

	int			nsectors;

	u64			stripe_len;

	refcount_t		refs;

	struct list_head	spages;

	/* Work of parity check and repair */
	struct btrfs_work	work;

	/* Mark the parity blocks which have data */
	unsigned long		*dbitmap;

	/*
	 * Mark the parity blocks which have data, but errors happened when
	 * reading or checking the data
	 */
	unsigned long		*ebitmap;

	unsigned long		bitmap[0];
};

struct scrub_ctx {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
	struct btrfs_fs_info	*fs_info;
	int			first_free;
	int			curr;
	atomic_t		bios_in_flight;
	atomic_t		workers_pending;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	int			pages_per_rd_bio;

	int			is_dev_replace;

	struct scrub_bio	*wr_curr_bio;
	struct mutex		wr_lock;
	int			pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
	struct btrfs_device	*wr_tgtdev;
	bool			flush_all_writes;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t		refs;
};
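/*
 * Illustrative note (not from the original source): the bios[] array above is
 * threaded into a simple free list through its index fields, roughly
 *
 *	sctx->first_free = 0;
 *	sctx->bios[i]->next_free = (i < SCRUB_BIOS_PER_SCTX - 1) ? i + 1 : -1;
 *
 * scrub_setup_ctx() builds this chain, and sctx->curr marks the bio currently
 * being filled (-1 when none), which is also what scrub_free_ctx() checks
 * when a cancelled scrub leaves a partially filled bio behind.
 */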

struct scrub_fixup_nodatasum {
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	u64			logical;
	struct btrfs_root	*root;
	struct btrfs_work	work;
	int			mirror_num;
};

struct scrub_nocow_inode {
	u64			inum;
	u64			offset;
	u64			root;
	struct list_head	list;
};

struct scrub_copy_nocow_ctx {
	struct scrub_ctx	*sctx;
	u64			logical;
	u64			len;
	int			mirror_num;
	u64			physical_for_dev_replace;
	struct list_head	inodes;
	struct btrfs_work	work;
};

struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	u64			physical;
	u64			logical;
	struct btrfs_device	*dev;
};

struct full_stripe_lock {
	struct rb_node node;
	u64 logical;
	u64 refs;
	struct mutex mutex;
};

static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				     struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock,
				int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum, int force,
		       u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
			       u64 extent_logical, u64 extent_len,
			       u64 *extent_physical,
			       struct btrfs_device **extent_dev,
			       int *extent_mirror_num);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx,
			    u64 physical_for_dev_replace, struct page *page);
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
				      struct scrub_copy_nocow_ctx *ctx);
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
			    int mirror_num, u64 physical_for_dev_replace);
static void copy_nocow_pages_worker(struct btrfs_work *work);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_put_ctx(struct scrub_ctx *sctx);

static inline int scrub_is_page_on_raid56(struct scrub_page *page)
{
	return page->recover &&
	       (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}

static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
	refcount_inc(&sctx->refs);
	atomic_inc(&sctx->bios_in_flight);
}

static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
	atomic_dec(&sctx->bios_in_flight);
	wake_up(&sctx->list_wait);
	scrub_put_ctx(sctx);
}

static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}
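/*
 * Illustrative usage sketch (an assumption based on the helpers above, not an
 * original comment): callers that may block for a long time, e.g. while
 * waiting for a transaction commit, bracket that wait with
 *
 *	scrub_pause_on(fs_info);
 *	...                      (sleep / wait without holding scrub_lock)
 *	scrub_pause_off(fs_info);
 *
 * so that a pending scrub pause request sees scrubs_paused raised and does
 * not deadlock against this scrub thread. scrub_blocked_if_needed() is the
 * trivial pause-and-immediately-unpause form of the same protocol.
 */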

/*
 * Insert new full stripe lock into full stripe locks tree
 *
 * Return pointer to existing or newly inserted full_stripe_lock structure if
 * everything works well.
 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
 *
 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
 * function
 */
static struct full_stripe_lock *insert_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct full_stripe_lock *entry;
	struct full_stripe_lock *ret;

	WARN_ON(!mutex_is_locked(&locks_root->lock));

	p = &locks_root->root.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical) {
			p = &(*p)->rb_left;
		} else if (fstripe_logical > entry->logical) {
			p = &(*p)->rb_right;
		} else {
			entry->refs++;
			return entry;
		}
	}

	/* Insert new lock */
	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
	if (!ret)
		return ERR_PTR(-ENOMEM);
	ret->logical = fstripe_logical;
	ret->refs = 1;
	mutex_init(&ret->mutex);

	rb_link_node(&ret->node, parent, p);
	rb_insert_color(&ret->node, &locks_root->root);
	return ret;
}

/*
 * Search for a full stripe lock of a block group
 *
 * Return pointer to existing full stripe lock if found
 * Return NULL if not found
 */
static struct full_stripe_lock *search_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node *node;
	struct full_stripe_lock *entry;

	WARN_ON(!mutex_is_locked(&locks_root->lock));

	node = locks_root->root.rb_node;
	while (node) {
		entry = rb_entry(node, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical)
			node = node->rb_left;
		else if (fstripe_logical > entry->logical)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * Helper to get full stripe logical from a normal bytenr.
 *
 * Caller must ensure @cache is a RAID56 block group.
 */
static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
				   u64 bytenr)
{
	u64 ret;

	/*
	 * Due to chunk item size limit, full stripe length should not be
	 * larger than U32_MAX. Just a sanity check here.
	 */
	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);

	/*
	 * round_down() can only handle power of 2, while RAID56 full
	 * stripe length can be 64KiB * n, so we need to manually round down.
	 */
	ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
	      cache->full_stripe_len + cache->key.objectid;
	return ret;
}
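/*
 * Worked example with made-up numbers (not from the original source): for a
 * RAID5 chunk with three data stripes, full_stripe_len is 3 * 64KiB = 192KiB.
 * If the block group starts at objectid 1MiB and bytenr is 1MiB + 500KiB,
 * then (500KiB / 192KiB) truncates to 2, so the full stripe start is
 * 1MiB + 2 * 192KiB = 1MiB + 384KiB. A power-of-two round_down() would give
 * the wrong answer here, which is why the division is done by hand.
 */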

/*
 * Lock a full stripe to avoid concurrency of recovery and read
 *
 * It's only used for profiles with parities (RAID5/6); for other profiles it
 * does nothing.
 *
 * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
 * The caller must then call unlock_full_stripe() in the same context.
 *
 * Return <0 if we encounter an error.
 */
static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			    bool *locked_ret)
{
	struct btrfs_block_group_cache *bg_cache;
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *existing;
	u64 fstripe_start;
	int ret = 0;

	*locked_ret = false;
	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}

	/* Profiles not based on parity don't need full stripe lock */
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;
	locks_root = &bg_cache->full_stripe_locks_root;

	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	/* Now insert the full stripe lock */
	mutex_lock(&locks_root->lock);
	existing = insert_full_stripe_lock(locks_root, fstripe_start);
	mutex_unlock(&locks_root->lock);
	if (IS_ERR(existing)) {
		ret = PTR_ERR(existing);
		goto out;
	}
	mutex_lock(&existing->mutex);
	*locked_ret = true;
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}

/*
 * Unlock a full stripe.
 *
 * NOTE: Caller must ensure it's the same context calling corresponding
 * lock_full_stripe().
 *
 * Return 0 if we unlock full stripe without problem.
 * Return <0 for error
 */
static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			      bool locked)
{
	struct btrfs_block_group_cache *bg_cache;
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *fstripe_lock;
	u64 fstripe_start;
	bool freeit = false;
	int ret = 0;

	/* If we didn't acquire full stripe lock, no need to continue */
	if (!locked)
		return 0;

	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;

	locks_root = &bg_cache->full_stripe_locks_root;
	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	mutex_lock(&locks_root->lock);
	fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
	/* Unpaired unlock_full_stripe() detected */
	if (!fstripe_lock) {
		WARN_ON(1);
		ret = -ENOENT;
		mutex_unlock(&locks_root->lock);
		goto out;
	}

	if (fstripe_lock->refs == 0) {
		WARN_ON(1);
		btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
			   fstripe_lock->logical);
	} else {
		fstripe_lock->refs--;
	}

	if (fstripe_lock->refs == 0) {
		rb_erase(&fstripe_lock->node, &locks_root->root);
		freeit = true;
	}
	mutex_unlock(&locks_root->lock);

	mutex_unlock(&fstripe_lock->mutex);
	if (freeit)
		kfree(fstripe_lock);
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}

/*
 * used for workers that require transaction commits (i.e., for the
 * NOCOW case)
 */
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;

	refcount_inc(&sctx->refs);
	/*
	 * increment scrubs_running to prevent cancel requests from
	 * completing as long as a worker is running. we must also
	 * increment scrubs_paused to prevent deadlocking on pause
	 * requests used for transaction commits (as the worker uses a
	 * transaction context). it is safe to regard the worker
	 * as paused for all practical matters. effectively, we only
	 * avoid cancellation requests from completing.
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrubs_running);
	atomic_inc(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	/*
	 * The check of the @scrubs_running == @scrubs_paused condition
	 * inside wait_event() is not an atomic operation, which means we
	 * may inc/dec @scrubs_running/@scrubs_paused at any time. Wake up
	 * @scrub_pause_wait as often as we can so that a blocked
	 * transaction commit is delayed as little as possible.
	 */
	wake_up(&fs_info->scrub_pause_wait);

	atomic_inc(&sctx->workers_pending);
}

/* used for workers that require transaction commits */
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->fs_info;

	/*
	 * see scrub_pending_trans_workers_inc() for why we're pretending
	 * to be paused in the scrub counters
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_dec(&fs_info->scrubs_running);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	atomic_dec(&sctx->workers_pending);
	wake_up(&fs_info->scrub_pause_wait);
	wake_up(&sctx->list_wait);
	scrub_put_ctx(sctx);
}

static void scrub_free_csums(struct scrub_ctx *sctx)
{
	while (!list_empty(&sctx->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sctx->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	/* this can happen when scrub is cancelled */
	if (sctx->curr != -1) {
		struct scrub_bio *sbio = sctx->bios[sctx->curr];

		for (i = 0; i < sbio->page_count; i++) {
			WARN_ON(!sbio->pagev[i]->page);
			scrub_block_put(sbio->pagev[i]->sblock);
		}
		bio_put(sbio->bio);
	}

	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio = sctx->bios[i];

		if (!sbio)
			break;
		kfree(sbio);
	}

	kfree(sctx->wr_curr_bio);
	scrub_free_csums(sctx);
	kfree(sctx);
}

static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (refcount_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}

static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int i;
	struct btrfs_fs_info *fs_info = dev->fs_info;

	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
	if (!sctx)
		goto nomem;
	refcount_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
	sctx->curr = -1;
	sctx->fs_info = dev->fs_info;
	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
		if (!sbio)
			goto nomem;
		sctx->bios[i] = sbio;

		sbio->index = i;
		sbio->sctx = sctx;
		sbio->page_count = 0;
		btrfs_init_work(&sbio->work, btrfs_scrub_helper,
				scrub_bio_end_io_worker, NULL, NULL);

		if (i != SCRUB_BIOS_PER_SCTX - 1)
			sctx->bios[i]->next_free = i + 1;
		else
			sctx->bios[i]->next_free = -1;
	}
	sctx->first_free = 0;
	atomic_set(&sctx->bios_in_flight, 0);
	atomic_set(&sctx->workers_pending, 0);
	atomic_set(&sctx->cancel_req, 0);
	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
	INIT_LIST_HEAD(&sctx->csum_list);

	spin_lock_init(&sctx->list_lock);
	spin_lock_init(&sctx->stat_lock);
	init_waitqueue_head(&sctx->list_wait);

	WARN_ON(sctx->wr_curr_bio != NULL);
	mutex_init(&sctx->wr_lock);
	sctx->wr_curr_bio = NULL;
	if (is_dev_replace) {
		WARN_ON(!fs_info->dev_replace.tgtdev);
		sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
		sctx->flush_all_writes = false;
	}

	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}

static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
				     void *warn_ctx)
{
	u64 isize;
	u32 nlink;
	int ret;
	int i;
	unsigned nofs_flag;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key root_key;
	struct btrfs_key key;

	root_key.objectid = root;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * this makes the path point to (inum INODE_ITEM ioff)
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				    struct btrfs_inode_item);
	isize = btrfs_inode_size(eb, inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	/*
	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
	 */
	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, swarn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
				  swarn->errstr, swarn->logical,
				  rcu_str_deref(swarn->dev->name),
				  swarn->physical,
				  root, inum, offset,
				  min(isize - offset, (u64)PAGE_SIZE), nlink,
				  (char *)(unsigned long)ipath->fspath->val[i]);

	free_ipath(ipath);
	return 0;

err:
	btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
			  swarn->errstr, swarn->logical,
			  rcu_str_deref(swarn->dev->name),
			  swarn->physical,
			  root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}

static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	unsigned long ptr = 0;
	u64 extent_item_pos;
	u64 flags = 0;
	u64 ref_root;
	u32 item_size;
	u8 ref_level = 0;
	int ret;

	WARN_ON(sblock->page_count < 1);
	dev = sblock->pagev[0]->dev;
	fs_info = sblock->sctx->fs_info;

	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.physical = sblock->pagev[0]->physical;
	swarn.logical = sblock->pagev[0]->logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size_nr(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		do {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
					  errstr, swarn.logical,
					  rcu_str_deref(dev->name),
					  swarn.physical,
					  ref_level ? "node" : "leaf",
					  ret < 0 ? -1 : ref_level,
					  ret < 0 ? -1 : ref_root);
		} while (ret != 1);
		btrfs_release_path(path);
	} else {
		btrfs_release_path(path);
		swarn.path = path;
		swarn.dev = dev;
		iterate_extent_inodes(fs_info, found_key.objectid,
				      extent_item_pos, 1,
				      scrub_print_warning_inode, &swarn, false);
	}

out:
	btrfs_free_path(path);
}

static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
{
	struct page *page = NULL;
	unsigned long index;
	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
	int ret;
	int corrected = 0;
	struct btrfs_key key;
	struct inode *inode = NULL;
	struct btrfs_fs_info *fs_info;
	u64 end = offset + PAGE_SIZE - 1;
	struct btrfs_root *local_root;
	int srcu_index;

	key.objectid = root;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	fs_info = fixup->root->fs_info;
	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(local_root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
		return PTR_ERR(local_root);
	}

	key.type = BTRFS_INODE_ITEM_KEY;
	key.objectid = inum;
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	index = offset >> PAGE_SHIFT;

	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	if (PageUptodate(page)) {
		if (PageDirty(page)) {
			/*
			 * we need to write the data to the defect sector. the
			 * data that was in that sector is not in memory,
			 * because the page was modified. we must not write the
			 * modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 *       runner to write out that page (might involve
			 *       COW) and see whether the sector is still
			 *       referenced afterwards.
			 *
			 * For the meantime, we'll treat this error as
			 * uncorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * there's no dirty page in memory then.
			 */
			ret = -EIO;
			goto out;
		}
		ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
					fixup->logical, page,
					offset - page_offset(page),
					fixup->mirror_num);
		unlock_page(page);
		corrected = !ret;
	} else {
		/*
		 * we need to get good data first. the general readpage path
		 * will call repair_io_failure for us, we just have to make
		 * sure we read the bad mirror.
		 */
		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
				      EXTENT_DAMAGED);
		if (ret) {
			/* set_extent_bits should give proper error */
			WARN_ON(ret > 0);
			if (ret > 0)
				ret = -EFAULT;
			goto out;
		}

		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
					    btrfs_get_extent,
					    fixup->mirror_num);
		wait_on_page_locked(page);

		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
					    end, EXTENT_DAMAGED, 0, NULL);
		if (!corrected)
			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
					  EXTENT_DAMAGED);
	}

out:
	if (page)
		put_page(page);

	iput(inode);

	if (ret < 0)
		return ret;

	if (ret == 0 && corrected) {
		/*
		 * we only need to call readpage for one of the inodes belonging
		 * to this extent. so make iterate_extent_inodes stop
		 */
		return 1;
	}

	return -EIO;
}

static void scrub_fixup_nodatasum(struct btrfs_work *work)
{
	struct btrfs_fs_info *fs_info;
	int ret;
	struct scrub_fixup_nodatasum *fixup;
	struct scrub_ctx *sctx;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_path *path;
	int uncorrectable = 0;

	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
	sctx = fixup->sctx;
	fs_info = fixup->root->fs_info;

	path = btrfs_alloc_path();
	if (!path) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.malloc_errors;
		spin_unlock(&sctx->stat_lock);
		uncorrectable = 1;
		goto out;
	}

	trans = btrfs_join_transaction(fixup->root);
	if (IS_ERR(trans)) {
		uncorrectable = 1;
		goto out;
	}

	/*
	 * the idea is to trigger a regular read through the standard path. we
	 * read a page from the (failed) logical address by specifying the
	 * corresponding copynum of the failed sector. thus, that readpage is
	 * expected to fail.
	 * that is the point where on-the-fly error correction will kick in
	 * (once it's finished) and rewrite the failed sector if a good copy
	 * can be found.
	 */
	ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
					  scrub_fixup_readpage, fixup, false);
	if (ret < 0) {
		uncorrectable = 1;
		goto out;
	}
	WARN_ON(ret != 1);

	spin_lock(&sctx->stat_lock);
	++sctx->stat.corrected_errors;
	spin_unlock(&sctx->stat_lock);

out:
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans);
	if (uncorrectable) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.uncorrectable_errors;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_replace_stats_inc(
			&fs_info->dev_replace.num_uncorrectable_read_errors);
		btrfs_err_rl_in_rcu(fs_info,
			"unable to fixup (nodatasum) error at logical %llu on dev %s",
			fixup->logical, rcu_str_deref(fixup->dev->name));
	}

	btrfs_free_path(path);
	kfree(fixup);

	scrub_pending_trans_workers_dec(sctx);
}

static inline void scrub_get_recover(struct scrub_recover *recover)
{
	refcount_inc(&recover->refs);
}

static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
				     struct scrub_recover *recover)
{
	if (refcount_dec_and_test(&recover->refs)) {
		btrfs_bio_counter_dec(fs_info);
		btrfs_put_bbio(recover->bbio);
		kfree(recover);
	}
}
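/*
 * Lifetime note added for clarity (an assumption drawn from the surrounding
 * code, not an original comment): one scrub_recover is shared by every
 * scrub_page of the recheck blocks built from the same mapping. Each page
 * takes a reference via scrub_get_recover(), and the final
 * scrub_put_recover() both drops the bbio and releases the bio counter taken
 * with btrfs_bio_counter_inc_blocked() in scrub_setup_recheck_block().
 */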

/*
 * scrub_handle_errored_block gets called when either verification of the
 * pages failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all pages in the bio, even though only one
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the mirrors.
 */
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
	struct scrub_ctx *sctx = sblock_to_check->sctx;
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	u64 length;
	u64 logical;
	unsigned int failed_mirror_index;
	unsigned int is_metadata;
	unsigned int have_csum;
	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
	struct scrub_block *sblock_bad;
	int ret;
	int mirror_index;
	int page_num;
	int success;
	bool full_stripe_locked;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	BUG_ON(sblock_to_check->page_count < 1);
	fs_info = sctx->fs_info;
	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sctx->stat_lock);
		++sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);
		return 0;
	}
	length = sblock_to_check->page_count * PAGE_SIZE;
	logical = sblock_to_check->pagev[0]->logical;
	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
	is_metadata = !(sblock_to_check->pagev[0]->flags &
			BTRFS_EXTENT_FLAG_DATA);
	have_csum = sblock_to_check->pagev[0]->have_csum;
	dev = sblock_to_check->pagev[0]->dev;

	/*
	 * For RAID5/6, a race can happen with the scrub thread of a different
	 * device. On data corruption, the parity and data threads will both
	 * try to recover the data.
	 * The race can lead to doubly added csum errors, or even an
	 * unrecoverable error.
	 */
	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
	if (ret < 0) {
		spin_lock(&sctx->stat_lock);
		if (ret == -ENOMEM)
			sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		return ret;
	}

	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
		sblocks_for_recheck = NULL;
		goto nodatasum_case;
	}

	/*
	 * read all mirrors one after the other. This includes re-reading
	 * the extent or metadata block that failed (that was
	 * the cause that this fixup code is called) another time,
	 * page by page this time in order to know which pages
	 * caused I/O errors and which ones are good (for all mirrors).
	 * The goal is to handle the situation when more than one
	 * mirror contains I/O errors, but the errors do not
	 * overlap, i.e. the data can be repaired by selecting the
	 * pages from those mirrors without I/O error on the
	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
	 * would be that mirror #1 has an I/O error on the first page,
	 * the second page is good, and mirror #2 has an I/O error on
	 * the second page, but the first page is good.
	 * Then the first page of the first mirror can be repaired by
	 * taking the first page of the second mirror, and the
	 * second page of the second mirror can be repaired by
	 * copying the contents of the 2nd page of the 1st mirror.
	 * One more note: if the pages of one mirror contain I/O
	 * errors, the checksum cannot be verified. In order to get
	 * the best data for repairing, the first attempt is to find
	 * a mirror without I/O errors and with a validated checksum.
	 * Only if this is not possible are the pages picked from
	 * mirrors with I/O errors, without considering the checksum.
	 * If the latter is the case, at the end, the checksum of the
	 * repaired area is verified in order to correctly maintain
	 * the statistics.
	 */

	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
				      sizeof(*sblocks_for_recheck), GFP_NOFS);
	if (!sblocks_for_recheck) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}

	/* setup the context, map the logical blocks and alloc the pages */
	ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
	if (ret) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}
	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
	sblock_bad = sblocks_for_recheck + failed_mirror_index;

	/* build and submit the bios for the failed mirror, check checksums */
	scrub_recheck_block(fs_info, sblock_bad, 1);

	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
	    sblock_bad->no_io_error_seen) {
		/*
		 * the error disappeared after reading page by page, or
		 * the area was part of a huge bio and other parts of the
		 * bio caused I/O errors, or the block layer merged several
		 * read requests into one and the error is caused by a
		 * different bio (usually one of the two latter cases is
		 * the cause)
		 */
		spin_lock(&sctx->stat_lock);
		sctx->stat.unverified_errors++;
		sblock_to_check->data_corrected = 1;
		spin_unlock(&sctx->stat_lock);

		if (sctx->is_dev_replace)
			scrub_write_block_to_dev_replace(sblock_bad);
		goto out;
	}

	if (!sblock_bad->no_io_error_seen) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("i/o error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	} else if (sblock_bad->checksum_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.csum_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev,
					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
	} else if (sblock_bad->header_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.verify_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum/header error",
					    sblock_to_check);
		if (sblock_bad->generation_error)
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_GENERATION_ERRS);
		else
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_CORRUPTION_ERRS);
	}

	if (sctx->readonly) {
		ASSERT(!sctx->is_dev_replace);
		goto out;
	}

	if (!is_metadata && !have_csum) {
		struct scrub_fixup_nodatasum *fixup_nodatasum;

		WARN_ON(sctx->is_dev_replace);

nodatasum_case:

		/*
		 * !is_metadata and !have_csum, this means that the data
		 * might not be COWed, that it might be modified
		 * concurrently. The general strategy to work on the
		 * commit root does not help in the case when COW is not
		 * used.
		 */
		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
		if (!fixup_nodatasum)
			goto did_not_correct_error;
		fixup_nodatasum->sctx = sctx;
		fixup_nodatasum->dev = dev;
		fixup_nodatasum->logical = logical;
		fixup_nodatasum->root = fs_info->extent_root;
		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
		scrub_pending_trans_workers_inc(sctx);
		btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
				scrub_fixup_nodatasum, NULL, NULL);
		btrfs_queue_work(fs_info->scrub_workers,
				 &fixup_nodatasum->work);
		goto out;
	}

	/*
	 * now build and submit the bios for the other mirrors, check
	 * checksums.
	 * First try to pick the mirror which is completely without I/O
	 * errors and also does not have a checksum error.
	 * If one is found, and if a checksum is present, the full block
	 * that is known to contain an error is rewritten. Afterwards
	 * the block is known to be corrected.
	 * If a mirror is found which is completely correct, and no
	 * checksum is present, only those pages are rewritten that had
	 * an I/O error in the block to be repaired, since it cannot be
	 * determined which copy of the other pages is better (and it
	 * could happen otherwise that a correct page would be
	 * overwritten by a bad one).
	 */
	for (mirror_index = 0; ;mirror_index++) {
		struct scrub_block *sblock_other;

		if (mirror_index == failed_mirror_index)
			continue;

		/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
		if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
			if (mirror_index >= BTRFS_MAX_MIRRORS)
				break;
			if (!sblocks_for_recheck[mirror_index].page_count)
				break;

			sblock_other = sblocks_for_recheck + mirror_index;
		} else {
			struct scrub_recover *r = sblock_bad->pagev[0]->recover;
			int max_allowed = r->bbio->num_stripes -
						r->bbio->num_tgtdevs;

			if (mirror_index >= max_allowed)
				break;
			if (!sblocks_for_recheck[1].page_count)
				break;

			ASSERT(failed_mirror_index == 0);
			sblock_other = sblocks_for_recheck + 1;
			sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
		}

		/* build and submit the bios, check checksums */
		scrub_recheck_block(fs_info, sblock_other, 0);

		if (!sblock_other->header_error &&
		    !sblock_other->checksum_error &&
		    sblock_other->no_io_error_seen) {
			if (sctx->is_dev_replace) {
				scrub_write_block_to_dev_replace(sblock_other);
				goto corrected_error;
			} else {
				ret = scrub_repair_block_from_good_copy(
						sblock_bad, sblock_other);
				if (!ret)
					goto corrected_error;
			}
		}
	}

	if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
		goto did_not_correct_error;

	/*
	 * In case of I/O errors in the area that is supposed to be
	 * repaired, continue by picking good copies of those pages.
	 * Select the good pages from mirrors to rewrite bad pages from
	 * the area to fix. Afterwards verify the checksum of the block
	 * that is supposed to be repaired. This verification step is
	 * only done for the purpose of statistic counting and for the
	 * final scrub report, whether errors remain.
	 * A perfect algorithm could make use of the checksum and try
	 * all possible combinations of pages from the different mirrors
	 * until the checksum verification succeeds. For example, when
	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
	 * of mirror #2 is readable but the final checksum test fails,
	 * then the 2nd page of mirror #3 could be tried, to see whether
	 * the final checksum then succeeds. But this would be a rare
	 * exception and is therefore not implemented. At least it is
	 * avoided that the good copy is overwritten.
	 * A more useful improvement would be to pick the sectors
	 * without I/O error based on sector sizes (512 bytes on legacy
	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
	 * mirror could be repaired by taking 512 bytes of a different
	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
	 * area are unreadable.
	 */
	success = 1;
	for (page_num = 0; page_num < sblock_bad->page_count;
	     page_num++) {
		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
		struct scrub_block *sblock_other = NULL;

		/* skip no-io-error page in scrub */
		if (!page_bad->io_error && !sctx->is_dev_replace)
			continue;

		/* try to find no-io-error page in mirrors */
		if (page_bad->io_error) {
			for (mirror_index = 0;
			     mirror_index < BTRFS_MAX_MIRRORS &&
			     sblocks_for_recheck[mirror_index].page_count > 0;
			     mirror_index++) {
				if (!sblocks_for_recheck[mirror_index].
				    pagev[page_num]->io_error) {
					sblock_other = sblocks_for_recheck +
						       mirror_index;
					break;
				}
			}
			if (!sblock_other)
				success = 0;
		}

		if (sctx->is_dev_replace) {
			/*
			 * did not find a mirror to fetch the page
			 * from. scrub_write_page_to_dev_replace()
			 * handles this case (page->io_error), by
			 * filling the block with zeros before
			 * submitting the write request
			 */
			if (!sblock_other)
				sblock_other = sblock_bad;

			if (scrub_write_page_to_dev_replace(sblock_other,
							    page_num) != 0) {
				btrfs_dev_replace_stats_inc(
					&fs_info->dev_replace.num_write_errors);
				success = 0;
			}
		} else if (sblock_other) {
			ret = scrub_repair_page_from_good_copy(sblock_bad,
							       sblock_other,
							       page_num, 0);
			if (0 == ret)
				page_bad->io_error = 0;
			else
				success = 0;
		}
	}

	if (success && !sctx->is_dev_replace) {
		if (is_metadata || have_csum) {
			/*
			 * need to verify the checksum now that all
			 * sectors on disk are repaired (the write
			 * request for data to be repaired is on its way).
			 * Just be lazy and use scrub_recheck_block()
			 * which re-reads the data before the checksum
			 * is verified, but most likely the data comes out
			 * of the page cache.
			 */
			scrub_recheck_block(fs_info, sblock_bad, 1);
			if (!sblock_bad->header_error &&
			    !sblock_bad->checksum_error &&
			    sblock_bad->no_io_error_seen)
				goto corrected_error;
			else
				goto did_not_correct_error;
		} else {
corrected_error:
			spin_lock(&sctx->stat_lock);
			sctx->stat.corrected_errors++;
			sblock_to_check->data_corrected = 1;
			spin_unlock(&sctx->stat_lock);
			btrfs_err_rl_in_rcu(fs_info,
				"fixed up error at logical %llu on dev %s",
				logical, rcu_str_deref(dev->name));
		}
	} else {
did_not_correct_error:
		spin_lock(&sctx->stat_lock);
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_err_rl_in_rcu(fs_info,
			"unable to fixup (regular) error at logical %llu on dev %s",
			logical, rcu_str_deref(dev->name));
	}

out:
	if (sblocks_for_recheck) {
		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
		     mirror_index++) {
			struct scrub_block *sblock = sblocks_for_recheck +
						     mirror_index;
			struct scrub_recover *recover;
			int page_index;

			for (page_index = 0; page_index < sblock->page_count;
			     page_index++) {
				sblock->pagev[page_index]->sblock = NULL;
				recover = sblock->pagev[page_index]->recover;
				if (recover) {
					scrub_put_recover(fs_info, recover);
					sblock->pagev[page_index]->recover =
									NULL;
				}
				scrub_page_put(sblock->pagev[page_index]);
			}
		}
		kfree(sblocks_for_recheck);
	}

	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
	if (ret < 0)
		return ret;
	return 0;
}

static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
{
	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
		return 2;
	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
		return 3;
	else
		return (int)bbio->num_stripes;
}

static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
						 u64 *raid_map,
						 u64 mapped_length,
						 int nstripes, int mirror,
						 int *stripe_index,
						 u64 *stripe_offset)
{
	int i;

	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* RAID5/6 */
		for (i = 0; i < nstripes; i++) {
			if (raid_map[i] == RAID6_Q_STRIPE ||
			    raid_map[i] == RAID5_P_STRIPE)
				continue;

			if (logical >= raid_map[i] &&
			    logical < raid_map[i] + mapped_length)
				break;
		}

		*stripe_index = i;
		*stripe_offset = logical - raid_map[i];
	} else {
		/* The other RAID type */
		*stripe_index = mirror;
		*stripe_offset = 0;
	}
}

static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				     struct scrub_block *sblocks_for_recheck)
{
	struct scrub_ctx *sctx = original_sblock->sctx;
	struct btrfs_fs_info *fs_info = sctx->fs_info;
	u64 length = original_sblock->page_count * PAGE_SIZE;
	u64 logical = original_sblock->pagev[0]->logical;
	u64 generation = original_sblock->pagev[0]->generation;
	u64 flags = original_sblock->pagev[0]->flags;
	u64 have_csum = original_sblock->pagev[0]->have_csum;
	struct scrub_recover *recover;
	struct btrfs_bio *bbio;
	u64 sublen;
	u64 mapped_length;
	u64 stripe_offset;
	int stripe_index;
	int page_index = 0;
	int mirror_index;
	int nmirrors;
	int ret;

	/*
	 * note: the two members refs and outstanding_pages
	 * are not used (and not set) in the blocks that are used for
	 * the recheck procedure
	 */

	while (length > 0) {
		sublen = min_t(u64, length, PAGE_SIZE);
		mapped_length = sublen;
		bbio = NULL;

		/*
		 * with a length of PAGE_SIZE, each returned stripe
		 * represents one mirror
		 */
		btrfs_bio_counter_inc_blocked(fs_info);
		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				       logical, &mapped_length, &bbio);
		if (ret || !bbio || mapped_length < sublen) {
			btrfs_put_bbio(bbio);
			btrfs_bio_counter_dec(fs_info);
			return -EIO;
		}

		recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
		if (!recover) {
			btrfs_put_bbio(bbio);
			btrfs_bio_counter_dec(fs_info);
			return -ENOMEM;
		}

		refcount_set(&recover->refs, 1);
		recover->bbio = bbio;
		recover->map_length = mapped_length;

		BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);

		nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);

		for (mirror_index = 0; mirror_index < nmirrors;
		     mirror_index++) {
			struct scrub_block *sblock;
			struct scrub_page *page;

			sblock = sblocks_for_recheck + mirror_index;
			sblock->sctx = sctx;

			page = kzalloc(sizeof(*page), GFP_NOFS);
			if (!page) {
leave_nomem:
				spin_lock(&sctx->stat_lock);
				sctx->stat.malloc_errors++;
				spin_unlock(&sctx->stat_lock);
				scrub_put_recover(fs_info, recover);
				return -ENOMEM;
			}
			scrub_page_get(page);
			sblock->pagev[page_index] = page;
			page->sblock = sblock;
			page->flags = flags;
			page->generation = generation;
			page->logical = logical;
			page->have_csum = have_csum;
			if (have_csum)
				memcpy(page->csum,
				       original_sblock->pagev[0]->csum,
				       sctx->csum_size);

			scrub_stripe_index_and_offset(logical,
						      bbio->map_type,
						      bbio->raid_map,
						      mapped_length,
						      bbio->num_stripes -
						      bbio->num_tgtdevs,
						      mirror_index,
						      &stripe_index,
						      &stripe_offset);
			page->physical = bbio->stripes[stripe_index].physical +
					 stripe_offset;
			page->dev = bbio->stripes[stripe_index].dev;

			BUG_ON(page_index >= original_sblock->page_count);
			page->physical_for_dev_replace =
				original_sblock->pagev[page_index]->
				physical_for_dev_replace;
			/* for missing devices, dev->bdev is NULL */
			page->mirror_num = mirror_index + 1;
			sblock->page_count++;
			page->page = alloc_page(GFP_NOFS);
			if (!page->page)
				goto leave_nomem;

			scrub_get_recover(recover);
			page->recover = recover;
		}
		scrub_put_recover(fs_info, recover);
		length -= sublen;
		logical += sublen;
		page_index++;
	}

	return 0;
}

static void scrub_bio_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}

static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
					struct bio *bio,
					struct scrub_page *page)
{
	DECLARE_COMPLETION_ONSTACK(done);
	int ret;
	int mirror_num;

	bio->bi_iter.bi_sector = page->logical >> 9;
	bio->bi_private = &done;
	bio->bi_end_io = scrub_bio_wait_endio;

	mirror_num = page->sblock->pagev[0]->mirror_num;
	ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
				    page->recover->map_length,
				    mirror_num, 0);
	if (ret)
		return ret;

	wait_for_completion_io(&done);
	return blk_status_to_errno(bio->bi_status);
}

/*
 * this function will check the on disk data for checksum errors, header
 * errors and read I/O errors. If any I/O errors happen, the exact pages
 * which are errored are marked as being bad. The goal is to enable scrub
 * to take those pages that are not errored from all the mirrors so that
 * the pages that are errored in the just handled mirror can be repaired.
 */
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock,
				int retry_failed_mirror)
{
	int page_num;

	sblock->no_io_error_seen = 1;

	for (page_num = 0; page_num < sblock->page_count; page_num++) {
		struct bio *bio;
		struct scrub_page *page = sblock->pagev[page_num];

		if (page->dev->bdev == NULL) {
			page->io_error = 1;
			sblock->no_io_error_seen = 0;
			continue;
		}

		WARN_ON(!page->page);
		bio = btrfs_io_bio_alloc(1);
		bio_set_dev(bio, page->dev->bdev);

		bio_add_page(bio, page->page, PAGE_SIZE, 0);
		if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
			if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
				page->io_error = 1;
				sblock->no_io_error_seen = 0;
			}
		} else {
			bio->bi_iter.bi_sector = page->physical >> 9;
			bio_set_op_attrs(bio, REQ_OP_READ, 0);

			if (btrfsic_submit_bio_wait(bio)) {
				page->io_error = 1;
				sblock->no_io_error_seen = 0;
			}
		}

		bio_put(bio);
	}

	if (sblock->no_io_error_seen)
		scrub_recheck_block_checksum(sblock);
}

static inline int scrub_check_fsid(u8 fsid[],
				   struct scrub_page *spage)
{
	struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
	int ret;

	ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	return !ret;
}

static void scrub_recheck_block_checksum(struct scrub_block *sblock)
{
	sblock->header_error = 0;
	sblock->checksum_error = 0;
	sblock->generation_error = 0;

	if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
		scrub_checksum_data(sblock);
	else
		scrub_checksum_tree_block(sblock);
}

static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good)
{
	int page_num;
	int ret = 0;

	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
		int ret_sub;

		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
							   sblock_good,
							   page_num, 1);
		if (ret_sub)
			ret = ret_sub;
	}

	return ret;
}

static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write)
{
	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
	struct scrub_page *page_good = sblock_good->pagev[page_num];
	struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;

	BUG_ON(page_bad->page == NULL);
	BUG_ON(page_good->page == NULL);
	if (force_write || sblock_bad->header_error ||
	    sblock_bad->checksum_error || page_bad->io_error) {
		struct bio *bio;
		int ret;

		if (!page_bad->dev->bdev) {
			btrfs_warn_rl(fs_info,
				"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
			return -EIO;
		}

		bio = btrfs_io_bio_alloc(1);
		bio_set_dev(bio, page_bad->dev->bdev);
		bio->bi_iter.bi_sector = page_bad->physical >> 9;
		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
PAGE_SIZE, 0); 1841 if (PAGE_SIZE != ret) { 1842 bio_put(bio); 1843 return -EIO; 1844 } 1845 1846 if (btrfsic_submit_bio_wait(bio)) { 1847 btrfs_dev_stat_inc_and_print(page_bad->dev, 1848 BTRFS_DEV_STAT_WRITE_ERRS); 1849 btrfs_dev_replace_stats_inc( 1850 &fs_info->dev_replace.num_write_errors); 1851 bio_put(bio); 1852 return -EIO; 1853 } 1854 bio_put(bio); 1855 } 1856 1857 return 0; 1858 } 1859 1860 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) 1861 { 1862 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; 1863 int page_num; 1864 1865 /* 1866 * This block is used for the check of the parity on the source device, 1867 * so the data needn't be written into the destination device. 1868 */ 1869 if (sblock->sparity) 1870 return; 1871 1872 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1873 int ret; 1874 1875 ret = scrub_write_page_to_dev_replace(sblock, page_num); 1876 if (ret) 1877 btrfs_dev_replace_stats_inc( 1878 &fs_info->dev_replace.num_write_errors); 1879 } 1880 } 1881 1882 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, 1883 int page_num) 1884 { 1885 struct scrub_page *spage = sblock->pagev[page_num]; 1886 1887 BUG_ON(spage->page == NULL); 1888 if (spage->io_error) { 1889 void *mapped_buffer = kmap_atomic(spage->page); 1890 1891 clear_page(mapped_buffer); 1892 flush_dcache_page(spage->page); 1893 kunmap_atomic(mapped_buffer); 1894 } 1895 return scrub_add_page_to_wr_bio(sblock->sctx, spage); 1896 } 1897 1898 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, 1899 struct scrub_page *spage) 1900 { 1901 struct scrub_bio *sbio; 1902 int ret; 1903 1904 mutex_lock(&sctx->wr_lock); 1905 again: 1906 if (!sctx->wr_curr_bio) { 1907 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio), 1908 GFP_KERNEL); 1909 if (!sctx->wr_curr_bio) { 1910 mutex_unlock(&sctx->wr_lock); 1911 return -ENOMEM; 1912 } 1913 sctx->wr_curr_bio->sctx = sctx; 1914 sctx->wr_curr_bio->page_count = 0; 1915 } 1916 sbio = sctx->wr_curr_bio; 1917 if (sbio->page_count == 0) { 1918 struct bio *bio; 1919 1920 sbio->physical = spage->physical_for_dev_replace; 1921 sbio->logical = spage->logical; 1922 sbio->dev = sctx->wr_tgtdev; 1923 bio = sbio->bio; 1924 if (!bio) { 1925 bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio); 1926 sbio->bio = bio; 1927 } 1928 1929 bio->bi_private = sbio; 1930 bio->bi_end_io = scrub_wr_bio_end_io; 1931 bio_set_dev(bio, sbio->dev->bdev); 1932 bio->bi_iter.bi_sector = sbio->physical >> 9; 1933 bio_set_op_attrs(bio, REQ_OP_WRITE, 0); 1934 sbio->status = 0; 1935 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1936 spage->physical_for_dev_replace || 1937 sbio->logical + sbio->page_count * PAGE_SIZE != 1938 spage->logical) { 1939 scrub_wr_submit(sctx); 1940 goto again; 1941 } 1942 1943 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); 1944 if (ret != PAGE_SIZE) { 1945 if (sbio->page_count < 1) { 1946 bio_put(sbio->bio); 1947 sbio->bio = NULL; 1948 mutex_unlock(&sctx->wr_lock); 1949 return -EIO; 1950 } 1951 scrub_wr_submit(sctx); 1952 goto again; 1953 } 1954 1955 sbio->pagev[sbio->page_count] = spage; 1956 scrub_page_get(spage); 1957 sbio->page_count++; 1958 if (sbio->page_count == sctx->pages_per_wr_bio) 1959 scrub_wr_submit(sctx); 1960 mutex_unlock(&sctx->wr_lock); 1961 1962 return 0; 1963 } 1964 1965 static void scrub_wr_submit(struct scrub_ctx *sctx) 1966 { 1967 struct scrub_bio *sbio; 1968 1969 if (!sctx->wr_curr_bio) 1970 return; 1971 1972 sbio = sctx->wr_curr_bio; 1973 sctx->wr_curr_bio = NULL; 1974 
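	/*
	 * Note: wr_curr_bio has been detached above, so from here on the
	 * sbio is owned by the bio completion path; scrub_wr_bio_end_io_worker()
	 * drops the page references, frees the sbio and decrements the
	 * pending counter, and nothing here may touch it again.
	 */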
WARN_ON(!sbio->bio->bi_disk); 1975 scrub_pending_bio_inc(sctx); 1976 /* process all writes in a single worker thread. Then the block layer 1977 * orders the requests before sending them to the driver which 1978 * doubled the write performance on spinning disks when measured 1979 * with Linux 3.5 */ 1980 btrfsic_submit_bio(sbio->bio); 1981 } 1982 1983 static void scrub_wr_bio_end_io(struct bio *bio) 1984 { 1985 struct scrub_bio *sbio = bio->bi_private; 1986 struct btrfs_fs_info *fs_info = sbio->dev->fs_info; 1987 1988 sbio->status = bio->bi_status; 1989 sbio->bio = bio; 1990 1991 btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, 1992 scrub_wr_bio_end_io_worker, NULL, NULL); 1993 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); 1994 } 1995 1996 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) 1997 { 1998 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 1999 struct scrub_ctx *sctx = sbio->sctx; 2000 int i; 2001 2002 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); 2003 if (sbio->status) { 2004 struct btrfs_dev_replace *dev_replace = 2005 &sbio->sctx->fs_info->dev_replace; 2006 2007 for (i = 0; i < sbio->page_count; i++) { 2008 struct scrub_page *spage = sbio->pagev[i]; 2009 2010 spage->io_error = 1; 2011 btrfs_dev_replace_stats_inc(&dev_replace-> 2012 num_write_errors); 2013 } 2014 } 2015 2016 for (i = 0; i < sbio->page_count; i++) 2017 scrub_page_put(sbio->pagev[i]); 2018 2019 bio_put(sbio->bio); 2020 kfree(sbio); 2021 scrub_pending_bio_dec(sctx); 2022 } 2023 2024 static int scrub_checksum(struct scrub_block *sblock) 2025 { 2026 u64 flags; 2027 int ret; 2028 2029 /* 2030 * No need to initialize these stats currently, 2031 * because this function only use return value 2032 * instead of these stats value. 
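 *
 * Roughly, the dispatch below is: DATA extents are verified by
 * scrub_checksum_data(), tree blocks by scrub_checksum_tree_block() and
 * super blocks by scrub_checksum_super().  The super block return value
 * is intentionally dropped, since super errors are only counted and never
 * handed to scrub_handle_errored_block().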
2033 * 2034 * Todo: 2035 * always use stats 2036 */ 2037 sblock->header_error = 0; 2038 sblock->generation_error = 0; 2039 sblock->checksum_error = 0; 2040 2041 WARN_ON(sblock->page_count < 1); 2042 flags = sblock->pagev[0]->flags; 2043 ret = 0; 2044 if (flags & BTRFS_EXTENT_FLAG_DATA) 2045 ret = scrub_checksum_data(sblock); 2046 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 2047 ret = scrub_checksum_tree_block(sblock); 2048 else if (flags & BTRFS_EXTENT_FLAG_SUPER) 2049 (void)scrub_checksum_super(sblock); 2050 else 2051 WARN_ON(1); 2052 if (ret) 2053 scrub_handle_errored_block(sblock); 2054 2055 return ret; 2056 } 2057 2058 static int scrub_checksum_data(struct scrub_block *sblock) 2059 { 2060 struct scrub_ctx *sctx = sblock->sctx; 2061 u8 csum[BTRFS_CSUM_SIZE]; 2062 u8 *on_disk_csum; 2063 struct page *page; 2064 void *buffer; 2065 u32 crc = ~(u32)0; 2066 u64 len; 2067 int index; 2068 2069 BUG_ON(sblock->page_count < 1); 2070 if (!sblock->pagev[0]->have_csum) 2071 return 0; 2072 2073 on_disk_csum = sblock->pagev[0]->csum; 2074 page = sblock->pagev[0]->page; 2075 buffer = kmap_atomic(page); 2076 2077 len = sctx->fs_info->sectorsize; 2078 index = 0; 2079 for (;;) { 2080 u64 l = min_t(u64, len, PAGE_SIZE); 2081 2082 crc = btrfs_csum_data(buffer, crc, l); 2083 kunmap_atomic(buffer); 2084 len -= l; 2085 if (len == 0) 2086 break; 2087 index++; 2088 BUG_ON(index >= sblock->page_count); 2089 BUG_ON(!sblock->pagev[index]->page); 2090 page = sblock->pagev[index]->page; 2091 buffer = kmap_atomic(page); 2092 } 2093 2094 btrfs_csum_final(crc, csum); 2095 if (memcmp(csum, on_disk_csum, sctx->csum_size)) 2096 sblock->checksum_error = 1; 2097 2098 return sblock->checksum_error; 2099 } 2100 2101 static int scrub_checksum_tree_block(struct scrub_block *sblock) 2102 { 2103 struct scrub_ctx *sctx = sblock->sctx; 2104 struct btrfs_header *h; 2105 struct btrfs_fs_info *fs_info = sctx->fs_info; 2106 u8 calculated_csum[BTRFS_CSUM_SIZE]; 2107 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 2108 struct page *page; 2109 void *mapped_buffer; 2110 u64 mapped_size; 2111 void *p; 2112 u32 crc = ~(u32)0; 2113 u64 len; 2114 int index; 2115 2116 BUG_ON(sblock->page_count < 1); 2117 page = sblock->pagev[0]->page; 2118 mapped_buffer = kmap_atomic(page); 2119 h = (struct btrfs_header *)mapped_buffer; 2120 memcpy(on_disk_csum, h->csum, sctx->csum_size); 2121 2122 /* 2123 * we don't use the getter functions here, as we 2124 * a) don't have an extent buffer and 2125 * b) the page is already kmapped 2126 */ 2127 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h)) 2128 sblock->header_error = 1; 2129 2130 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) { 2131 sblock->header_error = 1; 2132 sblock->generation_error = 1; 2133 } 2134 2135 if (!scrub_check_fsid(h->fsid, sblock->pagev[0])) 2136 sblock->header_error = 1; 2137 2138 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 2139 BTRFS_UUID_SIZE)) 2140 sblock->header_error = 1; 2141 2142 len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE; 2143 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 2144 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 2145 index = 0; 2146 for (;;) { 2147 u64 l = min_t(u64, len, mapped_size); 2148 2149 crc = btrfs_csum_data(p, crc, l); 2150 kunmap_atomic(mapped_buffer); 2151 len -= l; 2152 if (len == 0) 2153 break; 2154 index++; 2155 BUG_ON(index >= sblock->page_count); 2156 BUG_ON(!sblock->pagev[index]->page); 2157 page = sblock->pagev[index]->page; 2158 mapped_buffer = kmap_atomic(page); 2159 mapped_size = PAGE_SIZE; 2160 p = mapped_buffer; 
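		/*
		 * A tree block larger than PAGE_SIZE spans several
		 * scrub_pages; keep feeding the following pages into the
		 * same running crc until len reaches zero.
		 */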
2161 } 2162 2163 btrfs_csum_final(crc, calculated_csum); 2164 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 2165 sblock->checksum_error = 1; 2166 2167 return sblock->header_error || sblock->checksum_error; 2168 } 2169 2170 static int scrub_checksum_super(struct scrub_block *sblock) 2171 { 2172 struct btrfs_super_block *s; 2173 struct scrub_ctx *sctx = sblock->sctx; 2174 u8 calculated_csum[BTRFS_CSUM_SIZE]; 2175 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 2176 struct page *page; 2177 void *mapped_buffer; 2178 u64 mapped_size; 2179 void *p; 2180 u32 crc = ~(u32)0; 2181 int fail_gen = 0; 2182 int fail_cor = 0; 2183 u64 len; 2184 int index; 2185 2186 BUG_ON(sblock->page_count < 1); 2187 page = sblock->pagev[0]->page; 2188 mapped_buffer = kmap_atomic(page); 2189 s = (struct btrfs_super_block *)mapped_buffer; 2190 memcpy(on_disk_csum, s->csum, sctx->csum_size); 2191 2192 if (sblock->pagev[0]->logical != btrfs_super_bytenr(s)) 2193 ++fail_cor; 2194 2195 if (sblock->pagev[0]->generation != btrfs_super_generation(s)) 2196 ++fail_gen; 2197 2198 if (!scrub_check_fsid(s->fsid, sblock->pagev[0])) 2199 ++fail_cor; 2200 2201 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; 2202 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 2203 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 2204 index = 0; 2205 for (;;) { 2206 u64 l = min_t(u64, len, mapped_size); 2207 2208 crc = btrfs_csum_data(p, crc, l); 2209 kunmap_atomic(mapped_buffer); 2210 len -= l; 2211 if (len == 0) 2212 break; 2213 index++; 2214 BUG_ON(index >= sblock->page_count); 2215 BUG_ON(!sblock->pagev[index]->page); 2216 page = sblock->pagev[index]->page; 2217 mapped_buffer = kmap_atomic(page); 2218 mapped_size = PAGE_SIZE; 2219 p = mapped_buffer; 2220 } 2221 2222 btrfs_csum_final(crc, calculated_csum); 2223 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 2224 ++fail_cor; 2225 2226 if (fail_cor + fail_gen) { 2227 /* 2228 * if we find an error in a super block, we just report it. 
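 * (fail_cor counts bytenr/fsid/checksum mismatches, fail_gen a stale
 * generation; both only bump the statistics and per-device counters
 * below.)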
2229 * They will get written with the next transaction commit 2230 * anyway 2231 */ 2232 spin_lock(&sctx->stat_lock); 2233 ++sctx->stat.super_errors; 2234 spin_unlock(&sctx->stat_lock); 2235 if (fail_cor) 2236 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, 2237 BTRFS_DEV_STAT_CORRUPTION_ERRS); 2238 else 2239 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, 2240 BTRFS_DEV_STAT_GENERATION_ERRS); 2241 } 2242 2243 return fail_cor + fail_gen; 2244 } 2245 2246 static void scrub_block_get(struct scrub_block *sblock) 2247 { 2248 refcount_inc(&sblock->refs); 2249 } 2250 2251 static void scrub_block_put(struct scrub_block *sblock) 2252 { 2253 if (refcount_dec_and_test(&sblock->refs)) { 2254 int i; 2255 2256 if (sblock->sparity) 2257 scrub_parity_put(sblock->sparity); 2258 2259 for (i = 0; i < sblock->page_count; i++) 2260 scrub_page_put(sblock->pagev[i]); 2261 kfree(sblock); 2262 } 2263 } 2264 2265 static void scrub_page_get(struct scrub_page *spage) 2266 { 2267 atomic_inc(&spage->refs); 2268 } 2269 2270 static void scrub_page_put(struct scrub_page *spage) 2271 { 2272 if (atomic_dec_and_test(&spage->refs)) { 2273 if (spage->page) 2274 __free_page(spage->page); 2275 kfree(spage); 2276 } 2277 } 2278 2279 static void scrub_submit(struct scrub_ctx *sctx) 2280 { 2281 struct scrub_bio *sbio; 2282 2283 if (sctx->curr == -1) 2284 return; 2285 2286 sbio = sctx->bios[sctx->curr]; 2287 sctx->curr = -1; 2288 scrub_pending_bio_inc(sctx); 2289 btrfsic_submit_bio(sbio->bio); 2290 } 2291 2292 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, 2293 struct scrub_page *spage) 2294 { 2295 struct scrub_block *sblock = spage->sblock; 2296 struct scrub_bio *sbio; 2297 int ret; 2298 2299 again: 2300 /* 2301 * grab a fresh bio or wait for one to become available 2302 */ 2303 while (sctx->curr == -1) { 2304 spin_lock(&sctx->list_lock); 2305 sctx->curr = sctx->first_free; 2306 if (sctx->curr != -1) { 2307 sctx->first_free = sctx->bios[sctx->curr]->next_free; 2308 sctx->bios[sctx->curr]->next_free = -1; 2309 sctx->bios[sctx->curr]->page_count = 0; 2310 spin_unlock(&sctx->list_lock); 2311 } else { 2312 spin_unlock(&sctx->list_lock); 2313 wait_event(sctx->list_wait, sctx->first_free != -1); 2314 } 2315 } 2316 sbio = sctx->bios[sctx->curr]; 2317 if (sbio->page_count == 0) { 2318 struct bio *bio; 2319 2320 sbio->physical = spage->physical; 2321 sbio->logical = spage->logical; 2322 sbio->dev = spage->dev; 2323 bio = sbio->bio; 2324 if (!bio) { 2325 bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio); 2326 sbio->bio = bio; 2327 } 2328 2329 bio->bi_private = sbio; 2330 bio->bi_end_io = scrub_bio_end_io; 2331 bio_set_dev(bio, sbio->dev->bdev); 2332 bio->bi_iter.bi_sector = sbio->physical >> 9; 2333 bio_set_op_attrs(bio, REQ_OP_READ, 0); 2334 sbio->status = 0; 2335 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 2336 spage->physical || 2337 sbio->logical + sbio->page_count * PAGE_SIZE != 2338 spage->logical || 2339 sbio->dev != spage->dev) { 2340 scrub_submit(sctx); 2341 goto again; 2342 } 2343 2344 sbio->pagev[sbio->page_count] = spage; 2345 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); 2346 if (ret != PAGE_SIZE) { 2347 if (sbio->page_count < 1) { 2348 bio_put(sbio->bio); 2349 sbio->bio = NULL; 2350 return -EIO; 2351 } 2352 scrub_submit(sctx); 2353 goto again; 2354 } 2355 2356 scrub_block_get(sblock); /* one for the page added to the bio */ 2357 atomic_inc(&sblock->outstanding_pages); 2358 sbio->page_count++; 2359 if (sbio->page_count == sctx->pages_per_rd_bio) 2360 scrub_submit(sctx); 2361 2362 
return 0; 2363 } 2364 2365 static void scrub_missing_raid56_end_io(struct bio *bio) 2366 { 2367 struct scrub_block *sblock = bio->bi_private; 2368 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; 2369 2370 if (bio->bi_status) 2371 sblock->no_io_error_seen = 0; 2372 2373 bio_put(bio); 2374 2375 btrfs_queue_work(fs_info->scrub_workers, &sblock->work); 2376 } 2377 2378 static void scrub_missing_raid56_worker(struct btrfs_work *work) 2379 { 2380 struct scrub_block *sblock = container_of(work, struct scrub_block, work); 2381 struct scrub_ctx *sctx = sblock->sctx; 2382 struct btrfs_fs_info *fs_info = sctx->fs_info; 2383 u64 logical; 2384 struct btrfs_device *dev; 2385 2386 logical = sblock->pagev[0]->logical; 2387 dev = sblock->pagev[0]->dev; 2388 2389 if (sblock->no_io_error_seen) 2390 scrub_recheck_block_checksum(sblock); 2391 2392 if (!sblock->no_io_error_seen) { 2393 spin_lock(&sctx->stat_lock); 2394 sctx->stat.read_errors++; 2395 spin_unlock(&sctx->stat_lock); 2396 btrfs_err_rl_in_rcu(fs_info, 2397 "IO error rebuilding logical %llu for dev %s", 2398 logical, rcu_str_deref(dev->name)); 2399 } else if (sblock->header_error || sblock->checksum_error) { 2400 spin_lock(&sctx->stat_lock); 2401 sctx->stat.uncorrectable_errors++; 2402 spin_unlock(&sctx->stat_lock); 2403 btrfs_err_rl_in_rcu(fs_info, 2404 "failed to rebuild valid logical %llu for dev %s", 2405 logical, rcu_str_deref(dev->name)); 2406 } else { 2407 scrub_write_block_to_dev_replace(sblock); 2408 } 2409 2410 scrub_block_put(sblock); 2411 2412 if (sctx->is_dev_replace && sctx->flush_all_writes) { 2413 mutex_lock(&sctx->wr_lock); 2414 scrub_wr_submit(sctx); 2415 mutex_unlock(&sctx->wr_lock); 2416 } 2417 2418 scrub_pending_bio_dec(sctx); 2419 } 2420 2421 static void scrub_missing_raid56_pages(struct scrub_block *sblock) 2422 { 2423 struct scrub_ctx *sctx = sblock->sctx; 2424 struct btrfs_fs_info *fs_info = sctx->fs_info; 2425 u64 length = sblock->page_count * PAGE_SIZE; 2426 u64 logical = sblock->pagev[0]->logical; 2427 struct btrfs_bio *bbio = NULL; 2428 struct bio *bio; 2429 struct btrfs_raid_bio *rbio; 2430 int ret; 2431 int i; 2432 2433 btrfs_bio_counter_inc_blocked(fs_info); 2434 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, 2435 &length, &bbio); 2436 if (ret || !bbio || !bbio->raid_map) 2437 goto bbio_out; 2438 2439 if (WARN_ON(!sctx->is_dev_replace || 2440 !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { 2441 /* 2442 * We shouldn't be scrubbing a missing device. Even for dev 2443 * replace, we should only get here for RAID 5/6. We either 2444 * managed to mount something with no mirrors remaining or 2445 * there's a bug in scrub_remap_extent()/btrfs_map_block(). 
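 *
 * In the expected path the rbio built below reads the surviving stripes
 * and rebuilds this block's pages from parity; completion is handled by
 * scrub_missing_raid56_worker(), which re-checksums the rebuilt data
 * before it is written to the replace target.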
2446 */ 2447 goto bbio_out; 2448 } 2449 2450 bio = btrfs_io_bio_alloc(0); 2451 bio->bi_iter.bi_sector = logical >> 9; 2452 bio->bi_private = sblock; 2453 bio->bi_end_io = scrub_missing_raid56_end_io; 2454 2455 rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length); 2456 if (!rbio) 2457 goto rbio_out; 2458 2459 for (i = 0; i < sblock->page_count; i++) { 2460 struct scrub_page *spage = sblock->pagev[i]; 2461 2462 raid56_add_scrub_pages(rbio, spage->page, spage->logical); 2463 } 2464 2465 btrfs_init_work(&sblock->work, btrfs_scrub_helper, 2466 scrub_missing_raid56_worker, NULL, NULL); 2467 scrub_block_get(sblock); 2468 scrub_pending_bio_inc(sctx); 2469 raid56_submit_missing_rbio(rbio); 2470 return; 2471 2472 rbio_out: 2473 bio_put(bio); 2474 bbio_out: 2475 btrfs_bio_counter_dec(fs_info); 2476 btrfs_put_bbio(bbio); 2477 spin_lock(&sctx->stat_lock); 2478 sctx->stat.malloc_errors++; 2479 spin_unlock(&sctx->stat_lock); 2480 } 2481 2482 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 2483 u64 physical, struct btrfs_device *dev, u64 flags, 2484 u64 gen, int mirror_num, u8 *csum, int force, 2485 u64 physical_for_dev_replace) 2486 { 2487 struct scrub_block *sblock; 2488 int index; 2489 2490 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); 2491 if (!sblock) { 2492 spin_lock(&sctx->stat_lock); 2493 sctx->stat.malloc_errors++; 2494 spin_unlock(&sctx->stat_lock); 2495 return -ENOMEM; 2496 } 2497 2498 /* one ref inside this function, plus one for each page added to 2499 * a bio later on */ 2500 refcount_set(&sblock->refs, 1); 2501 sblock->sctx = sctx; 2502 sblock->no_io_error_seen = 1; 2503 2504 for (index = 0; len > 0; index++) { 2505 struct scrub_page *spage; 2506 u64 l = min_t(u64, len, PAGE_SIZE); 2507 2508 spage = kzalloc(sizeof(*spage), GFP_KERNEL); 2509 if (!spage) { 2510 leave_nomem: 2511 spin_lock(&sctx->stat_lock); 2512 sctx->stat.malloc_errors++; 2513 spin_unlock(&sctx->stat_lock); 2514 scrub_block_put(sblock); 2515 return -ENOMEM; 2516 } 2517 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 2518 scrub_page_get(spage); 2519 sblock->pagev[index] = spage; 2520 spage->sblock = sblock; 2521 spage->dev = dev; 2522 spage->flags = flags; 2523 spage->generation = gen; 2524 spage->logical = logical; 2525 spage->physical = physical; 2526 spage->physical_for_dev_replace = physical_for_dev_replace; 2527 spage->mirror_num = mirror_num; 2528 if (csum) { 2529 spage->have_csum = 1; 2530 memcpy(spage->csum, csum, sctx->csum_size); 2531 } else { 2532 spage->have_csum = 0; 2533 } 2534 sblock->page_count++; 2535 spage->page = alloc_page(GFP_KERNEL); 2536 if (!spage->page) 2537 goto leave_nomem; 2538 len -= l; 2539 logical += l; 2540 physical += l; 2541 physical_for_dev_replace += l; 2542 } 2543 2544 WARN_ON(sblock->page_count == 0); 2545 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { 2546 /* 2547 * This case should only be hit for RAID 5/6 device replace. See 2548 * the comment in scrub_missing_raid56_pages() for details. 
2549 */ 2550 scrub_missing_raid56_pages(sblock); 2551 } else { 2552 for (index = 0; index < sblock->page_count; index++) { 2553 struct scrub_page *spage = sblock->pagev[index]; 2554 int ret; 2555 2556 ret = scrub_add_page_to_rd_bio(sctx, spage); 2557 if (ret) { 2558 scrub_block_put(sblock); 2559 return ret; 2560 } 2561 } 2562 2563 if (force) 2564 scrub_submit(sctx); 2565 } 2566 2567 /* last one frees, either here or in bio completion for last page */ 2568 scrub_block_put(sblock); 2569 return 0; 2570 } 2571 2572 static void scrub_bio_end_io(struct bio *bio) 2573 { 2574 struct scrub_bio *sbio = bio->bi_private; 2575 struct btrfs_fs_info *fs_info = sbio->dev->fs_info; 2576 2577 sbio->status = bio->bi_status; 2578 sbio->bio = bio; 2579 2580 btrfs_queue_work(fs_info->scrub_workers, &sbio->work); 2581 } 2582 2583 static void scrub_bio_end_io_worker(struct btrfs_work *work) 2584 { 2585 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2586 struct scrub_ctx *sctx = sbio->sctx; 2587 int i; 2588 2589 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); 2590 if (sbio->status) { 2591 for (i = 0; i < sbio->page_count; i++) { 2592 struct scrub_page *spage = sbio->pagev[i]; 2593 2594 spage->io_error = 1; 2595 spage->sblock->no_io_error_seen = 0; 2596 } 2597 } 2598 2599 /* now complete the scrub_block items that have all pages completed */ 2600 for (i = 0; i < sbio->page_count; i++) { 2601 struct scrub_page *spage = sbio->pagev[i]; 2602 struct scrub_block *sblock = spage->sblock; 2603 2604 if (atomic_dec_and_test(&sblock->outstanding_pages)) 2605 scrub_block_complete(sblock); 2606 scrub_block_put(sblock); 2607 } 2608 2609 bio_put(sbio->bio); 2610 sbio->bio = NULL; 2611 spin_lock(&sctx->list_lock); 2612 sbio->next_free = sctx->first_free; 2613 sctx->first_free = sbio->index; 2614 spin_unlock(&sctx->list_lock); 2615 2616 if (sctx->is_dev_replace && sctx->flush_all_writes) { 2617 mutex_lock(&sctx->wr_lock); 2618 scrub_wr_submit(sctx); 2619 mutex_unlock(&sctx->wr_lock); 2620 } 2621 2622 scrub_pending_bio_dec(sctx); 2623 } 2624 2625 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, 2626 unsigned long *bitmap, 2627 u64 start, u64 len) 2628 { 2629 u64 offset; 2630 u64 nsectors64; 2631 u32 nsectors; 2632 int sectorsize = sparity->sctx->fs_info->sectorsize; 2633 2634 if (len >= sparity->stripe_len) { 2635 bitmap_set(bitmap, 0, sparity->nsectors); 2636 return; 2637 } 2638 2639 start -= sparity->logic_start; 2640 start = div64_u64_rem(start, sparity->stripe_len, &offset); 2641 offset = div_u64(offset, sectorsize); 2642 nsectors64 = div_u64(len, sectorsize); 2643 2644 ASSERT(nsectors64 < UINT_MAX); 2645 nsectors = (u32)nsectors64; 2646 2647 if (offset + nsectors <= sparity->nsectors) { 2648 bitmap_set(bitmap, offset, nsectors); 2649 return; 2650 } 2651 2652 bitmap_set(bitmap, offset, sparity->nsectors - offset); 2653 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset)); 2654 } 2655 2656 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, 2657 u64 start, u64 len) 2658 { 2659 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); 2660 } 2661 2662 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, 2663 u64 start, u64 len) 2664 { 2665 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); 2666 } 2667 2668 static void scrub_block_complete(struct scrub_block *sblock) 2669 { 2670 int corrupted = 0; 2671 2672 if (!sblock->no_io_error_seen) { 2673 corrupted = 1; 2674 scrub_handle_errored_block(sblock); 2675 } else 
{ 2676 /* 2677 * if has checksum error, write via repair mechanism in 2678 * dev replace case, otherwise write here in dev replace 2679 * case. 2680 */ 2681 corrupted = scrub_checksum(sblock); 2682 if (!corrupted && sblock->sctx->is_dev_replace) 2683 scrub_write_block_to_dev_replace(sblock); 2684 } 2685 2686 if (sblock->sparity && corrupted && !sblock->data_corrected) { 2687 u64 start = sblock->pagev[0]->logical; 2688 u64 end = sblock->pagev[sblock->page_count - 1]->logical + 2689 PAGE_SIZE; 2690 2691 scrub_parity_mark_sectors_error(sblock->sparity, 2692 start, end - start); 2693 } 2694 } 2695 2696 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) 2697 { 2698 struct btrfs_ordered_sum *sum = NULL; 2699 unsigned long index; 2700 unsigned long num_sectors; 2701 2702 while (!list_empty(&sctx->csum_list)) { 2703 sum = list_first_entry(&sctx->csum_list, 2704 struct btrfs_ordered_sum, list); 2705 if (sum->bytenr > logical) 2706 return 0; 2707 if (sum->bytenr + sum->len > logical) 2708 break; 2709 2710 ++sctx->stat.csum_discards; 2711 list_del(&sum->list); 2712 kfree(sum); 2713 sum = NULL; 2714 } 2715 if (!sum) 2716 return 0; 2717 2718 index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize); 2719 ASSERT(index < UINT_MAX); 2720 2721 num_sectors = sum->len / sctx->fs_info->sectorsize; 2722 memcpy(csum, sum->sums + index, sctx->csum_size); 2723 if (index == num_sectors - 1) { 2724 list_del(&sum->list); 2725 kfree(sum); 2726 } 2727 return 1; 2728 } 2729 2730 /* scrub extent tries to collect up to 64 kB for each bio */ 2731 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, 2732 u64 physical, struct btrfs_device *dev, u64 flags, 2733 u64 gen, int mirror_num, u64 physical_for_dev_replace) 2734 { 2735 int ret; 2736 u8 csum[BTRFS_CSUM_SIZE]; 2737 u32 blocksize; 2738 2739 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2740 blocksize = sctx->fs_info->sectorsize; 2741 spin_lock(&sctx->stat_lock); 2742 sctx->stat.data_extents_scrubbed++; 2743 sctx->stat.data_bytes_scrubbed += len; 2744 spin_unlock(&sctx->stat_lock); 2745 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2746 blocksize = sctx->fs_info->nodesize; 2747 spin_lock(&sctx->stat_lock); 2748 sctx->stat.tree_extents_scrubbed++; 2749 sctx->stat.tree_bytes_scrubbed += len; 2750 spin_unlock(&sctx->stat_lock); 2751 } else { 2752 blocksize = sctx->fs_info->sectorsize; 2753 WARN_ON(1); 2754 } 2755 2756 while (len) { 2757 u64 l = min_t(u64, len, blocksize); 2758 int have_csum = 0; 2759 2760 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2761 /* push csums to sbio */ 2762 have_csum = scrub_find_csum(sctx, logical, csum); 2763 if (have_csum == 0) 2764 ++sctx->stat.no_csum; 2765 if (sctx->is_dev_replace && !have_csum) { 2766 ret = copy_nocow_pages(sctx, logical, l, 2767 mirror_num, 2768 physical_for_dev_replace); 2769 goto behind_scrub_pages; 2770 } 2771 } 2772 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, 2773 mirror_num, have_csum ? 
csum : NULL, 0, 2774 physical_for_dev_replace); 2775 behind_scrub_pages: 2776 if (ret) 2777 return ret; 2778 len -= l; 2779 logical += l; 2780 physical += l; 2781 physical_for_dev_replace += l; 2782 } 2783 return 0; 2784 } 2785 2786 static int scrub_pages_for_parity(struct scrub_parity *sparity, 2787 u64 logical, u64 len, 2788 u64 physical, struct btrfs_device *dev, 2789 u64 flags, u64 gen, int mirror_num, u8 *csum) 2790 { 2791 struct scrub_ctx *sctx = sparity->sctx; 2792 struct scrub_block *sblock; 2793 int index; 2794 2795 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); 2796 if (!sblock) { 2797 spin_lock(&sctx->stat_lock); 2798 sctx->stat.malloc_errors++; 2799 spin_unlock(&sctx->stat_lock); 2800 return -ENOMEM; 2801 } 2802 2803 /* one ref inside this function, plus one for each page added to 2804 * a bio later on */ 2805 refcount_set(&sblock->refs, 1); 2806 sblock->sctx = sctx; 2807 sblock->no_io_error_seen = 1; 2808 sblock->sparity = sparity; 2809 scrub_parity_get(sparity); 2810 2811 for (index = 0; len > 0; index++) { 2812 struct scrub_page *spage; 2813 u64 l = min_t(u64, len, PAGE_SIZE); 2814 2815 spage = kzalloc(sizeof(*spage), GFP_KERNEL); 2816 if (!spage) { 2817 leave_nomem: 2818 spin_lock(&sctx->stat_lock); 2819 sctx->stat.malloc_errors++; 2820 spin_unlock(&sctx->stat_lock); 2821 scrub_block_put(sblock); 2822 return -ENOMEM; 2823 } 2824 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 2825 /* For scrub block */ 2826 scrub_page_get(spage); 2827 sblock->pagev[index] = spage; 2828 /* For scrub parity */ 2829 scrub_page_get(spage); 2830 list_add_tail(&spage->list, &sparity->spages); 2831 spage->sblock = sblock; 2832 spage->dev = dev; 2833 spage->flags = flags; 2834 spage->generation = gen; 2835 spage->logical = logical; 2836 spage->physical = physical; 2837 spage->mirror_num = mirror_num; 2838 if (csum) { 2839 spage->have_csum = 1; 2840 memcpy(spage->csum, csum, sctx->csum_size); 2841 } else { 2842 spage->have_csum = 0; 2843 } 2844 sblock->page_count++; 2845 spage->page = alloc_page(GFP_KERNEL); 2846 if (!spage->page) 2847 goto leave_nomem; 2848 len -= l; 2849 logical += l; 2850 physical += l; 2851 } 2852 2853 WARN_ON(sblock->page_count == 0); 2854 for (index = 0; index < sblock->page_count; index++) { 2855 struct scrub_page *spage = sblock->pagev[index]; 2856 int ret; 2857 2858 ret = scrub_add_page_to_rd_bio(sctx, spage); 2859 if (ret) { 2860 scrub_block_put(sblock); 2861 return ret; 2862 } 2863 } 2864 2865 /* last one frees, either here or in bio completion for last page */ 2866 scrub_block_put(sblock); 2867 return 0; 2868 } 2869 2870 static int scrub_extent_for_parity(struct scrub_parity *sparity, 2871 u64 logical, u64 len, 2872 u64 physical, struct btrfs_device *dev, 2873 u64 flags, u64 gen, int mirror_num) 2874 { 2875 struct scrub_ctx *sctx = sparity->sctx; 2876 int ret; 2877 u8 csum[BTRFS_CSUM_SIZE]; 2878 u32 blocksize; 2879 2880 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { 2881 scrub_parity_mark_sectors_error(sparity, logical, len); 2882 return 0; 2883 } 2884 2885 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2886 blocksize = sctx->fs_info->sectorsize; 2887 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2888 blocksize = sctx->fs_info->nodesize; 2889 } else { 2890 blocksize = sctx->fs_info->sectorsize; 2891 WARN_ON(1); 2892 } 2893 2894 while (len) { 2895 u64 l = min_t(u64, len, blocksize); 2896 int have_csum = 0; 2897 2898 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2899 /* push csums to sbio */ 2900 have_csum = scrub_find_csum(sctx, logical, csum); 2901 if (have_csum == 0) 2902 
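			/*
			 * Data without a checksum (e.g. nodatasum) cannot be
			 * verified here, so the range is skipped instead of
			 * being read.
			 */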
goto skip; 2903 } 2904 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev, 2905 flags, gen, mirror_num, 2906 have_csum ? csum : NULL); 2907 if (ret) 2908 return ret; 2909 skip: 2910 len -= l; 2911 logical += l; 2912 physical += l; 2913 } 2914 return 0; 2915 } 2916 2917 /* 2918 * Given a physical address, this will calculate it's 2919 * logical offset. if this is a parity stripe, it will return 2920 * the most left data stripe's logical offset. 2921 * 2922 * return 0 if it is a data stripe, 1 means parity stripe. 2923 */ 2924 static int get_raid56_logic_offset(u64 physical, int num, 2925 struct map_lookup *map, u64 *offset, 2926 u64 *stripe_start) 2927 { 2928 int i; 2929 int j = 0; 2930 u64 stripe_nr; 2931 u64 last_offset; 2932 u32 stripe_index; 2933 u32 rot; 2934 2935 last_offset = (physical - map->stripes[num].physical) * 2936 nr_data_stripes(map); 2937 if (stripe_start) 2938 *stripe_start = last_offset; 2939 2940 *offset = last_offset; 2941 for (i = 0; i < nr_data_stripes(map); i++) { 2942 *offset = last_offset + i * map->stripe_len; 2943 2944 stripe_nr = div64_u64(*offset, map->stripe_len); 2945 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); 2946 2947 /* Work out the disk rotation on this stripe-set */ 2948 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); 2949 /* calculate which stripe this data locates */ 2950 rot += i; 2951 stripe_index = rot % map->num_stripes; 2952 if (stripe_index == num) 2953 return 0; 2954 if (stripe_index < num) 2955 j++; 2956 } 2957 *offset = last_offset + j * map->stripe_len; 2958 return 1; 2959 } 2960 2961 static void scrub_free_parity(struct scrub_parity *sparity) 2962 { 2963 struct scrub_ctx *sctx = sparity->sctx; 2964 struct scrub_page *curr, *next; 2965 int nbits; 2966 2967 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); 2968 if (nbits) { 2969 spin_lock(&sctx->stat_lock); 2970 sctx->stat.read_errors += nbits; 2971 sctx->stat.uncorrectable_errors += nbits; 2972 spin_unlock(&sctx->stat_lock); 2973 } 2974 2975 list_for_each_entry_safe(curr, next, &sparity->spages, list) { 2976 list_del_init(&curr->list); 2977 scrub_page_put(curr); 2978 } 2979 2980 kfree(sparity); 2981 } 2982 2983 static void scrub_parity_bio_endio_worker(struct btrfs_work *work) 2984 { 2985 struct scrub_parity *sparity = container_of(work, struct scrub_parity, 2986 work); 2987 struct scrub_ctx *sctx = sparity->sctx; 2988 2989 scrub_free_parity(sparity); 2990 scrub_pending_bio_dec(sctx); 2991 } 2992 2993 static void scrub_parity_bio_endio(struct bio *bio) 2994 { 2995 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; 2996 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; 2997 2998 if (bio->bi_status) 2999 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, 3000 sparity->nsectors); 3001 3002 bio_put(bio); 3003 3004 btrfs_init_work(&sparity->work, btrfs_scrubparity_helper, 3005 scrub_parity_bio_endio_worker, NULL, NULL); 3006 btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work); 3007 } 3008 3009 static void scrub_parity_check_and_repair(struct scrub_parity *sparity) 3010 { 3011 struct scrub_ctx *sctx = sparity->sctx; 3012 struct btrfs_fs_info *fs_info = sctx->fs_info; 3013 struct bio *bio; 3014 struct btrfs_raid_bio *rbio; 3015 struct btrfs_bio *bbio = NULL; 3016 u64 length; 3017 int ret; 3018 3019 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap, 3020 sparity->nsectors)) 3021 goto out; 3022 3023 length = sparity->logic_end - sparity->logic_start; 3024 3025 
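	/*
	 * At this point dbitmap holds only the sectors that contain data
	 * and were read back without errors (bitmap_andnot() above stripped
	 * the ebitmap sectors).  Those are handed to the scrub rbio below so
	 * the parity covering them can be verified and rewritten if needed;
	 * sectors left in ebitmap are accounted as unrecoverable in
	 * scrub_free_parity().
	 */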
btrfs_bio_counter_inc_blocked(fs_info); 3026 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, 3027 &length, &bbio); 3028 if (ret || !bbio || !bbio->raid_map) 3029 goto bbio_out; 3030 3031 bio = btrfs_io_bio_alloc(0); 3032 bio->bi_iter.bi_sector = sparity->logic_start >> 9; 3033 bio->bi_private = sparity; 3034 bio->bi_end_io = scrub_parity_bio_endio; 3035 3036 rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio, 3037 length, sparity->scrub_dev, 3038 sparity->dbitmap, 3039 sparity->nsectors); 3040 if (!rbio) 3041 goto rbio_out; 3042 3043 scrub_pending_bio_inc(sctx); 3044 raid56_parity_submit_scrub_rbio(rbio); 3045 return; 3046 3047 rbio_out: 3048 bio_put(bio); 3049 bbio_out: 3050 btrfs_bio_counter_dec(fs_info); 3051 btrfs_put_bbio(bbio); 3052 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, 3053 sparity->nsectors); 3054 spin_lock(&sctx->stat_lock); 3055 sctx->stat.malloc_errors++; 3056 spin_unlock(&sctx->stat_lock); 3057 out: 3058 scrub_free_parity(sparity); 3059 } 3060 3061 static inline int scrub_calc_parity_bitmap_len(int nsectors) 3062 { 3063 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long); 3064 } 3065 3066 static void scrub_parity_get(struct scrub_parity *sparity) 3067 { 3068 refcount_inc(&sparity->refs); 3069 } 3070 3071 static void scrub_parity_put(struct scrub_parity *sparity) 3072 { 3073 if (!refcount_dec_and_test(&sparity->refs)) 3074 return; 3075 3076 scrub_parity_check_and_repair(sparity); 3077 } 3078 3079 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, 3080 struct map_lookup *map, 3081 struct btrfs_device *sdev, 3082 struct btrfs_path *path, 3083 u64 logic_start, 3084 u64 logic_end) 3085 { 3086 struct btrfs_fs_info *fs_info = sctx->fs_info; 3087 struct btrfs_root *root = fs_info->extent_root; 3088 struct btrfs_root *csum_root = fs_info->csum_root; 3089 struct btrfs_extent_item *extent; 3090 struct btrfs_bio *bbio = NULL; 3091 u64 flags; 3092 int ret; 3093 int slot; 3094 struct extent_buffer *l; 3095 struct btrfs_key key; 3096 u64 generation; 3097 u64 extent_logical; 3098 u64 extent_physical; 3099 u64 extent_len; 3100 u64 mapped_length; 3101 struct btrfs_device *extent_dev; 3102 struct scrub_parity *sparity; 3103 int nsectors; 3104 int bitmap_len; 3105 int extent_mirror_num; 3106 int stop_loop = 0; 3107 3108 nsectors = div_u64(map->stripe_len, fs_info->sectorsize); 3109 bitmap_len = scrub_calc_parity_bitmap_len(nsectors); 3110 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, 3111 GFP_NOFS); 3112 if (!sparity) { 3113 spin_lock(&sctx->stat_lock); 3114 sctx->stat.malloc_errors++; 3115 spin_unlock(&sctx->stat_lock); 3116 return -ENOMEM; 3117 } 3118 3119 sparity->stripe_len = map->stripe_len; 3120 sparity->nsectors = nsectors; 3121 sparity->sctx = sctx; 3122 sparity->scrub_dev = sdev; 3123 sparity->logic_start = logic_start; 3124 sparity->logic_end = logic_end; 3125 refcount_set(&sparity->refs, 1); 3126 INIT_LIST_HEAD(&sparity->spages); 3127 sparity->dbitmap = sparity->bitmap; 3128 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; 3129 3130 ret = 0; 3131 while (logic_start < logic_end) { 3132 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 3133 key.type = BTRFS_METADATA_ITEM_KEY; 3134 else 3135 key.type = BTRFS_EXTENT_ITEM_KEY; 3136 key.objectid = logic_start; 3137 key.offset = (u64)-1; 3138 3139 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3140 if (ret < 0) 3141 goto out; 3142 3143 if (ret > 0) { 3144 ret = btrfs_previous_extent_item(root, path, 0); 3145 if (ret < 0) 3146 
goto out; 3147 if (ret > 0) { 3148 btrfs_release_path(path); 3149 ret = btrfs_search_slot(NULL, root, &key, 3150 path, 0, 0); 3151 if (ret < 0) 3152 goto out; 3153 } 3154 } 3155 3156 stop_loop = 0; 3157 while (1) { 3158 u64 bytes; 3159 3160 l = path->nodes[0]; 3161 slot = path->slots[0]; 3162 if (slot >= btrfs_header_nritems(l)) { 3163 ret = btrfs_next_leaf(root, path); 3164 if (ret == 0) 3165 continue; 3166 if (ret < 0) 3167 goto out; 3168 3169 stop_loop = 1; 3170 break; 3171 } 3172 btrfs_item_key_to_cpu(l, &key, slot); 3173 3174 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3175 key.type != BTRFS_METADATA_ITEM_KEY) 3176 goto next; 3177 3178 if (key.type == BTRFS_METADATA_ITEM_KEY) 3179 bytes = fs_info->nodesize; 3180 else 3181 bytes = key.offset; 3182 3183 if (key.objectid + bytes <= logic_start) 3184 goto next; 3185 3186 if (key.objectid >= logic_end) { 3187 stop_loop = 1; 3188 break; 3189 } 3190 3191 while (key.objectid >= logic_start + map->stripe_len) 3192 logic_start += map->stripe_len; 3193 3194 extent = btrfs_item_ptr(l, slot, 3195 struct btrfs_extent_item); 3196 flags = btrfs_extent_flags(l, extent); 3197 generation = btrfs_extent_generation(l, extent); 3198 3199 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && 3200 (key.objectid < logic_start || 3201 key.objectid + bytes > 3202 logic_start + map->stripe_len)) { 3203 btrfs_err(fs_info, 3204 "scrub: tree block %llu spanning stripes, ignored. logical=%llu", 3205 key.objectid, logic_start); 3206 spin_lock(&sctx->stat_lock); 3207 sctx->stat.uncorrectable_errors++; 3208 spin_unlock(&sctx->stat_lock); 3209 goto next; 3210 } 3211 again: 3212 extent_logical = key.objectid; 3213 extent_len = bytes; 3214 3215 if (extent_logical < logic_start) { 3216 extent_len -= logic_start - extent_logical; 3217 extent_logical = logic_start; 3218 } 3219 3220 if (extent_logical + extent_len > 3221 logic_start + map->stripe_len) 3222 extent_len = logic_start + map->stripe_len - 3223 extent_logical; 3224 3225 scrub_parity_mark_sectors_data(sparity, extent_logical, 3226 extent_len); 3227 3228 mapped_length = extent_len; 3229 bbio = NULL; 3230 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, 3231 extent_logical, &mapped_length, &bbio, 3232 0); 3233 if (!ret) { 3234 if (!bbio || mapped_length < extent_len) 3235 ret = -EIO; 3236 } 3237 if (ret) { 3238 btrfs_put_bbio(bbio); 3239 goto out; 3240 } 3241 extent_physical = bbio->stripes[0].physical; 3242 extent_mirror_num = bbio->mirror_num; 3243 extent_dev = bbio->stripes[0].dev; 3244 btrfs_put_bbio(bbio); 3245 3246 ret = btrfs_lookup_csums_range(csum_root, 3247 extent_logical, 3248 extent_logical + extent_len - 1, 3249 &sctx->csum_list, 1); 3250 if (ret) 3251 goto out; 3252 3253 ret = scrub_extent_for_parity(sparity, extent_logical, 3254 extent_len, 3255 extent_physical, 3256 extent_dev, flags, 3257 generation, 3258 extent_mirror_num); 3259 3260 scrub_free_csums(sctx); 3261 3262 if (ret) 3263 goto out; 3264 3265 if (extent_logical + extent_len < 3266 key.objectid + bytes) { 3267 logic_start += map->stripe_len; 3268 3269 if (logic_start >= logic_end) { 3270 stop_loop = 1; 3271 break; 3272 } 3273 3274 if (logic_start < key.objectid + bytes) { 3275 cond_resched(); 3276 goto again; 3277 } 3278 } 3279 next: 3280 path->slots[0]++; 3281 } 3282 3283 btrfs_release_path(path); 3284 3285 if (stop_loop) 3286 break; 3287 3288 logic_start += map->stripe_len; 3289 } 3290 out: 3291 if (ret < 0) 3292 scrub_parity_mark_sectors_error(sparity, logic_start, 3293 logic_end - logic_start); 3294 scrub_parity_put(sparity); 3295 scrub_submit(sctx); 
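	/*
	 * scrub_parity_put() above only dropped this function's reference;
	 * the actual parity check (scrub_parity_check_and_repair()) runs once
	 * the last scrub_block referencing sparity is released, i.e. after
	 * the read bios flushed here and below have completed.
	 */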
3296 mutex_lock(&sctx->wr_lock); 3297 scrub_wr_submit(sctx); 3298 mutex_unlock(&sctx->wr_lock); 3299 3300 btrfs_release_path(path); 3301 return ret < 0 ? ret : 0; 3302 } 3303 3304 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 3305 struct map_lookup *map, 3306 struct btrfs_device *scrub_dev, 3307 int num, u64 base, u64 length, 3308 int is_dev_replace) 3309 { 3310 struct btrfs_path *path, *ppath; 3311 struct btrfs_fs_info *fs_info = sctx->fs_info; 3312 struct btrfs_root *root = fs_info->extent_root; 3313 struct btrfs_root *csum_root = fs_info->csum_root; 3314 struct btrfs_extent_item *extent; 3315 struct blk_plug plug; 3316 u64 flags; 3317 int ret; 3318 int slot; 3319 u64 nstripes; 3320 struct extent_buffer *l; 3321 u64 physical; 3322 u64 logical; 3323 u64 logic_end; 3324 u64 physical_end; 3325 u64 generation; 3326 int mirror_num; 3327 struct reada_control *reada1; 3328 struct reada_control *reada2; 3329 struct btrfs_key key; 3330 struct btrfs_key key_end; 3331 u64 increment = map->stripe_len; 3332 u64 offset; 3333 u64 extent_logical; 3334 u64 extent_physical; 3335 u64 extent_len; 3336 u64 stripe_logical; 3337 u64 stripe_end; 3338 struct btrfs_device *extent_dev; 3339 int extent_mirror_num; 3340 int stop_loop = 0; 3341 3342 physical = map->stripes[num].physical; 3343 offset = 0; 3344 nstripes = div64_u64(length, map->stripe_len); 3345 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3346 offset = map->stripe_len * num; 3347 increment = map->stripe_len * map->num_stripes; 3348 mirror_num = 1; 3349 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3350 int factor = map->num_stripes / map->sub_stripes; 3351 offset = map->stripe_len * (num / map->sub_stripes); 3352 increment = map->stripe_len * factor; 3353 mirror_num = num % map->sub_stripes + 1; 3354 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3355 increment = map->stripe_len; 3356 mirror_num = num % map->num_stripes + 1; 3357 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3358 increment = map->stripe_len; 3359 mirror_num = num % map->num_stripes + 1; 3360 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3361 get_raid56_logic_offset(physical, num, map, &offset, NULL); 3362 increment = map->stripe_len * nr_data_stripes(map); 3363 mirror_num = 1; 3364 } else { 3365 increment = map->stripe_len; 3366 mirror_num = 1; 3367 } 3368 3369 path = btrfs_alloc_path(); 3370 if (!path) 3371 return -ENOMEM; 3372 3373 ppath = btrfs_alloc_path(); 3374 if (!ppath) { 3375 btrfs_free_path(path); 3376 return -ENOMEM; 3377 } 3378 3379 /* 3380 * work on commit root. The related disk blocks are static as 3381 * long as COW is applied. This means, it is save to rewrite 3382 * them to repair disk errors without any race conditions 3383 */ 3384 path->search_commit_root = 1; 3385 path->skip_locking = 1; 3386 3387 ppath->search_commit_root = 1; 3388 ppath->skip_locking = 1; 3389 /* 3390 * trigger the readahead for extent tree csum tree and wait for 3391 * completion. 
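 * Readahead is best effort: btrfs_reada_add() may return an ERR_PTR, in
 * which case we simply do not wait for it below.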
During readahead, the scrub is officially paused 3392 * to not hold off transaction commits 3393 */ 3394 logical = base + offset; 3395 physical_end = physical + nstripes * map->stripe_len; 3396 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3397 get_raid56_logic_offset(physical_end, num, 3398 map, &logic_end, NULL); 3399 logic_end += base; 3400 } else { 3401 logic_end = logical + increment * nstripes; 3402 } 3403 wait_event(sctx->list_wait, 3404 atomic_read(&sctx->bios_in_flight) == 0); 3405 scrub_blocked_if_needed(fs_info); 3406 3407 /* FIXME it might be better to start readahead at commit root */ 3408 key.objectid = logical; 3409 key.type = BTRFS_EXTENT_ITEM_KEY; 3410 key.offset = (u64)0; 3411 key_end.objectid = logic_end; 3412 key_end.type = BTRFS_METADATA_ITEM_KEY; 3413 key_end.offset = (u64)-1; 3414 reada1 = btrfs_reada_add(root, &key, &key_end); 3415 3416 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 3417 key.type = BTRFS_EXTENT_CSUM_KEY; 3418 key.offset = logical; 3419 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 3420 key_end.type = BTRFS_EXTENT_CSUM_KEY; 3421 key_end.offset = logic_end; 3422 reada2 = btrfs_reada_add(csum_root, &key, &key_end); 3423 3424 if (!IS_ERR(reada1)) 3425 btrfs_reada_wait(reada1); 3426 if (!IS_ERR(reada2)) 3427 btrfs_reada_wait(reada2); 3428 3429 3430 /* 3431 * collect all data csums for the stripe to avoid seeking during 3432 * the scrub. This might currently (crc32) end up to be about 1MB 3433 */ 3434 blk_start_plug(&plug); 3435 3436 /* 3437 * now find all extents for each stripe and scrub them 3438 */ 3439 ret = 0; 3440 while (physical < physical_end) { 3441 /* 3442 * canceled? 3443 */ 3444 if (atomic_read(&fs_info->scrub_cancel_req) || 3445 atomic_read(&sctx->cancel_req)) { 3446 ret = -ECANCELED; 3447 goto out; 3448 } 3449 /* 3450 * check to see if we have to pause 3451 */ 3452 if (atomic_read(&fs_info->scrub_pause_req)) { 3453 /* push queued extents */ 3454 sctx->flush_all_writes = true; 3455 scrub_submit(sctx); 3456 mutex_lock(&sctx->wr_lock); 3457 scrub_wr_submit(sctx); 3458 mutex_unlock(&sctx->wr_lock); 3459 wait_event(sctx->list_wait, 3460 atomic_read(&sctx->bios_in_flight) == 0); 3461 sctx->flush_all_writes = false; 3462 scrub_blocked_if_needed(fs_info); 3463 } 3464 3465 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3466 ret = get_raid56_logic_offset(physical, num, map, 3467 &logical, 3468 &stripe_logical); 3469 logical += base; 3470 if (ret) { 3471 /* it is parity strip */ 3472 stripe_logical += base; 3473 stripe_end = stripe_logical + increment; 3474 ret = scrub_raid56_parity(sctx, map, scrub_dev, 3475 ppath, stripe_logical, 3476 stripe_end); 3477 if (ret) 3478 goto out; 3479 goto skip; 3480 } 3481 } 3482 3483 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 3484 key.type = BTRFS_METADATA_ITEM_KEY; 3485 else 3486 key.type = BTRFS_EXTENT_ITEM_KEY; 3487 key.objectid = logical; 3488 key.offset = (u64)-1; 3489 3490 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3491 if (ret < 0) 3492 goto out; 3493 3494 if (ret > 0) { 3495 ret = btrfs_previous_extent_item(root, path, 0); 3496 if (ret < 0) 3497 goto out; 3498 if (ret > 0) { 3499 /* there's no smaller item, so stick with the 3500 * larger one */ 3501 btrfs_release_path(path); 3502 ret = btrfs_search_slot(NULL, root, &key, 3503 path, 0, 0); 3504 if (ret < 0) 3505 goto out; 3506 } 3507 } 3508 3509 stop_loop = 0; 3510 while (1) { 3511 u64 bytes; 3512 3513 l = path->nodes[0]; 3514 slot = path->slots[0]; 3515 if (slot >= btrfs_header_nritems(l)) { 3516 ret = btrfs_next_leaf(root, path); 3517 if 
(ret == 0) 3518 continue; 3519 if (ret < 0) 3520 goto out; 3521 3522 stop_loop = 1; 3523 break; 3524 } 3525 btrfs_item_key_to_cpu(l, &key, slot); 3526 3527 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3528 key.type != BTRFS_METADATA_ITEM_KEY) 3529 goto next; 3530 3531 if (key.type == BTRFS_METADATA_ITEM_KEY) 3532 bytes = fs_info->nodesize; 3533 else 3534 bytes = key.offset; 3535 3536 if (key.objectid + bytes <= logical) 3537 goto next; 3538 3539 if (key.objectid >= logical + map->stripe_len) { 3540 /* out of this device extent */ 3541 if (key.objectid >= logic_end) 3542 stop_loop = 1; 3543 break; 3544 } 3545 3546 extent = btrfs_item_ptr(l, slot, 3547 struct btrfs_extent_item); 3548 flags = btrfs_extent_flags(l, extent); 3549 generation = btrfs_extent_generation(l, extent); 3550 3551 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && 3552 (key.objectid < logical || 3553 key.objectid + bytes > 3554 logical + map->stripe_len)) { 3555 btrfs_err(fs_info, 3556 "scrub: tree block %llu spanning stripes, ignored. logical=%llu", 3557 key.objectid, logical); 3558 spin_lock(&sctx->stat_lock); 3559 sctx->stat.uncorrectable_errors++; 3560 spin_unlock(&sctx->stat_lock); 3561 goto next; 3562 } 3563 3564 again: 3565 extent_logical = key.objectid; 3566 extent_len = bytes; 3567 3568 /* 3569 * trim extent to this stripe 3570 */ 3571 if (extent_logical < logical) { 3572 extent_len -= logical - extent_logical; 3573 extent_logical = logical; 3574 } 3575 if (extent_logical + extent_len > 3576 logical + map->stripe_len) { 3577 extent_len = logical + map->stripe_len - 3578 extent_logical; 3579 } 3580 3581 extent_physical = extent_logical - logical + physical; 3582 extent_dev = scrub_dev; 3583 extent_mirror_num = mirror_num; 3584 if (is_dev_replace) 3585 scrub_remap_extent(fs_info, extent_logical, 3586 extent_len, &extent_physical, 3587 &extent_dev, 3588 &extent_mirror_num); 3589 3590 ret = btrfs_lookup_csums_range(csum_root, 3591 extent_logical, 3592 extent_logical + 3593 extent_len - 1, 3594 &sctx->csum_list, 1); 3595 if (ret) 3596 goto out; 3597 3598 ret = scrub_extent(sctx, extent_logical, extent_len, 3599 extent_physical, extent_dev, flags, 3600 generation, extent_mirror_num, 3601 extent_logical - logical + physical); 3602 3603 scrub_free_csums(sctx); 3604 3605 if (ret) 3606 goto out; 3607 3608 if (extent_logical + extent_len < 3609 key.objectid + bytes) { 3610 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3611 /* 3612 * loop until we find next data stripe 3613 * or we have finished all stripes. 
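 *
 * A hypothetical RAID5 layout with three devices (two data stripes plus
 * one rotating parity stripe) illustrates the loop: stepping physical by
 * stripe_len, get_raid56_logic_offset() returns 0 while this device
 * holds a data stripe and 1 when it holds the parity, in which case the
 * whole range [stripe_logical, stripe_logical + increment) is handed to
 * scrub_raid56_parity() before moving on.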
3614 */ 3615 loop: 3616 physical += map->stripe_len; 3617 ret = get_raid56_logic_offset(physical, 3618 num, map, &logical, 3619 &stripe_logical); 3620 logical += base; 3621 3622 if (ret && physical < physical_end) { 3623 stripe_logical += base; 3624 stripe_end = stripe_logical + 3625 increment; 3626 ret = scrub_raid56_parity(sctx, 3627 map, scrub_dev, ppath, 3628 stripe_logical, 3629 stripe_end); 3630 if (ret) 3631 goto out; 3632 goto loop; 3633 } 3634 } else { 3635 physical += map->stripe_len; 3636 logical += increment; 3637 } 3638 if (logical < key.objectid + bytes) { 3639 cond_resched(); 3640 goto again; 3641 } 3642 3643 if (physical >= physical_end) { 3644 stop_loop = 1; 3645 break; 3646 } 3647 } 3648 next: 3649 path->slots[0]++; 3650 } 3651 btrfs_release_path(path); 3652 skip: 3653 logical += increment; 3654 physical += map->stripe_len; 3655 spin_lock(&sctx->stat_lock); 3656 if (stop_loop) 3657 sctx->stat.last_physical = map->stripes[num].physical + 3658 length; 3659 else 3660 sctx->stat.last_physical = physical; 3661 spin_unlock(&sctx->stat_lock); 3662 if (stop_loop) 3663 break; 3664 } 3665 out: 3666 /* push queued extents */ 3667 scrub_submit(sctx); 3668 mutex_lock(&sctx->wr_lock); 3669 scrub_wr_submit(sctx); 3670 mutex_unlock(&sctx->wr_lock); 3671 3672 blk_finish_plug(&plug); 3673 btrfs_free_path(path); 3674 btrfs_free_path(ppath); 3675 return ret < 0 ? ret : 0; 3676 } 3677 3678 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, 3679 struct btrfs_device *scrub_dev, 3680 u64 chunk_offset, u64 length, 3681 u64 dev_offset, 3682 struct btrfs_block_group_cache *cache, 3683 int is_dev_replace) 3684 { 3685 struct btrfs_fs_info *fs_info = sctx->fs_info; 3686 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 3687 struct map_lookup *map; 3688 struct extent_map *em; 3689 int i; 3690 int ret = 0; 3691 3692 read_lock(&map_tree->map_tree.lock); 3693 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 3694 read_unlock(&map_tree->map_tree.lock); 3695 3696 if (!em) { 3697 /* 3698 * Might have been an unused block group deleted by the cleaner 3699 * kthread or relocation. 
3700 */ 3701 spin_lock(&cache->lock); 3702 if (!cache->removed) 3703 ret = -EINVAL; 3704 spin_unlock(&cache->lock); 3705 3706 return ret; 3707 } 3708 3709 map = em->map_lookup; 3710 if (em->start != chunk_offset) 3711 goto out; 3712 3713 if (em->len < length) 3714 goto out; 3715 3716 for (i = 0; i < map->num_stripes; ++i) { 3717 if (map->stripes[i].dev->bdev == scrub_dev->bdev && 3718 map->stripes[i].physical == dev_offset) { 3719 ret = scrub_stripe(sctx, map, scrub_dev, i, 3720 chunk_offset, length, 3721 is_dev_replace); 3722 if (ret) 3723 goto out; 3724 } 3725 } 3726 out: 3727 free_extent_map(em); 3728 3729 return ret; 3730 } 3731 3732 static noinline_for_stack 3733 int scrub_enumerate_chunks(struct scrub_ctx *sctx, 3734 struct btrfs_device *scrub_dev, u64 start, u64 end, 3735 int is_dev_replace) 3736 { 3737 struct btrfs_dev_extent *dev_extent = NULL; 3738 struct btrfs_path *path; 3739 struct btrfs_fs_info *fs_info = sctx->fs_info; 3740 struct btrfs_root *root = fs_info->dev_root; 3741 u64 length; 3742 u64 chunk_offset; 3743 int ret = 0; 3744 int ro_set; 3745 int slot; 3746 struct extent_buffer *l; 3747 struct btrfs_key key; 3748 struct btrfs_key found_key; 3749 struct btrfs_block_group_cache *cache; 3750 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 3751 3752 path = btrfs_alloc_path(); 3753 if (!path) 3754 return -ENOMEM; 3755 3756 path->reada = READA_FORWARD; 3757 path->search_commit_root = 1; 3758 path->skip_locking = 1; 3759 3760 key.objectid = scrub_dev->devid; 3761 key.offset = 0ull; 3762 key.type = BTRFS_DEV_EXTENT_KEY; 3763 3764 while (1) { 3765 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3766 if (ret < 0) 3767 break; 3768 if (ret > 0) { 3769 if (path->slots[0] >= 3770 btrfs_header_nritems(path->nodes[0])) { 3771 ret = btrfs_next_leaf(root, path); 3772 if (ret < 0) 3773 break; 3774 if (ret > 0) { 3775 ret = 0; 3776 break; 3777 } 3778 } else { 3779 ret = 0; 3780 } 3781 } 3782 3783 l = path->nodes[0]; 3784 slot = path->slots[0]; 3785 3786 btrfs_item_key_to_cpu(l, &found_key, slot); 3787 3788 if (found_key.objectid != scrub_dev->devid) 3789 break; 3790 3791 if (found_key.type != BTRFS_DEV_EXTENT_KEY) 3792 break; 3793 3794 if (found_key.offset >= end) 3795 break; 3796 3797 if (found_key.offset < key.offset) 3798 break; 3799 3800 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 3801 length = btrfs_dev_extent_length(l, dev_extent); 3802 3803 if (found_key.offset + length <= start) 3804 goto skip; 3805 3806 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 3807 3808 /* 3809 * get a reference on the corresponding block group to prevent 3810 * the chunk from going away while we scrub it 3811 */ 3812 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3813 3814 /* some chunks are removed but not committed to disk yet, 3815 * continue scrubbing */ 3816 if (!cache) 3817 goto skip; 3818 3819 /* 3820 * we need call btrfs_inc_block_group_ro() with scrubs_paused, 3821 * to avoid deadlock caused by: 3822 * btrfs_inc_block_group_ro() 3823 * -> btrfs_wait_for_commit() 3824 * -> btrfs_commit_transaction() 3825 * -> btrfs_scrub_pause() 3826 */ 3827 scrub_pause_on(fs_info); 3828 ret = btrfs_inc_block_group_ro(fs_info, cache); 3829 if (!ret && is_dev_replace) { 3830 /* 3831 * If we are doing a device replace wait for any tasks 3832 * that started dellaloc right before we set the block 3833 * group to RO mode, as they might have just allocated 3834 * an extent from it or decided they could do a nocow 3835 * write. 
And if any such tasks did that, wait for their 3836 * ordered extents to complete and then commit the 3837 * current transaction, so that we can later see the new 3838 * extent items in the extent tree - the ordered extents 3839 * create delayed data references (for cow writes) when 3840 * they complete, which will be run and insert the 3841 * corresponding extent items into the extent tree when 3842 * we commit the transaction they used when running 3843 * inode.c:btrfs_finish_ordered_io(). We later use 3844 * the commit root of the extent tree to find extents 3845 * to copy from the srcdev into the tgtdev, and we don't 3846 * want to miss any new extents. 3847 */ 3848 btrfs_wait_block_group_reservations(cache); 3849 btrfs_wait_nocow_writers(cache); 3850 ret = btrfs_wait_ordered_roots(fs_info, U64_MAX, 3851 cache->key.objectid, 3852 cache->key.offset); 3853 if (ret > 0) { 3854 struct btrfs_trans_handle *trans; 3855 3856 trans = btrfs_join_transaction(root); 3857 if (IS_ERR(trans)) 3858 ret = PTR_ERR(trans); 3859 else 3860 ret = btrfs_commit_transaction(trans); 3861 if (ret) { 3862 scrub_pause_off(fs_info); 3863 btrfs_put_block_group(cache); 3864 break; 3865 } 3866 } 3867 } 3868 scrub_pause_off(fs_info); 3869 3870 if (ret == 0) { 3871 ro_set = 1; 3872 } else if (ret == -ENOSPC) { 3873 /* 3874 * btrfs_inc_block_group_ro return -ENOSPC when it 3875 * failed in creating new chunk for metadata. 3876 * It is not a problem for scrub/replace, because 3877 * metadata are always cowed, and our scrub paused 3878 * commit_transactions. 3879 */ 3880 ro_set = 0; 3881 } else { 3882 btrfs_warn(fs_info, 3883 "failed setting block group ro: %d", ret); 3884 btrfs_put_block_group(cache); 3885 break; 3886 } 3887 3888 btrfs_dev_replace_lock(&fs_info->dev_replace, 1); 3889 dev_replace->cursor_right = found_key.offset + length; 3890 dev_replace->cursor_left = found_key.offset; 3891 dev_replace->item_needs_writeback = 1; 3892 btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); 3893 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, 3894 found_key.offset, cache, is_dev_replace); 3895 3896 /* 3897 * flush, submit all pending read and write bios, afterwards 3898 * wait for them. 3899 * Note that in the dev replace case, a read request causes 3900 * write requests that are submitted in the read completion 3901 * worker. Therefore in the current situation, it is required 3902 * that all write requests are flushed, so that all read and 3903 * write requests are really completed when bios_in_flight 3904 * changes to 0. 3905 */ 3906 sctx->flush_all_writes = true; 3907 scrub_submit(sctx); 3908 mutex_lock(&sctx->wr_lock); 3909 scrub_wr_submit(sctx); 3910 mutex_unlock(&sctx->wr_lock); 3911 3912 wait_event(sctx->list_wait, 3913 atomic_read(&sctx->bios_in_flight) == 0); 3914 3915 scrub_pause_on(fs_info); 3916 3917 /* 3918 * must be called before we decrease @scrub_paused. 3919 * make sure we don't block transaction commit while 3920 * we are waiting pending workers finished. 
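 * In other words: drain workers_pending while this scrub is still
 * counted as paused, clear flush_all_writes, and only then call
 * scrub_pause_off().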
                btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
                dev_replace->cursor_right = found_key.offset + length;
                dev_replace->cursor_left = found_key.offset;
                dev_replace->item_needs_writeback = 1;
                btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
                ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
                                  found_key.offset, cache, is_dev_replace);

                /*
                 * Flush and submit all pending read and write bios, then wait
                 * for them.
                 * Note that in the dev replace case, a read request causes
                 * write requests that are submitted in the read completion
                 * worker. Therefore in the current situation, it is required
                 * that all write requests are flushed, so that all read and
                 * write requests are really completed when bios_in_flight
                 * changes to 0.
                 */
                sctx->flush_all_writes = true;
                scrub_submit(sctx);
                mutex_lock(&sctx->wr_lock);
                scrub_wr_submit(sctx);
                mutex_unlock(&sctx->wr_lock);

                wait_event(sctx->list_wait,
                           atomic_read(&sctx->bios_in_flight) == 0);

                scrub_pause_on(fs_info);

                /*
                 * This must be done before we decrease @scrub_paused, to make
                 * sure we don't block a transaction commit while we are
                 * waiting for pending workers to finish.
                 */
                wait_event(sctx->list_wait,
                           atomic_read(&sctx->workers_pending) == 0);
                sctx->flush_all_writes = false;

                scrub_pause_off(fs_info);

                btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
                dev_replace->cursor_left = dev_replace->cursor_right;
                dev_replace->item_needs_writeback = 1;
                btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);

                if (ro_set)
                        btrfs_dec_block_group_ro(cache);

                /*
                 * We might have prevented the cleaner kthread from deleting
                 * this block group if it was already unused because we raced
                 * and set it to RO mode first. So add it back to the unused
                 * list, otherwise it might never be deleted unless a manual
                 * balance is triggered or it becomes used and unused again.
                 */
                spin_lock(&cache->lock);
                if (!cache->removed && !cache->ro && cache->reserved == 0 &&
                    btrfs_block_group_used(&cache->item) == 0) {
                        spin_unlock(&cache->lock);
                        spin_lock(&fs_info->unused_bgs_lock);
                        if (list_empty(&cache->bg_list)) {
                                btrfs_get_block_group(cache);
                                list_add_tail(&cache->bg_list,
                                              &fs_info->unused_bgs);
                        }
                        spin_unlock(&fs_info->unused_bgs_lock);
                } else {
                        spin_unlock(&cache->lock);
                }

                btrfs_put_block_group(cache);
                if (ret)
                        break;
                if (is_dev_replace &&
                    atomic64_read(&dev_replace->num_write_errors) > 0) {
                        ret = -EIO;
                        break;
                }
                if (sctx->stat.malloc_errors > 0) {
                        ret = -ENOMEM;
                        break;
                }
skip:
                key.offset = found_key.offset + length;
                btrfs_release_path(path);
        }

        btrfs_free_path(path);

        return ret;
}

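/*
 * Scrub all super block copies of the device that lie within its committed
 * size. Seed devices keep their own generation; everything else is checked
 * against the last committed transaction.
 */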
static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
                                           struct btrfs_device *scrub_dev)
{
        int i;
        u64 bytenr;
        u64 gen;
        int ret;
        struct btrfs_fs_info *fs_info = sctx->fs_info;

        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
                return -EIO;

        /* Seed devices of a new filesystem have their own generation. */
        if (scrub_dev->fs_devices != fs_info->fs_devices)
                gen = scrub_dev->generation;
        else
                gen = fs_info->last_trans_committed;

        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
                if (bytenr + BTRFS_SUPER_INFO_SIZE >
                    scrub_dev->commit_total_bytes)
                        break;

                ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
                                  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
                                  NULL, 1, bytenr);
                if (ret)
                        return ret;
        }
        wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);

        return 0;
}

/*
 * Get a reference count on fs_info->scrub_workers. Start the workers if
 * necessary.
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
                                                int is_dev_replace)
{
        unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
        int max_active = fs_info->thread_pool_size;

        if (fs_info->scrub_workers_refcnt == 0) {
                fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
                                flags, is_dev_replace ? 1 : max_active, 4);
                if (!fs_info->scrub_workers)
                        goto fail_scrub_workers;

                fs_info->scrub_wr_completion_workers =
                        btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
                                              max_active, 2);
                if (!fs_info->scrub_wr_completion_workers)
                        goto fail_scrub_wr_completion_workers;

                fs_info->scrub_nocow_workers =
                        btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0);
                if (!fs_info->scrub_nocow_workers)
                        goto fail_scrub_nocow_workers;
                fs_info->scrub_parity_workers =
                        btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
                                              max_active, 2);
                if (!fs_info->scrub_parity_workers)
                        goto fail_scrub_parity_workers;
        }
        ++fs_info->scrub_workers_refcnt;
        return 0;

fail_scrub_parity_workers:
        btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
fail_scrub_nocow_workers:
        btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
fail_scrub_wr_completion_workers:
        btrfs_destroy_workqueue(fs_info->scrub_workers);
fail_scrub_workers:
        return -ENOMEM;
}

static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
        if (--fs_info->scrub_workers_refcnt == 0) {
                btrfs_destroy_workqueue(fs_info->scrub_workers);
                btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
                btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
                btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
        }
        WARN_ON(fs_info->scrub_workers_refcnt < 0);
}

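/*
 * Scrub the range from @start to @end of the given device, or, when
 * @is_dev_replace is set, copy it to the replace target while verifying it.
 * If @progress is not NULL it receives the final statistics. Returns
 * -EINPROGRESS if a scrub or a device replace is already running on the
 * device.
 */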
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
                    u64 end, struct btrfs_scrub_progress *progress,
                    int readonly, int is_dev_replace)
{
        struct scrub_ctx *sctx;
        int ret;
        struct btrfs_device *dev;
        struct rcu_string *name;

        if (btrfs_fs_closing(fs_info))
                return -EINVAL;

        if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
                /*
                 * In this case scrub is unable to calculate the checksum,
                 * given the way scrub is implemented. Do not handle this
                 * situation at all because it won't ever happen.
                 */
                btrfs_err(fs_info,
                          "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
                          fs_info->nodesize,
                          BTRFS_STRIPE_LEN);
                return -EINVAL;
        }

        if (fs_info->sectorsize != PAGE_SIZE) {
                /* Not supported for data without checksums. */
                btrfs_err_rl(fs_info,
                             "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
                             fs_info->sectorsize, PAGE_SIZE);
                return -EINVAL;
        }

        if (fs_info->nodesize >
            PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
            fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
                /*
                 * Would exhaust the array bounds of the pagev member in
                 * struct scrub_block.
                 */
                btrfs_err(fs_info,
                          "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
                          fs_info->nodesize,
                          SCRUB_MAX_PAGES_PER_BLOCK,
                          fs_info->sectorsize,
                          SCRUB_MAX_PAGES_PER_BLOCK);
                return -EINVAL;
        }

        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        dev = btrfs_find_device(fs_info, devid, NULL, NULL);
        if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
                     !is_dev_replace)) {
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                return -ENODEV;
        }

        if (!is_dev_replace && !readonly &&
            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                rcu_read_lock();
                name = rcu_dereference(dev->name);
                btrfs_err(fs_info, "scrub: device %s is not writable",
                          name->str);
                rcu_read_unlock();
                return -EROFS;
        }

        mutex_lock(&fs_info->scrub_lock);
        if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
            test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
                mutex_unlock(&fs_info->scrub_lock);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                return -EIO;
        }

        btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
        if (dev->scrub_ctx ||
            (!is_dev_replace &&
             btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
                btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
                mutex_unlock(&fs_info->scrub_lock);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                return -EINPROGRESS;
        }
        btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);

        ret = scrub_workers_get(fs_info, is_dev_replace);
        if (ret) {
                mutex_unlock(&fs_info->scrub_lock);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                return ret;
        }

        sctx = scrub_setup_ctx(dev, is_dev_replace);
        if (IS_ERR(sctx)) {
                mutex_unlock(&fs_info->scrub_lock);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
                scrub_workers_put(fs_info);
                return PTR_ERR(sctx);
        }
        sctx->readonly = readonly;
        dev->scrub_ctx = sctx;
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);

        /*
         * By checking @scrub_pause_req here, we can avoid racing between a
         * transaction commit and scrubbing.
         */
        __scrub_blocked_if_needed(fs_info);
        atomic_inc(&fs_info->scrubs_running);
        mutex_unlock(&fs_info->scrub_lock);

        if (!is_dev_replace) {
                /*
                 * By holding the device list mutex, we can kick off writing
                 * supers in a log tree sync.
                 */
                mutex_lock(&fs_info->fs_devices->device_list_mutex);
                ret = scrub_supers(sctx, dev);
                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
        }

        if (!ret)
                ret = scrub_enumerate_chunks(sctx, dev, start, end,
                                             is_dev_replace);

        wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
        atomic_dec(&fs_info->scrubs_running);
        wake_up(&fs_info->scrub_pause_wait);

        wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);

        if (progress)
                memcpy(progress, &sctx->stat, sizeof(*progress));

        mutex_lock(&fs_info->scrub_lock);
        dev->scrub_ctx = NULL;
        scrub_workers_put(fs_info);
        mutex_unlock(&fs_info->scrub_lock);

        scrub_put_ctx(sctx);

        return ret;
}

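/*
 * Illustrative sketch (not part of this file): the scrub and dev-replace
 * ioctl paths are expected to drive btrfs_scrub_dev() roughly like this,
 * with the devid, the readonly flag and the progress buffer coming from
 * the user space request:
 *
 *      struct btrfs_scrub_progress progress = { 0 };
 *      int err;
 *
 *      err = btrfs_scrub_dev(fs_info, devid, 0, (u64)-1, &progress,
 *                            readonly, 0);
 *
 * A plain scrub passes is_dev_replace == 0 and the full device range;
 * dev-replace passes is_dev_replace == 1 so that every extent that is read
 * is also written to the replace target.
 */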
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
{
        mutex_lock(&fs_info->scrub_lock);
        atomic_inc(&fs_info->scrub_pause_req);
        while (atomic_read(&fs_info->scrubs_paused) !=
               atomic_read(&fs_info->scrubs_running)) {
                mutex_unlock(&fs_info->scrub_lock);
                wait_event(fs_info->scrub_pause_wait,
                           atomic_read(&fs_info->scrubs_paused) ==
                           atomic_read(&fs_info->scrubs_running));
                mutex_lock(&fs_info->scrub_lock);
        }
        mutex_unlock(&fs_info->scrub_lock);
}

void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
{
        atomic_dec(&fs_info->scrub_pause_req);
        wake_up(&fs_info->scrub_pause_wait);
}

int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
        mutex_lock(&fs_info->scrub_lock);
        if (!atomic_read(&fs_info->scrubs_running)) {
                mutex_unlock(&fs_info->scrub_lock);
                return -ENOTCONN;
        }

        atomic_inc(&fs_info->scrub_cancel_req);
        while (atomic_read(&fs_info->scrubs_running)) {
                mutex_unlock(&fs_info->scrub_lock);
                wait_event(fs_info->scrub_pause_wait,
                           atomic_read(&fs_info->scrubs_running) == 0);
                mutex_lock(&fs_info->scrub_lock);
        }
        atomic_dec(&fs_info->scrub_cancel_req);
        mutex_unlock(&fs_info->scrub_lock);

        return 0;
}

int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
                           struct btrfs_device *dev)
{
        struct scrub_ctx *sctx;

        mutex_lock(&fs_info->scrub_lock);
        sctx = dev->scrub_ctx;
        if (!sctx) {
                mutex_unlock(&fs_info->scrub_lock);
                return -ENOTCONN;
        }
        atomic_inc(&sctx->cancel_req);
        while (dev->scrub_ctx) {
                mutex_unlock(&fs_info->scrub_lock);
                wait_event(fs_info->scrub_pause_wait,
                           dev->scrub_ctx == NULL);
                mutex_lock(&fs_info->scrub_lock);
        }
        mutex_unlock(&fs_info->scrub_lock);

        return 0;
}

int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
                         struct btrfs_scrub_progress *progress)
{
        struct btrfs_device *dev;
        struct scrub_ctx *sctx = NULL;

        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        dev = btrfs_find_device(fs_info, devid, NULL, NULL);
        if (dev)
                sctx = dev->scrub_ctx;
        if (sctx)
                memcpy(progress, &sctx->stat, sizeof(*progress));
        mutex_unlock(&fs_info->fs_devices->device_list_mutex);

        return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
}

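/*
 * Map @extent_logical to the physical offset, device and mirror of its first
 * stripe. This is used by the dev-replace code to locate the copy to read
 * from; if the mapping fails, the output parameters are left unchanged.
 */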
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
                               u64 extent_logical, u64 extent_len,
                               u64 *extent_physical,
                               struct btrfs_device **extent_dev,
                               int *extent_mirror_num)
{
        u64 mapped_length;
        struct btrfs_bio *bbio = NULL;
        int ret;

        mapped_length = extent_len;
        ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
                              &mapped_length, &bbio, 0);
        if (ret || !bbio || mapped_length < extent_len ||
            !bbio->stripes[0].dev->bdev) {
                btrfs_put_bbio(bbio);
                return;
        }

        *extent_physical = bbio->stripes[0].physical;
        *extent_mirror_num = bbio->mirror_num;
        *extent_dev = bbio->stripes[0].dev;
        btrfs_put_bbio(bbio);
}

static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
                            int mirror_num, u64 physical_for_dev_replace)
{
        struct scrub_copy_nocow_ctx *nocow_ctx;
        struct btrfs_fs_info *fs_info = sctx->fs_info;

        nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
        if (!nocow_ctx) {
                spin_lock(&sctx->stat_lock);
                sctx->stat.malloc_errors++;
                spin_unlock(&sctx->stat_lock);
                return -ENOMEM;
        }

        scrub_pending_trans_workers_inc(sctx);

        nocow_ctx->sctx = sctx;
        nocow_ctx->logical = logical;
        nocow_ctx->len = len;
        nocow_ctx->mirror_num = mirror_num;
        nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
        btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
                        copy_nocow_pages_worker, NULL, NULL);
        INIT_LIST_HEAD(&nocow_ctx->inodes);
        btrfs_queue_work(fs_info->scrub_nocow_workers,
                         &nocow_ctx->work);

        return 0;
}

static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
{
        struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
        struct scrub_nocow_inode *nocow_inode;

        nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
        if (!nocow_inode)
                return -ENOMEM;
        nocow_inode->inum = inum;
        nocow_inode->offset = offset;
        nocow_inode->root = root;
        list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
        return 0;
}

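/*
 * COPY_COMPLETE tells copy_nocow_pages_worker() that the whole range has
 * already been copied for one inode, so the remaining inodes that reference
 * the same logical address do not need to be processed.
 */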
#define COPY_COMPLETE 1

static void copy_nocow_pages_worker(struct btrfs_work *work)
{
        struct scrub_copy_nocow_ctx *nocow_ctx =
                container_of(work, struct scrub_copy_nocow_ctx, work);
        struct scrub_ctx *sctx = nocow_ctx->sctx;
        struct btrfs_fs_info *fs_info = sctx->fs_info;
        struct btrfs_root *root = fs_info->extent_root;
        u64 logical = nocow_ctx->logical;
        u64 len = nocow_ctx->len;
        int mirror_num = nocow_ctx->mirror_num;
        u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
        int ret;
        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_path *path;
        int not_written = 0;

        path = btrfs_alloc_path();
        if (!path) {
                spin_lock(&sctx->stat_lock);
                sctx->stat.malloc_errors++;
                spin_unlock(&sctx->stat_lock);
                not_written = 1;
                goto out;
        }

        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                not_written = 1;
                goto out;
        }

        ret = iterate_inodes_from_logical(logical, fs_info, path,
                                          record_inode_for_nocow, nocow_ctx, false);
        if (ret != 0 && ret != -ENOENT) {
                btrfs_warn(fs_info,
                           "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d",
                           logical, physical_for_dev_replace, len, mirror_num,
                           ret);
                not_written = 1;
                goto out;
        }

        btrfs_end_transaction(trans);
        trans = NULL;
        while (!list_empty(&nocow_ctx->inodes)) {
                struct scrub_nocow_inode *entry;

                entry = list_first_entry(&nocow_ctx->inodes,
                                         struct scrub_nocow_inode,
                                         list);
                list_del_init(&entry->list);
                ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
                                                 entry->root, nocow_ctx);
                kfree(entry);
                if (ret == COPY_COMPLETE) {
                        ret = 0;
                        break;
                } else if (ret) {
                        break;
                }
        }
out:
        while (!list_empty(&nocow_ctx->inodes)) {
                struct scrub_nocow_inode *entry;

                entry = list_first_entry(&nocow_ctx->inodes,
                                         struct scrub_nocow_inode,
                                         list);
                list_del_init(&entry->list);
                kfree(entry);
        }
        if (trans && !IS_ERR(trans))
                btrfs_end_transaction(trans);
        if (not_written)
                btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
                                            num_uncorrectable_read_errors);

        btrfs_free_path(path);
        kfree(nocow_ctx);

        scrub_pending_trans_workers_dec(sctx);
}

static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
                                 u64 logical)
{
        struct extent_state *cached_state = NULL;
        struct btrfs_ordered_extent *ordered;
        struct extent_io_tree *io_tree;
        struct extent_map *em;
        u64 lockstart = start, lockend = start + len - 1;
        int ret = 0;

        io_tree = &inode->io_tree;

        lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
        ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
        if (ordered) {
                btrfs_put_ordered_extent(ordered);
                ret = 1;
                goto out_unlock;
        }

        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
                goto out_unlock;
        }

        /*
         * This extent does not actually cover the logical extent anymore,
         * move on to the next inode.
         */
        if (em->block_start > logical ||
            em->block_start + em->block_len < logical + len) {
                free_extent_map(em);
                ret = 1;
                goto out_unlock;
        }
        free_extent_map(em);

out_unlock:
        unlock_extent_cached(io_tree, lockstart, lockend, &cached_state);
        return ret;
}

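/*
 * Copy the pages of a single inode that back the nocow range to the replace
 * target, one page at a time. The extent is re-checked before each write,
 * since the range may have been cowed or gained ordered extents in the
 * meantime; such pages are skipped here.
 */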
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
                                      struct scrub_copy_nocow_ctx *nocow_ctx)
{
        struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info;
        struct btrfs_key key;
        struct inode *inode;
        struct page *page;
        struct btrfs_root *local_root;
        struct extent_io_tree *io_tree;
        u64 physical_for_dev_replace;
        u64 nocow_ctx_logical;
        u64 len = nocow_ctx->len;
        unsigned long index;
        int srcu_index;
        int ret = 0;
        int err = 0;

        key.objectid = root;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = (u64)-1;

        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
        if (IS_ERR(local_root)) {
                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
                return PTR_ERR(local_root);
        }

        key.type = BTRFS_INODE_ITEM_KEY;
        key.objectid = inum;
        key.offset = 0;
        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        /* Avoid truncate/dio/punch hole... */
        inode_lock(inode);
        inode_dio_wait(inode);

        physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
        io_tree = &BTRFS_I(inode)->io_tree;
        nocow_ctx_logical = nocow_ctx->logical;

        ret = check_extent_to_block(BTRFS_I(inode), offset, len,
                                    nocow_ctx_logical);
        if (ret) {
                ret = ret > 0 ? 0 : ret;
                goto out;
        }

        while (len >= PAGE_SIZE) {
                index = offset >> PAGE_SHIFT;
again:
                page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
                if (!page) {
                        btrfs_err(fs_info, "find_or_create_page() failed");
                        ret = -ENOMEM;
                        goto out;
                }

                if (PageUptodate(page)) {
                        if (PageDirty(page))
                                goto next_page;
                } else {
                        ClearPageError(page);
                        err = extent_read_full_page(io_tree, page,
                                                    btrfs_get_extent,
                                                    nocow_ctx->mirror_num);
                        if (err) {
                                ret = err;
                                goto next_page;
                        }

                        lock_page(page);
                        /*
                         * If the page has been removed from the page cache,
                         * the data in it is meaningless, because it may be
                         * stale - the new data may have been written into a
                         * new page in the page cache.
                         */
                        if (page->mapping != inode->i_mapping) {
                                unlock_page(page);
                                put_page(page);
                                goto again;
                        }
                        if (!PageUptodate(page)) {
                                ret = -EIO;
                                goto next_page;
                        }
                }

                ret = check_extent_to_block(BTRFS_I(inode), offset, len,
                                            nocow_ctx_logical);
                if (ret) {
                        ret = ret > 0 ? 0 : ret;
                        goto next_page;
                }

                err = write_page_nocow(nocow_ctx->sctx,
                                       physical_for_dev_replace, page);
                if (err)
                        ret = err;
next_page:
                unlock_page(page);
                put_page(page);

                if (ret)
                        break;

                offset += PAGE_SIZE;
                physical_for_dev_replace += PAGE_SIZE;
                nocow_ctx_logical += PAGE_SIZE;
                len -= PAGE_SIZE;
        }
        ret = COPY_COMPLETE;
out:
        inode_unlock(inode);
        iput(inode);
        return ret;
}

static int write_page_nocow(struct scrub_ctx *sctx,
                            u64 physical_for_dev_replace, struct page *page)
{
        struct bio *bio;
        struct btrfs_device *dev;
        int ret;

        dev = sctx->wr_tgtdev;
        if (!dev)
                return -EIO;
        if (!dev->bdev) {
                btrfs_warn_rl(dev->fs_info,
                              "scrub write_page_nocow(bdev == NULL) is unexpected");
                return -EIO;
        }
        bio = btrfs_io_bio_alloc(1);
        bio->bi_iter.bi_size = 0;
        bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
        bio_set_dev(bio, dev->bdev);
        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
        ret = bio_add_page(bio, page, PAGE_SIZE, 0);
        if (ret != PAGE_SIZE) {
leave_with_eio:
                bio_put(bio);
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
                return -EIO;
        }

        if (btrfsic_submit_bio_wait(bio))
                goto leave_with_eio;

        bio_put(bio);
        return 0;
}