/*
 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_block;
struct scrub_ctx;

/*
 * the following three values only influence the performance.
 * The last one configures the number of parallel and outstanding I/O
 * operations. The first two values configure an upper limit for the number
 * of (dynamically allocated) pages that are added to a bio.
 */
#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */

/*
 * the following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
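 * (With 4k pages, the value of 16 below corresponds to the 64k maximum
 * node/leaf size.)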
 */
#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */

struct scrub_recover {
	atomic_t		refs;
	struct btrfs_bio	*bbio;
	u64			map_length;
};

struct scrub_page {
	struct scrub_block	*sblock;
	struct page		*page;
	struct btrfs_device	*dev;
	struct list_head	list;
	u64			flags;  /* extent flags */
	u64			generation;
	u64			logical;
	u64			physical;
	u64			physical_for_dev_replace;
	atomic_t		refs;
	struct {
		unsigned int	mirror_num:8;
		unsigned int	have_csum:1;
		unsigned int	io_error:1;
	};
	u8			csum[BTRFS_CSUM_SIZE];

	struct scrub_recover	*recover;
};

struct scrub_bio {
	int			index;
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
#else
	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
#endif
	int			page_count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_block {
	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
	int			page_count;
	atomic_t		outstanding_pages;
	atomic_t		refs; /* free mem on transition to zero */
	struct scrub_ctx	*sctx;
	struct scrub_parity	*sparity;
	struct {
		unsigned int	header_error:1;
		unsigned int	checksum_error:1;
		unsigned int	no_io_error_seen:1;
		unsigned int	generation_error:1; /* also sets header_error */

		/* The following is for the data used to check parity */
		/* It is for the data with checksum */
		unsigned int	data_corrected:1;
	};
};

/* Used for the chunks with parity stripe such as RAID5/6 */
struct scrub_parity {
	struct scrub_ctx	*sctx;

	struct btrfs_device	*scrub_dev;

	u64			logic_start;

	u64			logic_end;

	int			nsectors;

	int			stripe_len;

	atomic_t		refs;

	struct list_head	spages;

	/* Work of parity check and repair */
	struct btrfs_work	work;

	/* Mark the parity blocks which have data */
	unsigned long		*dbitmap;

	/*
	 * Mark the parity blocks which have data, but errors happened when
	 * the data was read or checked
	 */
	unsigned long		*ebitmap;

	unsigned long		bitmap[0];
};

struct scrub_wr_ctx {
	struct scrub_bio *wr_curr_bio;
	struct btrfs_device *tgtdev;
	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
	atomic_t flush_all_writes;
	struct mutex wr_lock;
};

struct scrub_ctx {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
	struct btrfs_root	*dev_root;
	int			first_free;
	int			curr;
	atomic_t		bios_in_flight;
	atomic_t		workers_pending;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	int			pages_per_rd_bio;
	u32			sectorsize;
	u32			nodesize;

	int			is_dev_replace;
	struct scrub_wr_ctx	wr_ctx;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
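	 * Each in-flight bio (see scrub_pending_bio_inc) and each pending
	 * worker (see scrub_pending_trans_workers_inc) holds one reference
	 * in addition to the initial one taken in scrub_setup_ctx().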
	 */
	atomic_t		refs;
};

struct scrub_fixup_nodatasum {
	struct scrub_ctx	*sctx;
	struct btrfs_device	*dev;
	u64			logical;
	struct btrfs_root	*root;
	struct btrfs_work	work;
	int			mirror_num;
};

struct scrub_nocow_inode {
	u64			inum;
	u64			offset;
	u64			root;
	struct list_head	list;
};

struct scrub_copy_nocow_ctx {
	struct scrub_ctx	*sctx;
	u64			logical;
	u64			len;
	int			mirror_num;
	u64			physical_for_dev_replace;
	struct list_head	inodes;
	struct btrfs_work	work;
};

struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	const char		*errstr;
	sector_t		sector;
	u64			logical;
	struct btrfs_device	*dev;
};

static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				     struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock, int is_metadata,
				int have_csum, u8 *csum, u64 generation,
				u16 csum_size, int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
					 struct scrub_block *sblock,
					 int is_metadata, int have_csum,
					 const u8 *csum, u64 generation,
					 u16 csum_size);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum, int force,
		       u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
			       u64 extent_logical, u64 extent_len,
			       u64 *extent_physical,
			       struct btrfs_device **extent_dev,
			       int *extent_mirror_num);
static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
			      struct scrub_wr_ctx *wr_ctx,
			      struct btrfs_fs_info *fs_info,
			      struct btrfs_device *dev,
			      int is_dev_replace);
static void
scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio, int err);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx,
			    u64 physical_for_dev_replace, struct page *page);
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
				      struct scrub_copy_nocow_ctx *ctx);
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
			    int mirror_num, u64 physical_for_dev_replace);
static void copy_nocow_pages_worker(struct btrfs_work *work);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_put_ctx(struct scrub_ctx *sctx);


static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
	atomic_inc(&sctx->refs);
	atomic_inc(&sctx->bios_in_flight);
}

static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
	atomic_dec(&sctx->bios_in_flight);
	wake_up(&sctx->list_wait);
	scrub_put_ctx(sctx);
}

static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);

	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

/*
 * used for workers that require transaction commits (i.e., for the
 * NOCOW case)
 */
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

	atomic_inc(&sctx->refs);
	/*
	 * increment scrubs_running to prevent cancel requests from
	 * completing as long as a worker is running. we must also
	 * increment scrubs_paused to prevent deadlocking on pause
	 * requests used for transaction commits (as the worker uses a
	 * transaction context). it is safe to regard the worker
	 * as paused for all practical matters. effectively, we only
	 * avoid cancellation requests from completing.
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrubs_running);
	atomic_inc(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	/*
	 * The check of the @scrubs_running == @scrubs_paused condition
	 * inside wait_event() is not an atomic operation, which means we
	 * may inc/dec @scrubs_running/paused at any time. Wake up
	 * @scrub_pause_wait as often as we can so that a blocked
	 * transaction commit waits as little as possible.
	 */
	wake_up(&fs_info->scrub_pause_wait);

	atomic_inc(&sctx->workers_pending);
}

/* used for workers that require transaction commits */
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

	/*
	 * see scrub_pending_trans_workers_inc() for why we're pretending
	 * to be paused in the scrub counters
	 */
	mutex_lock(&fs_info->scrub_lock);
	atomic_dec(&fs_info->scrubs_running);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	atomic_dec(&sctx->workers_pending);
	wake_up(&fs_info->scrub_pause_wait);
	wake_up(&sctx->list_wait);
	scrub_put_ctx(sctx);
}

static void scrub_free_csums(struct scrub_ctx *sctx)
{
	while (!list_empty(&sctx->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sctx->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	scrub_free_wr_ctx(&sctx->wr_ctx);

	/* this can happen when scrub is cancelled */
	if (sctx->curr != -1) {
		struct scrub_bio *sbio = sctx->bios[sctx->curr];

		for (i = 0; i < sbio->page_count; i++) {
			WARN_ON(!sbio->pagev[i]->page);
			scrub_block_put(sbio->pagev[i]->sblock);
		}
		bio_put(sbio->bio);
	}

	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio = sctx->bios[i];

		if (!sbio)
			break;
		kfree(sbio);
	}

	scrub_free_csums(sctx);
	kfree(sctx);
}

static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (atomic_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}

static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int i;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
	int pages_per_rd_bio;
	int ret;

	/*
	 * the setting of pages_per_rd_bio is correct for scrub but might
	 * be wrong for the dev_replace code where we might read from
	 * different devices in the initial huge bios. However, that
	 * code is able to correctly handle the case when adding a page
	 * to a bio fails.
	 */
	if (dev->bdev)
		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
					 bio_get_nr_vecs(dev->bdev));
	else
		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
	if (!sctx)
		goto nomem;
	atomic_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->pages_per_rd_bio = pages_per_rd_bio;
	sctx->curr = -1;
	sctx->dev_root = dev->dev_root;
	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		if (!sbio)
			goto nomem;
		sctx->bios[i] = sbio;

		sbio->index = i;
		sbio->sctx = sctx;
		sbio->page_count = 0;
		btrfs_init_work(&sbio->work, btrfs_scrub_helper,
				scrub_bio_end_io_worker, NULL, NULL);

		if (i != SCRUB_BIOS_PER_SCTX - 1)
			sctx->bios[i]->next_free = i + 1;
		else
			sctx->bios[i]->next_free = -1;
	}
	sctx->first_free = 0;
	sctx->nodesize = dev->dev_root->nodesize;
	sctx->sectorsize = dev->dev_root->sectorsize;
	atomic_set(&sctx->bios_in_flight, 0);
	atomic_set(&sctx->workers_pending, 0);
	atomic_set(&sctx->cancel_req, 0);
	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
	INIT_LIST_HEAD(&sctx->csum_list);

	spin_lock_init(&sctx->list_lock);
	spin_lock_init(&sctx->stat_lock);
	init_waitqueue_head(&sctx->list_wait);

	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
				 fs_info->dev_replace.tgtdev, is_dev_replace);
	if (ret) {
		scrub_free_ctx(sctx);
		return ERR_PTR(ret);
	}
	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}

static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
				     void *warn_ctx)
{
	u64 isize;
	u32 nlink;
	int ret;
	int i;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key root_key;
	struct btrfs_key key;

	root_key.objectid = root;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * this makes the path point to (inum INODE_ITEM ioff)
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				    struct btrfs_inode_item);
	isize = btrfs_inode_size(eb, inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	ipath = init_ipath(4096, local_root, swarn->path);
	if (IS_ERR(ipath)) {
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
			"length %llu, links %u (path: %s)\n", swarn->errstr,
			swarn->logical,
			rcu_str_deref(swarn->dev->name),
			(unsigned long long)swarn->sector, root, inum, offset,
			min(isize - offset, (u64)PAGE_SIZE), nlink,
			(char *)(unsigned long)ipath->fspath->val[i]);

	free_ipath(ipath);
	return 0;

err:
	printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
		"resolving failed with ret=%d\n", swarn->errstr,
		swarn->logical, rcu_str_deref(swarn->dev->name),
		(unsigned long long)swarn->sector, root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}

static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	unsigned long ptr = 0;
	u64 extent_item_pos;
	u64 flags = 0;
	u64 ref_root;
	u32 item_size;
	u8 ref_level;
	int ret;

	WARN_ON(sblock->page_count < 1);
	dev = sblock->pagev[0]->dev;
	fs_info = sblock->sctx->dev_root->fs_info;

	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.sector = (sblock->pagev[0]->physical) >> 9;
	swarn.logical = sblock->pagev[0]->logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size_nr(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		do {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			printk_in_rcu(KERN_WARNING
				"BTRFS: %s at logical %llu on dev %s, "
				"sector %llu: metadata %s (level %d) in tree "
				"%llu\n", errstr, swarn.logical,
				rcu_str_deref(dev->name),
				(unsigned long long)swarn.sector,
				ref_level ? "node" : "leaf",
				ret < 0 ? -1 : ref_level,
				ret < 0 ? -1 : ref_root);
		} while (ret != 1);
		btrfs_release_path(path);
	} else {
		btrfs_release_path(path);
		swarn.path = path;
		swarn.dev = dev;
		iterate_extent_inodes(fs_info, found_key.objectid,
				      extent_item_pos, 1,
				      scrub_print_warning_inode, &swarn);
	}

out:
	btrfs_free_path(path);
}

static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
{
	struct page *page = NULL;
	unsigned long index;
	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
	int ret;
	int corrected = 0;
	struct btrfs_key key;
	struct inode *inode = NULL;
	struct btrfs_fs_info *fs_info;
	u64 end = offset + PAGE_SIZE - 1;
	struct btrfs_root *local_root;
	int srcu_index;

	key.objectid = root;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	fs_info = fixup->root->fs_info;
	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(local_root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
		return PTR_ERR(local_root);
	}

	key.type = BTRFS_INODE_ITEM_KEY;
	key.objectid = inum;
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	index = offset >> PAGE_CACHE_SHIFT;

	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	if (PageUptodate(page)) {
		if (PageDirty(page)) {
			/*
			 * we need to write the data to the defect sector. the
			 * data that was in that sector is not in memory,
			 * because the page was modified. we must not write the
			 * modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 *       runner to write out that page (might involve
			 *       COW) and see whether the sector is still
			 *       referenced afterwards.
			 *
			 * For the meantime, we'll treat this error as
			 * uncorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * there's no dirty page in memory then.
			 */
			ret = -EIO;
			goto out;
		}
		ret = repair_io_failure(inode, offset, PAGE_SIZE,
					fixup->logical, page,
					offset - page_offset(page),
					fixup->mirror_num);
		unlock_page(page);
		corrected = !ret;
	} else {
		/*
		 * we need to get good data first. the general readpage path
		 * will call repair_io_failure for us, we just have to make
		 * sure we read the bad mirror.
		 */
		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
					EXTENT_DAMAGED, GFP_NOFS);
		if (ret) {
			/* set_extent_bits should give proper error */
			WARN_ON(ret > 0);
			if (ret > 0)
				ret = -EFAULT;
			goto out;
		}

		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
						btrfs_get_extent,
						fixup->mirror_num);
		wait_on_page_locked(page);

		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
						end, EXTENT_DAMAGED, 0, NULL);
		if (!corrected)
			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
						EXTENT_DAMAGED, GFP_NOFS);
	}

out:
	if (page)
		put_page(page);

	iput(inode);

	if (ret < 0)
		return ret;

	if (ret == 0 && corrected) {
		/*
		 * we only need to call readpage for one of the inodes
		 * belonging to this extent.
		 * so make iterate_extent_inodes stop
		 */
		return 1;
	}

	return -EIO;
}

static void scrub_fixup_nodatasum(struct btrfs_work *work)
{
	int ret;
	struct scrub_fixup_nodatasum *fixup;
	struct scrub_ctx *sctx;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_path *path;
	int uncorrectable = 0;

	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
	sctx = fixup->sctx;

	path = btrfs_alloc_path();
	if (!path) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.malloc_errors;
		spin_unlock(&sctx->stat_lock);
		uncorrectable = 1;
		goto out;
	}

	trans = btrfs_join_transaction(fixup->root);
	if (IS_ERR(trans)) {
		uncorrectable = 1;
		goto out;
	}

	/*
	 * the idea is to trigger a regular read through the standard path. we
	 * read a page from the (failed) logical address by specifying the
	 * corresponding copynum of the failed sector. thus, that readpage is
	 * expected to fail.
	 * that is the point where on-the-fly error correction will kick in
	 * (once it's finished) and rewrite the failed sector if a good copy
	 * can be found.
	 */
	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
						path, scrub_fixup_readpage,
						fixup);
	if (ret < 0) {
		uncorrectable = 1;
		goto out;
	}
	WARN_ON(ret != 1);

	spin_lock(&sctx->stat_lock);
	++sctx->stat.corrected_errors;
	spin_unlock(&sctx->stat_lock);

out:
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans, fixup->root);
	if (uncorrectable) {
		spin_lock(&sctx->stat_lock);
		++sctx->stat.uncorrectable_errors;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_replace_stats_inc(
			&sctx->dev_root->fs_info->dev_replace.
			num_uncorrectable_read_errors);
		printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
		    "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
			fixup->logical, rcu_str_deref(fixup->dev->name));
	}

	btrfs_free_path(path);
	kfree(fixup);

	scrub_pending_trans_workers_dec(sctx);
}

static inline void scrub_get_recover(struct scrub_recover *recover)
{
	atomic_inc(&recover->refs);
}

static inline void scrub_put_recover(struct scrub_recover *recover)
{
	if (atomic_dec_and_test(&recover->refs)) {
		btrfs_put_bbio(recover->bbio);
		kfree(recover);
	}
}

/*
 * scrub_handle_errored_block gets called when either verification of the
 * pages failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all pages in the bio, even though only one
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the mirrors.
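 * The function always returns 0; repair success or failure is accounted
 * in sctx->stat and in the per-device error statistics.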
 */
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
	struct scrub_ctx *sctx = sblock_to_check->sctx;
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	u64 length;
	u64 logical;
	u64 generation;
	unsigned int failed_mirror_index;
	unsigned int is_metadata;
	unsigned int have_csum;
	u8 *csum;
	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
	struct scrub_block *sblock_bad;
	int ret;
	int mirror_index;
	int page_num;
	int success;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	BUG_ON(sblock_to_check->page_count < 1);
	fs_info = sctx->dev_root->fs_info;
	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
		/*
		 * if we find an error in a super block, we just report it.
		 * They will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sctx->stat_lock);
		++sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);
		return 0;
	}
	length = sblock_to_check->page_count * PAGE_SIZE;
	logical = sblock_to_check->pagev[0]->logical;
	generation = sblock_to_check->pagev[0]->generation;
	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
	is_metadata = !(sblock_to_check->pagev[0]->flags &
			BTRFS_EXTENT_FLAG_DATA);
	have_csum = sblock_to_check->pagev[0]->have_csum;
	csum = sblock_to_check->pagev[0]->csum;
	dev = sblock_to_check->pagev[0]->dev;

	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
		sblocks_for_recheck = NULL;
		goto nodatasum_case;
	}

	/*
	 * read all mirrors one after the other. This includes re-reading
	 * the extent or metadata block that failed (that was the cause
	 * that this fixup code is called) another time, page by page this
	 * time in order to know which pages caused I/O errors and which
	 * ones are good (for all mirrors).
	 * The goal is to handle the situation when more than one mirror
	 * contains I/O errors, but the errors do not overlap, i.e. the
	 * data can be repaired by selecting the pages from those mirrors
	 * without I/O error on the particular pages. One example (with
	 * blocks >= 2 * PAGE_SIZE) would be that mirror #1 has an I/O
	 * error on the first page, the second page is good, and mirror #2
	 * has an I/O error on the second page, but the first page is good.
	 * Then the first page of the first mirror can be repaired by
	 * taking the first page of the second mirror, and the
	 * second page of the second mirror can be repaired by
	 * copying the contents of the 2nd page of the 1st mirror.
	 * One more note: if the pages of one mirror contain I/O
	 * errors, the checksum cannot be verified. In order to get
	 * the best data for repairing, the first attempt is to find
	 * a mirror without I/O errors and with a validated checksum.
	 * Only if this is not possible, the pages are picked from
	 * mirrors with I/O errors without considering the checksum.
	 * If the latter is the case, at the end, the checksum of the
	 * repaired area is verified in order to correctly maintain
	 * the statistics.
	 */

	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
				      sizeof(*sblocks_for_recheck),
				      GFP_NOFS);
	if (!sblocks_for_recheck) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}

	/* setup the context, map the logical blocks and alloc the pages */
	ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
	if (ret) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}
	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
	sblock_bad = sblocks_for_recheck + failed_mirror_index;

	/* build and submit the bios for the failed mirror, check checksums */
	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
			    csum, generation, sctx->csum_size, 1);

	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
	    sblock_bad->no_io_error_seen) {
		/*
		 * the error disappeared after reading page by page, or
		 * the area was part of a huge bio and other parts of the
		 * bio caused I/O errors, or the block layer merged several
		 * read requests into one and the error is caused by a
		 * different bio (usually one of the two latter cases is
		 * the cause)
		 */
		spin_lock(&sctx->stat_lock);
		sctx->stat.unverified_errors++;
		sblock_to_check->data_corrected = 1;
		spin_unlock(&sctx->stat_lock);

		if (sctx->is_dev_replace)
			scrub_write_block_to_dev_replace(sblock_bad);
		goto out;
	}

	if (!sblock_bad->no_io_error_seen) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.read_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("i/o error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	} else if (sblock_bad->checksum_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.csum_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(dev,
					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
	} else if (sblock_bad->header_error) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.verify_errors++;
		spin_unlock(&sctx->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum/header error",
					    sblock_to_check);
		if (sblock_bad->generation_error)
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_GENERATION_ERRS);
		else
			btrfs_dev_stat_inc_and_print(dev,
				BTRFS_DEV_STAT_CORRUPTION_ERRS);
	}

	if (sctx->readonly) {
		ASSERT(!sctx->is_dev_replace);
		goto out;
	}

	if (!is_metadata && !have_csum) {
		struct scrub_fixup_nodatasum *fixup_nodatasum;

		WARN_ON(sctx->is_dev_replace);

nodatasum_case:

		/*
		 * !is_metadata and !have_csum, this means that the data
		 * might not be COW'ed, that it might be modified
		 * concurrently. The general strategy to work on the
		 * commit root does not help in the case when COW is not
		 * used.
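		 * Defer the repair to a worker (scrub_fixup_nodatasum),
		 * which triggers a regular read through the standard path
		 * so that the on-the-fly error correction can rewrite the
		 * bad sector if a good copy exists.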
		 */
		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
		if (!fixup_nodatasum)
			goto did_not_correct_error;
		fixup_nodatasum->sctx = sctx;
		fixup_nodatasum->dev = dev;
		fixup_nodatasum->logical = logical;
		fixup_nodatasum->root = fs_info->extent_root;
		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
		scrub_pending_trans_workers_inc(sctx);
		btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
				scrub_fixup_nodatasum, NULL, NULL);
		btrfs_queue_work(fs_info->scrub_workers,
				 &fixup_nodatasum->work);
		goto out;
	}

	/*
	 * now build and submit the bios for the other mirrors, check
	 * checksums.
	 * First try to pick the mirror which is completely without I/O
	 * errors and also does not have a checksum error.
	 * If one is found, and if a checksum is present, the full block
	 * that is known to contain an error is rewritten. Afterwards
	 * the block is known to be corrected.
	 * If a mirror is found which is completely correct, and no
	 * checksum is present, only those pages are rewritten that had
	 * an I/O error in the block to be repaired, since it cannot be
	 * determined which copy of the other pages is better (and it
	 * could happen otherwise that a correct page would be
	 * overwritten by a bad one).
	 */
	for (mirror_index = 0;
	     mirror_index < BTRFS_MAX_MIRRORS &&
	     sblocks_for_recheck[mirror_index].page_count > 0;
	     mirror_index++) {
		struct scrub_block *sblock_other;

		if (mirror_index == failed_mirror_index)
			continue;
		sblock_other = sblocks_for_recheck + mirror_index;

		/* build and submit the bios, check checksums */
		scrub_recheck_block(fs_info, sblock_other, is_metadata,
				    have_csum, csum, generation,
				    sctx->csum_size, 0);

		if (!sblock_other->header_error &&
		    !sblock_other->checksum_error &&
		    sblock_other->no_io_error_seen) {
			if (sctx->is_dev_replace) {
				scrub_write_block_to_dev_replace(sblock_other);
				goto corrected_error;
			} else {
				ret = scrub_repair_block_from_good_copy(
						sblock_bad, sblock_other);
				if (!ret)
					goto corrected_error;
			}
		}
	}

	if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
		goto did_not_correct_error;

	/*
	 * In case of I/O errors in the area that is supposed to be
	 * repaired, continue by picking good copies of those pages.
	 * Select the good pages from mirrors to rewrite bad pages from
	 * the area to fix. Afterwards verify the checksum of the block
	 * that is supposed to be repaired. This verification step is
	 * only done for the purpose of statistics counting and for the
	 * final scrub report, whether errors remain.
	 * A perfect algorithm could make use of the checksum and try
	 * all possible combinations of pages from the different mirrors
	 * until the checksum verification succeeds. For example, when
	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
	 * of mirror #2 is readable but the final checksum test fails,
	 * then the 2nd page of mirror #3 could be tried, whether now
	 * the final checksum succeeds. But this would be a rare
	 * exception and is therefore not implemented. At least it is
	 * avoided that the good copy is overwritten.
	 * A more useful improvement would be to pick the sectors
	 * without I/O error based on sector sizes (512 bytes on legacy
	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
	 * mirror could be repaired by taking 512 bytes of a different
	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
	 * area are unreadable.
	 */
	success = 1;
	for (page_num = 0; page_num < sblock_bad->page_count;
	     page_num++) {
		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
		struct scrub_block *sblock_other = NULL;

		/* skip no-io-error page in scrub */
		if (!page_bad->io_error && !sctx->is_dev_replace)
			continue;

		/* try to find no-io-error page in mirrors */
		if (page_bad->io_error) {
			for (mirror_index = 0;
			     mirror_index < BTRFS_MAX_MIRRORS &&
			     sblocks_for_recheck[mirror_index].page_count > 0;
			     mirror_index++) {
				if (!sblocks_for_recheck[mirror_index].
				    pagev[page_num]->io_error) {
					sblock_other = sblocks_for_recheck +
						       mirror_index;
					break;
				}
			}
			if (!sblock_other)
				success = 0;
		}

		if (sctx->is_dev_replace) {
			/*
			 * did not find a mirror to fetch the page
			 * from. scrub_write_page_to_dev_replace()
			 * handles this case (page->io_error), by
			 * filling the block with zeros before
			 * submitting the write request
			 */
			if (!sblock_other)
				sblock_other = sblock_bad;

			if (scrub_write_page_to_dev_replace(sblock_other,
							    page_num) != 0) {
				btrfs_dev_replace_stats_inc(
					&sctx->dev_root->
					fs_info->dev_replace.
					num_write_errors);
				success = 0;
			}
		} else if (sblock_other) {
			ret = scrub_repair_page_from_good_copy(sblock_bad,
							       sblock_other,
							       page_num, 0);
			if (0 == ret)
				page_bad->io_error = 0;
			else
				success = 0;
		}
	}

	if (success && !sctx->is_dev_replace) {
		if (is_metadata || have_csum) {
			/*
			 * need to verify the checksum now that all
			 * sectors on disk are repaired (the write
			 * request for data to be repaired is on its way).
			 * Just be lazy and use scrub_recheck_block()
			 * which re-reads the data before the checksum
			 * is verified, but most likely the data comes out
			 * of the page cache.
			 */
			scrub_recheck_block(fs_info, sblock_bad,
					    is_metadata, have_csum, csum,
					    generation, sctx->csum_size, 1);
			if (!sblock_bad->header_error &&
			    !sblock_bad->checksum_error &&
			    sblock_bad->no_io_error_seen)
				goto corrected_error;
			else
				goto did_not_correct_error;
		} else {
corrected_error:
			spin_lock(&sctx->stat_lock);
			sctx->stat.corrected_errors++;
			sblock_to_check->data_corrected = 1;
			spin_unlock(&sctx->stat_lock);
			printk_ratelimited_in_rcu(KERN_ERR
				"BTRFS: fixed up error at logical %llu on dev %s\n",
				logical, rcu_str_deref(dev->name));
		}
	} else {
did_not_correct_error:
		spin_lock(&sctx->stat_lock);
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		printk_ratelimited_in_rcu(KERN_ERR
			"BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
			logical, rcu_str_deref(dev->name));
	}

out:
	if (sblocks_for_recheck) {
		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
		     mirror_index++) {
			struct scrub_block *sblock = sblocks_for_recheck +
						     mirror_index;
			struct scrub_recover *recover;
			int page_index;

			for (page_index = 0; page_index < sblock->page_count;
			     page_index++) {
				sblock->pagev[page_index]->sblock = NULL;
				recover = sblock->pagev[page_index]->recover;
				if (recover) {
					scrub_put_recover(recover);
					sblock->pagev[page_index]->recover =
									NULL;
				}
				scrub_page_put(sblock->pagev[page_index]);
			}
		}
		kfree(sblocks_for_recheck);
	}

	return 0;
}

static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
{
	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
		return 2;
	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
		return 3;
	else
		return (int)bbio->num_stripes;
}

static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
						 u64 *raid_map,
						 u64 mapped_length,
						 int nstripes, int mirror,
						 int *stripe_index,
						 u64 *stripe_offset)
{
	int i;

	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* RAID5/6 */
		for (i = 0; i < nstripes; i++) {
			if (raid_map[i] == RAID6_Q_STRIPE ||
			    raid_map[i] == RAID5_P_STRIPE)
				continue;

			if (logical >= raid_map[i] &&
			    logical < raid_map[i] + mapped_length)
				break;
		}

		*stripe_index = i;
		*stripe_offset = logical - raid_map[i];
	} else {
		/* The other RAID type */
		*stripe_index = mirror;
		*stripe_offset = 0;
	}
}

static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				     struct scrub_block *sblocks_for_recheck)
{
	struct scrub_ctx *sctx = original_sblock->sctx;
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
	u64 length = original_sblock->page_count * PAGE_SIZE;
	u64 logical = original_sblock->pagev[0]->logical;
	struct scrub_recover *recover;
	struct btrfs_bio *bbio;
	u64 sublen;
	u64 mapped_length;
	u64 stripe_offset;
	int stripe_index;
	int page_index = 0;
	int mirror_index;
	int nmirrors;
	int ret;

	/*
	 * note: the two members refs and outstanding_pages
	 * are not used (and not set) in the blocks that are used for
	 * the recheck procedure
	 */

	while (length > 0) {
		sublen = min_t(u64, length, PAGE_SIZE);
		mapped_length = sublen;
		bbio = NULL;

		/*
		 * with a length of PAGE_SIZE, each returned stripe
		 * represents one mirror
		 */
		ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
				       &mapped_length, &bbio, 0, 1);
		if (ret || !bbio || mapped_length < sublen) {
			btrfs_put_bbio(bbio);
			return -EIO;
		}

		recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
		if (!recover) {
			btrfs_put_bbio(bbio);
			return -ENOMEM;
		}

		atomic_set(&recover->refs, 1);
		recover->bbio = bbio;
		recover->map_length = mapped_length;

		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);

		nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);

		for (mirror_index = 0; mirror_index < nmirrors;
		     mirror_index++) {
			struct scrub_block *sblock;
			struct scrub_page *page;

			sblock = sblocks_for_recheck + mirror_index;
			sblock->sctx = sctx;
			page = kzalloc(sizeof(*page), GFP_NOFS);
			if (!page) {
leave_nomem:
				spin_lock(&sctx->stat_lock);
				sctx->stat.malloc_errors++;
				spin_unlock(&sctx->stat_lock);
				scrub_put_recover(recover);
				return -ENOMEM;
			}
			scrub_page_get(page);
			sblock->pagev[page_index] = page;
			page->logical = logical;

			scrub_stripe_index_and_offset(logical,
						      bbio->map_type,
						      bbio->raid_map,
						      mapped_length,
						      bbio->num_stripes -
						      bbio->num_tgtdevs,
						      mirror_index,
						      &stripe_index,
						      &stripe_offset);
			page->physical = bbio->stripes[stripe_index].physical +
					 stripe_offset;
			page->dev = bbio->stripes[stripe_index].dev;

			BUG_ON(page_index >= original_sblock->page_count);
			page->physical_for_dev_replace =
				original_sblock->pagev[page_index]->
				physical_for_dev_replace;
			/* for missing devices, dev->bdev is NULL */
			page->mirror_num = mirror_index + 1;
			sblock->page_count++;
			page->page = alloc_page(GFP_NOFS);
			if (!page->page)
				goto leave_nomem;

			scrub_get_recover(recover);
			page->recover = recover;
		}
		scrub_put_recover(recover);
		length -= sublen;
		logical += sublen;
		page_index++;
	}

	return 0;
}

struct scrub_bio_ret {
	struct completion event;
	int error;
};

static void scrub_bio_wait_endio(struct bio *bio, int error)
{
	struct scrub_bio_ret *ret = bio->bi_private;

	ret->error = error;
	complete(&ret->event);
}

static inline int scrub_is_page_on_raid56(struct scrub_page *page)
{
	return page->recover &&
	       (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}

static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
					struct bio *bio,
					struct scrub_page *page)
{
	struct scrub_bio_ret done;
	int ret;

	init_completion(&done.event);
	done.error = 0;
	bio->bi_iter.bi_sector = page->logical >> 9;
	bio->bi_private = &done;
	bio->bi_end_io = scrub_bio_wait_endio;

	ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
				    page->recover->map_length,
				    page->mirror_num, 0);
	if (ret)
		return ret;

	wait_for_completion(&done.event);
	if (done.error)
		return -EIO;

	return 0;
}

/*
 * this function will check the on disk data for checksum errors, header
 * errors and read I/O errors. If any I/O errors happen, the exact pages
 * which are errored are marked as being bad.
 * The goal is to enable scrub to take those pages that are not errored
 * from all the mirrors so that the pages that are errored in the just
 * handled mirror can be repaired.
 */
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock, int is_metadata,
				int have_csum, u8 *csum, u64 generation,
				u16 csum_size, int retry_failed_mirror)
{
	int page_num;

	sblock->no_io_error_seen = 1;
	sblock->header_error = 0;
	sblock->checksum_error = 0;

	for (page_num = 0; page_num < sblock->page_count; page_num++) {
		struct bio *bio;
		struct scrub_page *page = sblock->pagev[page_num];

		if (page->dev->bdev == NULL) {
			page->io_error = 1;
			sblock->no_io_error_seen = 0;
			continue;
		}

		WARN_ON(!page->page);
		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
		if (!bio) {
			page->io_error = 1;
			sblock->no_io_error_seen = 0;
			continue;
		}
		bio->bi_bdev = page->dev->bdev;

		bio_add_page(bio, page->page, PAGE_SIZE, 0);
		if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
			if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
				sblock->no_io_error_seen = 0;
		} else {
			bio->bi_iter.bi_sector = page->physical >> 9;

			if (btrfsic_submit_bio_wait(READ, bio))
				sblock->no_io_error_seen = 0;
		}

		bio_put(bio);
	}

	if (sblock->no_io_error_seen)
		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
					     have_csum, csum, generation,
					     csum_size);

	return;
}

static inline int scrub_check_fsid(u8 fsid[],
				   struct scrub_page *spage)
{
	struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
	int ret;

	ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
	return !ret;
}

static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
					 struct scrub_block *sblock,
					 int is_metadata, int have_csum,
					 const u8 *csum, u64 generation,
					 u16 csum_size)
{
	int page_num;
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	u32 crc = ~(u32)0;
	void *mapped_buffer;

	WARN_ON(!sblock->pagev[0]->page);
	if (is_metadata) {
		struct btrfs_header *h;

		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
		h = (struct btrfs_header *)mapped_buffer;

		if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
		    !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
			   BTRFS_UUID_SIZE)) {
			sblock->header_error = 1;
		} else if (generation != btrfs_stack_header_generation(h)) {
			sblock->header_error = 1;
			sblock->generation_error = 1;
		}
		csum = h->csum;
	} else {
		if (!have_csum)
			return;

		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
	}

	for (page_num = 0;;) {
		if (page_num == 0 && is_metadata)
			crc = btrfs_csum_data(
				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
		else
			crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);

		kunmap_atomic(mapped_buffer);
		page_num++;
		if (page_num >= sblock->page_count)
			break;
		WARN_ON(!sblock->pagev[page_num]->page);

		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
	}

	btrfs_csum_final(crc, calculated_csum);
	if (memcmp(calculated_csum, csum, csum_size))
		sblock->checksum_error = 1;
}

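/*
 * Rewrite every page of the bad mirror with the contents of the good
 * mirror; each page is copied via scrub_repair_page_from_good_copy()
 * with force_write set.
 */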
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good)
{
	int page_num;
	int ret = 0;

	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
		int ret_sub;

		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
							   sblock_good,
							   page_num, 1);
		if (ret_sub)
			ret = ret_sub;
	}

	return ret;
}

static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write)
{
	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
	struct scrub_page *page_good = sblock_good->pagev[page_num];

	BUG_ON(page_bad->page == NULL);
	BUG_ON(page_good->page == NULL);
	if (force_write || sblock_bad->header_error ||
	    sblock_bad->checksum_error || page_bad->io_error) {
		struct bio *bio;
		int ret;

		if (!page_bad->dev->bdev) {
			printk_ratelimited(KERN_WARNING "BTRFS: "
				"scrub_repair_page_from_good_copy(bdev == NULL) "
				"is unexpected!\n");
			return -EIO;
		}

		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
		if (!bio)
			return -EIO;
		bio->bi_bdev = page_bad->dev->bdev;
		bio->bi_iter.bi_sector = page_bad->physical >> 9;

		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
		if (PAGE_SIZE != ret) {
			bio_put(bio);
			return -EIO;
		}

		if (btrfsic_submit_bio_wait(WRITE, bio)) {
			btrfs_dev_stat_inc_and_print(page_bad->dev,
				BTRFS_DEV_STAT_WRITE_ERRS);
			btrfs_dev_replace_stats_inc(
				&sblock_bad->sctx->dev_root->fs_info->
				dev_replace.num_write_errors);
			bio_put(bio);
			return -EIO;
		}
		bio_put(bio);
	}

	return 0;
}

static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
{
	int page_num;

	/*
	 * This block is used for the check of the parity on the source device,
	 * so the data needn't be written into the destination device.
	 */
	if (sblock->sparity)
		return;

	for (page_num = 0; page_num < sblock->page_count; page_num++) {
		int ret;

		ret = scrub_write_page_to_dev_replace(sblock, page_num);
		if (ret)
			btrfs_dev_replace_stats_inc(
				&sblock->sctx->dev_root->fs_info->dev_replace.
				num_write_errors);
	}
}

static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num)
{
	struct scrub_page *spage = sblock->pagev[page_num];

	BUG_ON(spage->page == NULL);
	if (spage->io_error) {
		void *mapped_buffer = kmap_atomic(spage->page);

		memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
		flush_dcache_page(spage->page);
		kunmap_atomic(mapped_buffer);
	}
	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
}

static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage)
{
	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
	struct scrub_bio *sbio;
	int ret;

	mutex_lock(&wr_ctx->wr_lock);
again:
	if (!wr_ctx->wr_curr_bio) {
		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
					      GFP_NOFS);
		if (!wr_ctx->wr_curr_bio) {
			mutex_unlock(&wr_ctx->wr_lock);
			return -ENOMEM;
		}
		wr_ctx->wr_curr_bio->sctx = sctx;
		wr_ctx->wr_curr_bio->page_count = 0;
	}
	sbio = wr_ctx->wr_curr_bio;
	if (sbio->page_count == 0) {
		struct bio *bio;

		sbio->physical = spage->physical_for_dev_replace;
		sbio->logical = spage->logical;
		sbio->dev = wr_ctx->tgtdev;
		bio = sbio->bio;
		if (!bio) {
			bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
			if (!bio) {
				mutex_unlock(&wr_ctx->wr_lock);
				return -ENOMEM;
			}
			sbio->bio = bio;
		}

		bio->bi_private = sbio;
		bio->bi_end_io = scrub_wr_bio_end_io;
		bio->bi_bdev = sbio->dev->bdev;
		bio->bi_iter.bi_sector = sbio->physical >> 9;
		sbio->err = 0;
	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
		   spage->physical_for_dev_replace ||
		   sbio->logical + sbio->page_count * PAGE_SIZE !=
		   spage->logical) {
		scrub_wr_submit(sctx);
		goto again;
	}

	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
	if (ret != PAGE_SIZE) {
		if (sbio->page_count < 1) {
			bio_put(sbio->bio);
			sbio->bio = NULL;
			mutex_unlock(&wr_ctx->wr_lock);
			return -EIO;
		}
		scrub_wr_submit(sctx);
		goto again;
	}

	sbio->pagev[sbio->page_count] = spage;
	scrub_page_get(spage);
	sbio->page_count++;
	if (sbio->page_count == wr_ctx->pages_per_wr_bio)
		scrub_wr_submit(sctx);
	mutex_unlock(&wr_ctx->wr_lock);

	return 0;
}

static void scrub_wr_submit(struct scrub_ctx *sctx)
{
	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
	struct scrub_bio *sbio;

	if (!wr_ctx->wr_curr_bio)
		return;

	sbio = wr_ctx->wr_curr_bio;
	wr_ctx->wr_curr_bio = NULL;
	WARN_ON(!sbio->bio->bi_bdev);
	scrub_pending_bio_inc(sctx);
	/*
	 * process all writes in a single worker thread.
	 * Then the block layer orders the requests before sending them to
	 * the driver, which doubled the write performance on spinning disks
	 * when measured with Linux 3.5.
	 */
	btrfsic_submit_bio(WRITE, sbio->bio);
}

static void scrub_wr_bio_end_io(struct bio *bio, int err)
{
	struct scrub_bio *sbio = bio->bi_private;
	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;

	sbio->err = err;
	sbio->bio = bio;

	btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
			 scrub_wr_bio_end_io_worker, NULL, NULL);
	btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}

static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
{
	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
	struct scrub_ctx *sctx = sbio->sctx;
	int i;

	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
	if (sbio->err) {
		struct btrfs_dev_replace *dev_replace =
			&sbio->sctx->dev_root->fs_info->dev_replace;

		for (i = 0; i < sbio->page_count; i++) {
			struct scrub_page *spage = sbio->pagev[i];

			spage->io_error = 1;
			btrfs_dev_replace_stats_inc(&dev_replace->
						    num_write_errors);
		}
	}

	for (i = 0; i < sbio->page_count; i++)
		scrub_page_put(sbio->pagev[i]);

	bio_put(sbio->bio);
	kfree(sbio);
	scrub_pending_bio_dec(sctx);
}

static int scrub_checksum(struct scrub_block *sblock)
{
	u64 flags;
	int ret;

	WARN_ON(sblock->page_count < 1);
	flags = sblock->pagev[0]->flags;
	ret = 0;
	if (flags & BTRFS_EXTENT_FLAG_DATA)
		ret = scrub_checksum_data(sblock);
	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
		ret = scrub_checksum_tree_block(sblock);
	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
		(void)scrub_checksum_super(sblock);
	else
		WARN_ON(1);
	if (ret)
		scrub_handle_errored_block(sblock);

	return ret;
}

static int scrub_checksum_data(struct scrub_block *sblock)
{
	struct scrub_ctx *sctx = sblock->sctx;
	u8 csum[BTRFS_CSUM_SIZE];
	u8 *on_disk_csum;
	struct page *page;
	void *buffer;
	u32 crc = ~(u32)0;
	int fail = 0;
	u64 len;
	int index;

	BUG_ON(sblock->page_count < 1);
	if (!sblock->pagev[0]->have_csum)
		return 0;

	on_disk_csum = sblock->pagev[0]->csum;
	page = sblock->pagev[0]->page;
	buffer = kmap_atomic(page);

	len = sctx->sectorsize;
	index = 0;
	for (;;) {
		u64 l = min_t(u64, len, PAGE_SIZE);

		crc = btrfs_csum_data(buffer, crc, l);
		kunmap_atomic(buffer);
		len -= l;
		if (len == 0)
			break;
		index++;
		BUG_ON(index >= sblock->page_count);
		BUG_ON(!sblock->pagev[index]->page);
		page = sblock->pagev[index]->page;
		buffer = kmap_atomic(page);
	}

	btrfs_csum_final(crc, csum);
	if (memcmp(csum, on_disk_csum, sctx->csum_size))
		fail = 1;

	return fail;
}

static int scrub_checksum_tree_block(struct scrub_block *sblock)
{
	struct scrub_ctx *sctx = sblock->sctx;
	struct btrfs_header *h;
	struct btrfs_root *root = sctx->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u8 calculated_csum[BTRFS_CSUM_SIZE];
	u8 on_disk_csum[BTRFS_CSUM_SIZE];
	struct page *page;
	void *mapped_buffer;
	u64 mapped_size;
	void *p;
	u32 crc = ~(u32)0;
	int fail = 0;
	int crc_fail = 0;
u64 len; 1917 int index; 1918 1919 BUG_ON(sblock->page_count < 1); 1920 page = sblock->pagev[0]->page; 1921 mapped_buffer = kmap_atomic(page); 1922 h = (struct btrfs_header *)mapped_buffer; 1923 memcpy(on_disk_csum, h->csum, sctx->csum_size); 1924 1925 /* 1926 * we don't use the getter functions here, as we 1927 * a) don't have an extent buffer and 1928 * b) the page is already kmapped 1929 */ 1930 1931 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h)) 1932 ++fail; 1933 1934 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) 1935 ++fail; 1936 1937 if (!scrub_check_fsid(h->fsid, sblock->pagev[0])) 1938 ++fail; 1939 1940 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1941 BTRFS_UUID_SIZE)) 1942 ++fail; 1943 1944 len = sctx->nodesize - BTRFS_CSUM_SIZE; 1945 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1946 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1947 index = 0; 1948 for (;;) { 1949 u64 l = min_t(u64, len, mapped_size); 1950 1951 crc = btrfs_csum_data(p, crc, l); 1952 kunmap_atomic(mapped_buffer); 1953 len -= l; 1954 if (len == 0) 1955 break; 1956 index++; 1957 BUG_ON(index >= sblock->page_count); 1958 BUG_ON(!sblock->pagev[index]->page); 1959 page = sblock->pagev[index]->page; 1960 mapped_buffer = kmap_atomic(page); 1961 mapped_size = PAGE_SIZE; 1962 p = mapped_buffer; 1963 } 1964 1965 btrfs_csum_final(crc, calculated_csum); 1966 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 1967 ++crc_fail; 1968 1969 return fail || crc_fail; 1970 } 1971 1972 static int scrub_checksum_super(struct scrub_block *sblock) 1973 { 1974 struct btrfs_super_block *s; 1975 struct scrub_ctx *sctx = sblock->sctx; 1976 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1977 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1978 struct page *page; 1979 void *mapped_buffer; 1980 u64 mapped_size; 1981 void *p; 1982 u32 crc = ~(u32)0; 1983 int fail_gen = 0; 1984 int fail_cor = 0; 1985 u64 len; 1986 int index; 1987 1988 BUG_ON(sblock->page_count < 1); 1989 page = sblock->pagev[0]->page; 1990 mapped_buffer = kmap_atomic(page); 1991 s = (struct btrfs_super_block *)mapped_buffer; 1992 memcpy(on_disk_csum, s->csum, sctx->csum_size); 1993 1994 if (sblock->pagev[0]->logical != btrfs_super_bytenr(s)) 1995 ++fail_cor; 1996 1997 if (sblock->pagev[0]->generation != btrfs_super_generation(s)) 1998 ++fail_gen; 1999 2000 if (!scrub_check_fsid(s->fsid, sblock->pagev[0])) 2001 ++fail_cor; 2002 2003 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; 2004 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 2005 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 2006 index = 0; 2007 for (;;) { 2008 u64 l = min_t(u64, len, mapped_size); 2009 2010 crc = btrfs_csum_data(p, crc, l); 2011 kunmap_atomic(mapped_buffer); 2012 len -= l; 2013 if (len == 0) 2014 break; 2015 index++; 2016 BUG_ON(index >= sblock->page_count); 2017 BUG_ON(!sblock->pagev[index]->page); 2018 page = sblock->pagev[index]->page; 2019 mapped_buffer = kmap_atomic(page); 2020 mapped_size = PAGE_SIZE; 2021 p = mapped_buffer; 2022 } 2023 2024 btrfs_csum_final(crc, calculated_csum); 2025 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) 2026 ++fail_cor; 2027 2028 if (fail_cor + fail_gen) { 2029 /* 2030 * if we find an error in a super block, we just report it. 
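 * (A mismatching bytenr, fsid or checksum counts as a corruption
 * error, a wrong generation as a generation error; the two are
 * accounted to different device statistics below.)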
2031 * They will get written with the next transaction commit 2032 * anyway 2033 */ 2034 spin_lock(&sctx->stat_lock); 2035 ++sctx->stat.super_errors; 2036 spin_unlock(&sctx->stat_lock); 2037 if (fail_cor) 2038 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, 2039 BTRFS_DEV_STAT_CORRUPTION_ERRS); 2040 else 2041 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev, 2042 BTRFS_DEV_STAT_GENERATION_ERRS); 2043 } 2044 2045 return fail_cor + fail_gen; 2046 } 2047 2048 static void scrub_block_get(struct scrub_block *sblock) 2049 { 2050 atomic_inc(&sblock->refs); 2051 } 2052 2053 static void scrub_block_put(struct scrub_block *sblock) 2054 { 2055 if (atomic_dec_and_test(&sblock->refs)) { 2056 int i; 2057 2058 if (sblock->sparity) 2059 scrub_parity_put(sblock->sparity); 2060 2061 for (i = 0; i < sblock->page_count; i++) 2062 scrub_page_put(sblock->pagev[i]); 2063 kfree(sblock); 2064 } 2065 } 2066 2067 static void scrub_page_get(struct scrub_page *spage) 2068 { 2069 atomic_inc(&spage->refs); 2070 } 2071 2072 static void scrub_page_put(struct scrub_page *spage) 2073 { 2074 if (atomic_dec_and_test(&spage->refs)) { 2075 if (spage->page) 2076 __free_page(spage->page); 2077 kfree(spage); 2078 } 2079 } 2080 2081 static void scrub_submit(struct scrub_ctx *sctx) 2082 { 2083 struct scrub_bio *sbio; 2084 2085 if (sctx->curr == -1) 2086 return; 2087 2088 sbio = sctx->bios[sctx->curr]; 2089 sctx->curr = -1; 2090 scrub_pending_bio_inc(sctx); 2091 2092 if (!sbio->bio->bi_bdev) { 2093 /* 2094 * this case should not happen. If btrfs_map_block() is 2095 * wrong, it could happen for dev-replace operations on 2096 * missing devices when no mirrors are available, but in 2097 * this case it should already fail the mount. 2098 * This case is handled correctly (but _very_ slowly). 
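 * Failing the bio with -EIO below feeds the normal error path:
 * scrub_bio_end_io_worker() marks every page of the affected blocks
 * as an I/O error and scrub_handle_errored_block() then tries to
 * recover the data from the remaining mirrors.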
2099 */ 2100 printk_ratelimited(KERN_WARNING 2101 "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n"); 2102 bio_endio(sbio->bio, -EIO); 2103 } else { 2104 btrfsic_submit_bio(READ, sbio->bio); 2105 } 2106 } 2107 2108 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, 2109 struct scrub_page *spage) 2110 { 2111 struct scrub_block *sblock = spage->sblock; 2112 struct scrub_bio *sbio; 2113 int ret; 2114 2115 again: 2116 /* 2117 * grab a fresh bio or wait for one to become available 2118 */ 2119 while (sctx->curr == -1) { 2120 spin_lock(&sctx->list_lock); 2121 sctx->curr = sctx->first_free; 2122 if (sctx->curr != -1) { 2123 sctx->first_free = sctx->bios[sctx->curr]->next_free; 2124 sctx->bios[sctx->curr]->next_free = -1; 2125 sctx->bios[sctx->curr]->page_count = 0; 2126 spin_unlock(&sctx->list_lock); 2127 } else { 2128 spin_unlock(&sctx->list_lock); 2129 wait_event(sctx->list_wait, sctx->first_free != -1); 2130 } 2131 } 2132 sbio = sctx->bios[sctx->curr]; 2133 if (sbio->page_count == 0) { 2134 struct bio *bio; 2135 2136 sbio->physical = spage->physical; 2137 sbio->logical = spage->logical; 2138 sbio->dev = spage->dev; 2139 bio = sbio->bio; 2140 if (!bio) { 2141 bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio); 2142 if (!bio) 2143 return -ENOMEM; 2144 sbio->bio = bio; 2145 } 2146 2147 bio->bi_private = sbio; 2148 bio->bi_end_io = scrub_bio_end_io; 2149 bio->bi_bdev = sbio->dev->bdev; 2150 bio->bi_iter.bi_sector = sbio->physical >> 9; 2151 sbio->err = 0; 2152 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 2153 spage->physical || 2154 sbio->logical + sbio->page_count * PAGE_SIZE != 2155 spage->logical || 2156 sbio->dev != spage->dev) { 2157 scrub_submit(sctx); 2158 goto again; 2159 } 2160 2161 sbio->pagev[sbio->page_count] = spage; 2162 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); 2163 if (ret != PAGE_SIZE) { 2164 if (sbio->page_count < 1) { 2165 bio_put(sbio->bio); 2166 sbio->bio = NULL; 2167 return -EIO; 2168 } 2169 scrub_submit(sctx); 2170 goto again; 2171 } 2172 2173 scrub_block_get(sblock); /* one for the page added to the bio */ 2174 atomic_inc(&sblock->outstanding_pages); 2175 sbio->page_count++; 2176 if (sbio->page_count == sctx->pages_per_rd_bio) 2177 scrub_submit(sctx); 2178 2179 return 0; 2180 } 2181 2182 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 2183 u64 physical, struct btrfs_device *dev, u64 flags, 2184 u64 gen, int mirror_num, u8 *csum, int force, 2185 u64 physical_for_dev_replace) 2186 { 2187 struct scrub_block *sblock; 2188 int index; 2189 2190 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 2191 if (!sblock) { 2192 spin_lock(&sctx->stat_lock); 2193 sctx->stat.malloc_errors++; 2194 spin_unlock(&sctx->stat_lock); 2195 return -ENOMEM; 2196 } 2197 2198 /* one ref inside this function, plus one for each page added to 2199 * a bio later on */ 2200 atomic_set(&sblock->refs, 1); 2201 sblock->sctx = sctx; 2202 sblock->no_io_error_seen = 1; 2203 2204 for (index = 0; len > 0; index++) { 2205 struct scrub_page *spage; 2206 u64 l = min_t(u64, len, PAGE_SIZE); 2207 2208 spage = kzalloc(sizeof(*spage), GFP_NOFS); 2209 if (!spage) { 2210 leave_nomem: 2211 spin_lock(&sctx->stat_lock); 2212 sctx->stat.malloc_errors++; 2213 spin_unlock(&sctx->stat_lock); 2214 scrub_block_put(sblock); 2215 return -ENOMEM; 2216 } 2217 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 2218 scrub_page_get(spage); 2219 sblock->pagev[index] = spage; 2220 spage->sblock = sblock; 2221 spage->dev = dev; 2222 spage->flags = flags; 2223 spage->generation = 
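/*
 * scrub_add_page_to_rd_bio() batches these pages: pages that are
 * contiguous both logically and physically on the same device are
 * merged into a single read bio of up to SCRUB_PAGES_PER_RD_BIO
 * pages; a discontiguity, or a full bio, triggers scrub_submit().
 * The block takes one extra reference per page queued in a bio, so
 * it is freed only after scrub_pages() has dropped its own reference
 * and every queued page has completed.
 */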
gen; 2224 spage->logical = logical; 2225 spage->physical = physical; 2226 spage->physical_for_dev_replace = physical_for_dev_replace; 2227 spage->mirror_num = mirror_num; 2228 if (csum) { 2229 spage->have_csum = 1; 2230 memcpy(spage->csum, csum, sctx->csum_size); 2231 } else { 2232 spage->have_csum = 0; 2233 } 2234 sblock->page_count++; 2235 spage->page = alloc_page(GFP_NOFS); 2236 if (!spage->page) 2237 goto leave_nomem; 2238 len -= l; 2239 logical += l; 2240 physical += l; 2241 physical_for_dev_replace += l; 2242 } 2243 2244 WARN_ON(sblock->page_count == 0); 2245 for (index = 0; index < sblock->page_count; index++) { 2246 struct scrub_page *spage = sblock->pagev[index]; 2247 int ret; 2248 2249 ret = scrub_add_page_to_rd_bio(sctx, spage); 2250 if (ret) { 2251 scrub_block_put(sblock); 2252 return ret; 2253 } 2254 } 2255 2256 if (force) 2257 scrub_submit(sctx); 2258 2259 /* last one frees, either here or in bio completion for last page */ 2260 scrub_block_put(sblock); 2261 return 0; 2262 } 2263 2264 static void scrub_bio_end_io(struct bio *bio, int err) 2265 { 2266 struct scrub_bio *sbio = bio->bi_private; 2267 struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; 2268 2269 sbio->err = err; 2270 sbio->bio = bio; 2271 2272 btrfs_queue_work(fs_info->scrub_workers, &sbio->work); 2273 } 2274 2275 static void scrub_bio_end_io_worker(struct btrfs_work *work) 2276 { 2277 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 2278 struct scrub_ctx *sctx = sbio->sctx; 2279 int i; 2280 2281 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO); 2282 if (sbio->err) { 2283 for (i = 0; i < sbio->page_count; i++) { 2284 struct scrub_page *spage = sbio->pagev[i]; 2285 2286 spage->io_error = 1; 2287 spage->sblock->no_io_error_seen = 0; 2288 } 2289 } 2290 2291 /* now complete the scrub_block items that have all pages completed */ 2292 for (i = 0; i < sbio->page_count; i++) { 2293 struct scrub_page *spage = sbio->pagev[i]; 2294 struct scrub_block *sblock = spage->sblock; 2295 2296 if (atomic_dec_and_test(&sblock->outstanding_pages)) 2297 scrub_block_complete(sblock); 2298 scrub_block_put(sblock); 2299 } 2300 2301 bio_put(sbio->bio); 2302 sbio->bio = NULL; 2303 spin_lock(&sctx->list_lock); 2304 sbio->next_free = sctx->first_free; 2305 sctx->first_free = sbio->index; 2306 spin_unlock(&sctx->list_lock); 2307 2308 if (sctx->is_dev_replace && 2309 atomic_read(&sctx->wr_ctx.flush_all_writes)) { 2310 mutex_lock(&sctx->wr_ctx.wr_lock); 2311 scrub_wr_submit(sctx); 2312 mutex_unlock(&sctx->wr_ctx.wr_lock); 2313 } 2314 2315 scrub_pending_bio_dec(sctx); 2316 } 2317 2318 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, 2319 unsigned long *bitmap, 2320 u64 start, u64 len) 2321 { 2322 int offset; 2323 int nsectors; 2324 int sectorsize = sparity->sctx->dev_root->sectorsize; 2325 2326 if (len >= sparity->stripe_len) { 2327 bitmap_set(bitmap, 0, sparity->nsectors); 2328 return; 2329 } 2330 2331 start -= sparity->logic_start; 2332 offset = (int)do_div(start, sparity->stripe_len); 2333 offset /= sectorsize; 2334 nsectors = (int)len / sectorsize; 2335 2336 if (offset + nsectors <= sparity->nsectors) { 2337 bitmap_set(bitmap, offset, nsectors); 2338 return; 2339 } 2340 2341 bitmap_set(bitmap, offset, sparity->nsectors - offset); 2342 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset)); 2343 } 2344 2345 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, 2346 u64 start, u64 len) 2347 { 2348 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); 2349 
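/*
 * __scrub_mark_bitmap() converts the logical range [start, start + len)
 * into sector bits relative to the start of the current parity stripe,
 * wrapping around when the range crosses the stripe boundary.
 * E.g. with a 64K stripe_len and 4K sectors (16 bits per stripe), a
 * start offset of 60K and a len of 8K sets bit 15 and bit 0.  dbitmap
 * marks the sectors that are covered by extents and need their parity
 * checked, ebitmap marks sectors whose data could not be read or
 * verified.
 */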
} 2350 2351 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, 2352 u64 start, u64 len) 2353 { 2354 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); 2355 } 2356 2357 static void scrub_block_complete(struct scrub_block *sblock) 2358 { 2359 int corrupted = 0; 2360 2361 if (!sblock->no_io_error_seen) { 2362 corrupted = 1; 2363 scrub_handle_errored_block(sblock); 2364 } else { 2365 /* 2366 * if has checksum error, write via repair mechanism in 2367 * dev replace case, otherwise write here in dev replace 2368 * case. 2369 */ 2370 corrupted = scrub_checksum(sblock); 2371 if (!corrupted && sblock->sctx->is_dev_replace) 2372 scrub_write_block_to_dev_replace(sblock); 2373 } 2374 2375 if (sblock->sparity && corrupted && !sblock->data_corrected) { 2376 u64 start = sblock->pagev[0]->logical; 2377 u64 end = sblock->pagev[sblock->page_count - 1]->logical + 2378 PAGE_SIZE; 2379 2380 scrub_parity_mark_sectors_error(sblock->sparity, 2381 start, end - start); 2382 } 2383 } 2384 2385 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, 2386 u8 *csum) 2387 { 2388 struct btrfs_ordered_sum *sum = NULL; 2389 unsigned long index; 2390 unsigned long num_sectors; 2391 2392 while (!list_empty(&sctx->csum_list)) { 2393 sum = list_first_entry(&sctx->csum_list, 2394 struct btrfs_ordered_sum, list); 2395 if (sum->bytenr > logical) 2396 return 0; 2397 if (sum->bytenr + sum->len > logical) 2398 break; 2399 2400 ++sctx->stat.csum_discards; 2401 list_del(&sum->list); 2402 kfree(sum); 2403 sum = NULL; 2404 } 2405 if (!sum) 2406 return 0; 2407 2408 index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize; 2409 num_sectors = sum->len / sctx->sectorsize; 2410 memcpy(csum, sum->sums + index, sctx->csum_size); 2411 if (index == num_sectors - 1) { 2412 list_del(&sum->list); 2413 kfree(sum); 2414 } 2415 return 1; 2416 } 2417 2418 /* scrub extent tries to collect up to 64 kB for each bio */ 2419 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, 2420 u64 physical, struct btrfs_device *dev, u64 flags, 2421 u64 gen, int mirror_num, u64 physical_for_dev_replace) 2422 { 2423 int ret; 2424 u8 csum[BTRFS_CSUM_SIZE]; 2425 u32 blocksize; 2426 2427 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2428 blocksize = sctx->sectorsize; 2429 spin_lock(&sctx->stat_lock); 2430 sctx->stat.data_extents_scrubbed++; 2431 sctx->stat.data_bytes_scrubbed += len; 2432 spin_unlock(&sctx->stat_lock); 2433 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2434 blocksize = sctx->nodesize; 2435 spin_lock(&sctx->stat_lock); 2436 sctx->stat.tree_extents_scrubbed++; 2437 sctx->stat.tree_bytes_scrubbed += len; 2438 spin_unlock(&sctx->stat_lock); 2439 } else { 2440 blocksize = sctx->sectorsize; 2441 WARN_ON(1); 2442 } 2443 2444 while (len) { 2445 u64 l = min_t(u64, len, blocksize); 2446 int have_csum = 0; 2447 2448 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2449 /* push csums to sbio */ 2450 have_csum = scrub_find_csum(sctx, logical, l, csum); 2451 if (have_csum == 0) 2452 ++sctx->stat.no_csum; 2453 if (sctx->is_dev_replace && !have_csum) { 2454 ret = copy_nocow_pages(sctx, logical, l, 2455 mirror_num, 2456 physical_for_dev_replace); 2457 goto behind_scrub_pages; 2458 } 2459 } 2460 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, 2461 mirror_num, have_csum ? 
csum : NULL, 0, 2462 physical_for_dev_replace); 2463 behind_scrub_pages: 2464 if (ret) 2465 return ret; 2466 len -= l; 2467 logical += l; 2468 physical += l; 2469 physical_for_dev_replace += l; 2470 } 2471 return 0; 2472 } 2473 2474 static int scrub_pages_for_parity(struct scrub_parity *sparity, 2475 u64 logical, u64 len, 2476 u64 physical, struct btrfs_device *dev, 2477 u64 flags, u64 gen, int mirror_num, u8 *csum) 2478 { 2479 struct scrub_ctx *sctx = sparity->sctx; 2480 struct scrub_block *sblock; 2481 int index; 2482 2483 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 2484 if (!sblock) { 2485 spin_lock(&sctx->stat_lock); 2486 sctx->stat.malloc_errors++; 2487 spin_unlock(&sctx->stat_lock); 2488 return -ENOMEM; 2489 } 2490 2491 /* one ref inside this function, plus one for each page added to 2492 * a bio later on */ 2493 atomic_set(&sblock->refs, 1); 2494 sblock->sctx = sctx; 2495 sblock->no_io_error_seen = 1; 2496 sblock->sparity = sparity; 2497 scrub_parity_get(sparity); 2498 2499 for (index = 0; len > 0; index++) { 2500 struct scrub_page *spage; 2501 u64 l = min_t(u64, len, PAGE_SIZE); 2502 2503 spage = kzalloc(sizeof(*spage), GFP_NOFS); 2504 if (!spage) { 2505 leave_nomem: 2506 spin_lock(&sctx->stat_lock); 2507 sctx->stat.malloc_errors++; 2508 spin_unlock(&sctx->stat_lock); 2509 scrub_block_put(sblock); 2510 return -ENOMEM; 2511 } 2512 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 2513 /* For scrub block */ 2514 scrub_page_get(spage); 2515 sblock->pagev[index] = spage; 2516 /* For scrub parity */ 2517 scrub_page_get(spage); 2518 list_add_tail(&spage->list, &sparity->spages); 2519 spage->sblock = sblock; 2520 spage->dev = dev; 2521 spage->flags = flags; 2522 spage->generation = gen; 2523 spage->logical = logical; 2524 spage->physical = physical; 2525 spage->mirror_num = mirror_num; 2526 if (csum) { 2527 spage->have_csum = 1; 2528 memcpy(spage->csum, csum, sctx->csum_size); 2529 } else { 2530 spage->have_csum = 0; 2531 } 2532 sblock->page_count++; 2533 spage->page = alloc_page(GFP_NOFS); 2534 if (!spage->page) 2535 goto leave_nomem; 2536 len -= l; 2537 logical += l; 2538 physical += l; 2539 } 2540 2541 WARN_ON(sblock->page_count == 0); 2542 for (index = 0; index < sblock->page_count; index++) { 2543 struct scrub_page *spage = sblock->pagev[index]; 2544 int ret; 2545 2546 ret = scrub_add_page_to_rd_bio(sctx, spage); 2547 if (ret) { 2548 scrub_block_put(sblock); 2549 return ret; 2550 } 2551 } 2552 2553 /* last one frees, either here or in bio completion for last page */ 2554 scrub_block_put(sblock); 2555 return 0; 2556 } 2557 2558 static int scrub_extent_for_parity(struct scrub_parity *sparity, 2559 u64 logical, u64 len, 2560 u64 physical, struct btrfs_device *dev, 2561 u64 flags, u64 gen, int mirror_num) 2562 { 2563 struct scrub_ctx *sctx = sparity->sctx; 2564 int ret; 2565 u8 csum[BTRFS_CSUM_SIZE]; 2566 u32 blocksize; 2567 2568 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2569 blocksize = sctx->sectorsize; 2570 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2571 blocksize = sctx->nodesize; 2572 } else { 2573 blocksize = sctx->sectorsize; 2574 WARN_ON(1); 2575 } 2576 2577 while (len) { 2578 u64 l = min_t(u64, len, blocksize); 2579 int have_csum = 0; 2580 2581 if (flags & BTRFS_EXTENT_FLAG_DATA) { 2582 /* push csums to sbio */ 2583 have_csum = scrub_find_csum(sctx, logical, l, csum); 2584 if (have_csum == 0) 2585 goto skip; 2586 } 2587 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev, 2588 flags, gen, mirror_num, 2589 have_csum ? 
csum : NULL); 2590 if (ret) 2591 return ret; 2592 skip: 2593 len -= l; 2594 logical += l; 2595 physical += l; 2596 } 2597 return 0; 2598 } 2599 2600 /* 2601 * Given a physical address, this will calculate it's 2602 * logical offset. if this is a parity stripe, it will return 2603 * the most left data stripe's logical offset. 2604 * 2605 * return 0 if it is a data stripe, 1 means parity stripe. 2606 */ 2607 static int get_raid56_logic_offset(u64 physical, int num, 2608 struct map_lookup *map, u64 *offset, 2609 u64 *stripe_start) 2610 { 2611 int i; 2612 int j = 0; 2613 u64 stripe_nr; 2614 u64 last_offset; 2615 int stripe_index; 2616 int rot; 2617 2618 last_offset = (physical - map->stripes[num].physical) * 2619 nr_data_stripes(map); 2620 if (stripe_start) 2621 *stripe_start = last_offset; 2622 2623 *offset = last_offset; 2624 for (i = 0; i < nr_data_stripes(map); i++) { 2625 *offset = last_offset + i * map->stripe_len; 2626 2627 stripe_nr = *offset; 2628 do_div(stripe_nr, map->stripe_len); 2629 do_div(stripe_nr, nr_data_stripes(map)); 2630 2631 /* Work out the disk rotation on this stripe-set */ 2632 rot = do_div(stripe_nr, map->num_stripes); 2633 /* calculate which stripe this data locates */ 2634 rot += i; 2635 stripe_index = rot % map->num_stripes; 2636 if (stripe_index == num) 2637 return 0; 2638 if (stripe_index < num) 2639 j++; 2640 } 2641 *offset = last_offset + j * map->stripe_len; 2642 return 1; 2643 } 2644 2645 static void scrub_free_parity(struct scrub_parity *sparity) 2646 { 2647 struct scrub_ctx *sctx = sparity->sctx; 2648 struct scrub_page *curr, *next; 2649 int nbits; 2650 2651 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); 2652 if (nbits) { 2653 spin_lock(&sctx->stat_lock); 2654 sctx->stat.read_errors += nbits; 2655 sctx->stat.uncorrectable_errors += nbits; 2656 spin_unlock(&sctx->stat_lock); 2657 } 2658 2659 list_for_each_entry_safe(curr, next, &sparity->spages, list) { 2660 list_del_init(&curr->list); 2661 scrub_page_put(curr); 2662 } 2663 2664 kfree(sparity); 2665 } 2666 2667 static void scrub_parity_bio_endio(struct bio *bio, int error) 2668 { 2669 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; 2670 struct scrub_ctx *sctx = sparity->sctx; 2671 2672 if (error) 2673 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, 2674 sparity->nsectors); 2675 2676 scrub_free_parity(sparity); 2677 scrub_pending_bio_dec(sctx); 2678 bio_put(bio); 2679 } 2680 2681 static void scrub_parity_check_and_repair(struct scrub_parity *sparity) 2682 { 2683 struct scrub_ctx *sctx = sparity->sctx; 2684 struct bio *bio; 2685 struct btrfs_raid_bio *rbio; 2686 struct scrub_page *spage; 2687 struct btrfs_bio *bbio = NULL; 2688 u64 length; 2689 int ret; 2690 2691 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap, 2692 sparity->nsectors)) 2693 goto out; 2694 2695 length = sparity->logic_end - sparity->logic_start + 1; 2696 ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, 2697 sparity->logic_start, 2698 &length, &bbio, 0, 1); 2699 if (ret || !bbio || !bbio->raid_map) 2700 goto bbio_out; 2701 2702 bio = btrfs_io_bio_alloc(GFP_NOFS, 0); 2703 if (!bio) 2704 goto bbio_out; 2705 2706 bio->bi_iter.bi_sector = sparity->logic_start >> 9; 2707 bio->bi_private = sparity; 2708 bio->bi_end_io = scrub_parity_bio_endio; 2709 2710 rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio, 2711 length, sparity->scrub_dev, 2712 sparity->dbitmap, 2713 sparity->nsectors); 2714 if (!rbio) 2715 goto rbio_out; 2716 2717 list_for_each_entry(spage, 
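/*
 * Only sectors whose data was read and verified successfully are
 * left in dbitmap at this point (the bitmap_andnot() above removed
 * everything recorded in ebitmap).  The scrub rbio gets all data
 * pages collected on the spages list, so the raid56 layer can
 * recompute the parity for the marked sectors and rewrite it if it
 * does not match what is on disk.
 */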
&sparity->spages, list) 2718 raid56_parity_add_scrub_pages(rbio, spage->page, 2719 spage->logical); 2720 2721 scrub_pending_bio_inc(sctx); 2722 raid56_parity_submit_scrub_rbio(rbio); 2723 return; 2724 2725 rbio_out: 2726 bio_put(bio); 2727 bbio_out: 2728 btrfs_put_bbio(bbio); 2729 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, 2730 sparity->nsectors); 2731 spin_lock(&sctx->stat_lock); 2732 sctx->stat.malloc_errors++; 2733 spin_unlock(&sctx->stat_lock); 2734 out: 2735 scrub_free_parity(sparity); 2736 } 2737 2738 static inline int scrub_calc_parity_bitmap_len(int nsectors) 2739 { 2740 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8); 2741 } 2742 2743 static void scrub_parity_get(struct scrub_parity *sparity) 2744 { 2745 atomic_inc(&sparity->refs); 2746 } 2747 2748 static void scrub_parity_put(struct scrub_parity *sparity) 2749 { 2750 if (!atomic_dec_and_test(&sparity->refs)) 2751 return; 2752 2753 scrub_parity_check_and_repair(sparity); 2754 } 2755 2756 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, 2757 struct map_lookup *map, 2758 struct btrfs_device *sdev, 2759 struct btrfs_path *path, 2760 u64 logic_start, 2761 u64 logic_end) 2762 { 2763 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 2764 struct btrfs_root *root = fs_info->extent_root; 2765 struct btrfs_root *csum_root = fs_info->csum_root; 2766 struct btrfs_extent_item *extent; 2767 u64 flags; 2768 int ret; 2769 int slot; 2770 struct extent_buffer *l; 2771 struct btrfs_key key; 2772 u64 generation; 2773 u64 extent_logical; 2774 u64 extent_physical; 2775 u64 extent_len; 2776 struct btrfs_device *extent_dev; 2777 struct scrub_parity *sparity; 2778 int nsectors; 2779 int bitmap_len; 2780 int extent_mirror_num; 2781 int stop_loop = 0; 2782 2783 nsectors = map->stripe_len / root->sectorsize; 2784 bitmap_len = scrub_calc_parity_bitmap_len(nsectors); 2785 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, 2786 GFP_NOFS); 2787 if (!sparity) { 2788 spin_lock(&sctx->stat_lock); 2789 sctx->stat.malloc_errors++; 2790 spin_unlock(&sctx->stat_lock); 2791 return -ENOMEM; 2792 } 2793 2794 sparity->stripe_len = map->stripe_len; 2795 sparity->nsectors = nsectors; 2796 sparity->sctx = sctx; 2797 sparity->scrub_dev = sdev; 2798 sparity->logic_start = logic_start; 2799 sparity->logic_end = logic_end; 2800 atomic_set(&sparity->refs, 1); 2801 INIT_LIST_HEAD(&sparity->spages); 2802 sparity->dbitmap = sparity->bitmap; 2803 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; 2804 2805 ret = 0; 2806 while (logic_start < logic_end) { 2807 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 2808 key.type = BTRFS_METADATA_ITEM_KEY; 2809 else 2810 key.type = BTRFS_EXTENT_ITEM_KEY; 2811 key.objectid = logic_start; 2812 key.offset = (u64)-1; 2813 2814 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2815 if (ret < 0) 2816 goto out; 2817 2818 if (ret > 0) { 2819 ret = btrfs_previous_extent_item(root, path, 0); 2820 if (ret < 0) 2821 goto out; 2822 if (ret > 0) { 2823 btrfs_release_path(path); 2824 ret = btrfs_search_slot(NULL, root, &key, 2825 path, 0, 0); 2826 if (ret < 0) 2827 goto out; 2828 } 2829 } 2830 2831 stop_loop = 0; 2832 while (1) { 2833 u64 bytes; 2834 2835 l = path->nodes[0]; 2836 slot = path->slots[0]; 2837 if (slot >= btrfs_header_nritems(l)) { 2838 ret = btrfs_next_leaf(root, path); 2839 if (ret == 0) 2840 continue; 2841 if (ret < 0) 2842 goto out; 2843 2844 stop_loop = 1; 2845 break; 2846 } 2847 btrfs_item_key_to_cpu(l, &key, slot); 2848 2849 if (key.type == 
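/*
 * For metadata items key.offset holds the tree level, not a byte
 * count, so the extent size has to be taken from nodesize; for
 * regular extent items key.offset is the extent length in bytes.
 */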
BTRFS_METADATA_ITEM_KEY) 2850 bytes = root->nodesize; 2851 else 2852 bytes = key.offset; 2853 2854 if (key.objectid + bytes <= logic_start) 2855 goto next; 2856 2857 if (key.type != BTRFS_EXTENT_ITEM_KEY && 2858 key.type != BTRFS_METADATA_ITEM_KEY) 2859 goto next; 2860 2861 if (key.objectid > logic_end) { 2862 stop_loop = 1; 2863 break; 2864 } 2865 2866 while (key.objectid >= logic_start + map->stripe_len) 2867 logic_start += map->stripe_len; 2868 2869 extent = btrfs_item_ptr(l, slot, 2870 struct btrfs_extent_item); 2871 flags = btrfs_extent_flags(l, extent); 2872 generation = btrfs_extent_generation(l, extent); 2873 2874 if (key.objectid < logic_start && 2875 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { 2876 btrfs_err(fs_info, 2877 "scrub: tree block %llu spanning stripes, ignored. logical=%llu", 2878 key.objectid, logic_start); 2879 goto next; 2880 } 2881 again: 2882 extent_logical = key.objectid; 2883 extent_len = bytes; 2884 2885 if (extent_logical < logic_start) { 2886 extent_len -= logic_start - extent_logical; 2887 extent_logical = logic_start; 2888 } 2889 2890 if (extent_logical + extent_len > 2891 logic_start + map->stripe_len) 2892 extent_len = logic_start + map->stripe_len - 2893 extent_logical; 2894 2895 scrub_parity_mark_sectors_data(sparity, extent_logical, 2896 extent_len); 2897 2898 scrub_remap_extent(fs_info, extent_logical, 2899 extent_len, &extent_physical, 2900 &extent_dev, 2901 &extent_mirror_num); 2902 2903 ret = btrfs_lookup_csums_range(csum_root, 2904 extent_logical, 2905 extent_logical + extent_len - 1, 2906 &sctx->csum_list, 1); 2907 if (ret) 2908 goto out; 2909 2910 ret = scrub_extent_for_parity(sparity, extent_logical, 2911 extent_len, 2912 extent_physical, 2913 extent_dev, flags, 2914 generation, 2915 extent_mirror_num); 2916 if (ret) 2917 goto out; 2918 2919 scrub_free_csums(sctx); 2920 if (extent_logical + extent_len < 2921 key.objectid + bytes) { 2922 logic_start += map->stripe_len; 2923 2924 if (logic_start >= logic_end) { 2925 stop_loop = 1; 2926 break; 2927 } 2928 2929 if (logic_start < key.objectid + bytes) { 2930 cond_resched(); 2931 goto again; 2932 } 2933 } 2934 next: 2935 path->slots[0]++; 2936 } 2937 2938 btrfs_release_path(path); 2939 2940 if (stop_loop) 2941 break; 2942 2943 logic_start += map->stripe_len; 2944 } 2945 out: 2946 if (ret < 0) 2947 scrub_parity_mark_sectors_error(sparity, logic_start, 2948 logic_end - logic_start + 1); 2949 scrub_parity_put(sparity); 2950 scrub_submit(sctx); 2951 mutex_lock(&sctx->wr_ctx.wr_lock); 2952 scrub_wr_submit(sctx); 2953 mutex_unlock(&sctx->wr_ctx.wr_lock); 2954 2955 btrfs_release_path(path); 2956 return ret < 0 ? 
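/*
 * The scrub_parity_put() above drops the initial reference taken
 * when the sparity was allocated; every scrub_block built for this
 * stripe holds its own reference, so scrub_parity_check_and_repair()
 * only runs once the last outstanding read has completed.
 */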
ret : 0; 2957 } 2958 2959 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, 2960 struct map_lookup *map, 2961 struct btrfs_device *scrub_dev, 2962 int num, u64 base, u64 length, 2963 int is_dev_replace) 2964 { 2965 struct btrfs_path *path, *ppath; 2966 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 2967 struct btrfs_root *root = fs_info->extent_root; 2968 struct btrfs_root *csum_root = fs_info->csum_root; 2969 struct btrfs_extent_item *extent; 2970 struct blk_plug plug; 2971 u64 flags; 2972 int ret; 2973 int slot; 2974 u64 nstripes; 2975 struct extent_buffer *l; 2976 struct btrfs_key key; 2977 u64 physical; 2978 u64 logical; 2979 u64 logic_end; 2980 u64 physical_end; 2981 u64 generation; 2982 int mirror_num; 2983 struct reada_control *reada1; 2984 struct reada_control *reada2; 2985 struct btrfs_key key_start; 2986 struct btrfs_key key_end; 2987 u64 increment = map->stripe_len; 2988 u64 offset; 2989 u64 extent_logical; 2990 u64 extent_physical; 2991 u64 extent_len; 2992 u64 stripe_logical; 2993 u64 stripe_end; 2994 struct btrfs_device *extent_dev; 2995 int extent_mirror_num; 2996 int stop_loop = 0; 2997 2998 nstripes = length; 2999 physical = map->stripes[num].physical; 3000 offset = 0; 3001 do_div(nstripes, map->stripe_len); 3002 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3003 offset = map->stripe_len * num; 3004 increment = map->stripe_len * map->num_stripes; 3005 mirror_num = 1; 3006 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3007 int factor = map->num_stripes / map->sub_stripes; 3008 offset = map->stripe_len * (num / map->sub_stripes); 3009 increment = map->stripe_len * factor; 3010 mirror_num = num % map->sub_stripes + 1; 3011 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3012 increment = map->stripe_len; 3013 mirror_num = num % map->num_stripes + 1; 3014 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3015 increment = map->stripe_len; 3016 mirror_num = num % map->num_stripes + 1; 3017 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3018 get_raid56_logic_offset(physical, num, map, &offset, NULL); 3019 increment = map->stripe_len * nr_data_stripes(map); 3020 mirror_num = 1; 3021 } else { 3022 increment = map->stripe_len; 3023 mirror_num = 1; 3024 } 3025 3026 path = btrfs_alloc_path(); 3027 if (!path) 3028 return -ENOMEM; 3029 3030 ppath = btrfs_alloc_path(); 3031 if (!ppath) { 3032 btrfs_free_path(path); 3033 return -ENOMEM; 3034 } 3035 3036 /* 3037 * work on commit root. The related disk blocks are static as 3038 * long as COW is applied. This means, it is save to rewrite 3039 * them to repair disk errors without any race conditions 3040 */ 3041 path->search_commit_root = 1; 3042 path->skip_locking = 1; 3043 3044 ppath->search_commit_root = 1; 3045 ppath->skip_locking = 1; 3046 /* 3047 * trigger the readahead for extent tree csum tree and wait for 3048 * completion. 
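 * The readahead spans the logical range of this device stripe,
 * [logical, logic_end), in both the extent tree and the csum tree.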
During readahead, the scrub is officially paused 3049 * to not hold off transaction commits 3050 */ 3051 logical = base + offset; 3052 physical_end = physical + nstripes * map->stripe_len; 3053 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3054 get_raid56_logic_offset(physical_end, num, 3055 map, &logic_end, NULL); 3056 logic_end += base; 3057 } else { 3058 logic_end = logical + increment * nstripes; 3059 } 3060 wait_event(sctx->list_wait, 3061 atomic_read(&sctx->bios_in_flight) == 0); 3062 scrub_blocked_if_needed(fs_info); 3063 3064 /* FIXME it might be better to start readahead at commit root */ 3065 key_start.objectid = logical; 3066 key_start.type = BTRFS_EXTENT_ITEM_KEY; 3067 key_start.offset = (u64)0; 3068 key_end.objectid = logic_end; 3069 key_end.type = BTRFS_METADATA_ITEM_KEY; 3070 key_end.offset = (u64)-1; 3071 reada1 = btrfs_reada_add(root, &key_start, &key_end); 3072 3073 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 3074 key_start.type = BTRFS_EXTENT_CSUM_KEY; 3075 key_start.offset = logical; 3076 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 3077 key_end.type = BTRFS_EXTENT_CSUM_KEY; 3078 key_end.offset = logic_end; 3079 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); 3080 3081 if (!IS_ERR(reada1)) 3082 btrfs_reada_wait(reada1); 3083 if (!IS_ERR(reada2)) 3084 btrfs_reada_wait(reada2); 3085 3086 3087 /* 3088 * collect all data csums for the stripe to avoid seeking during 3089 * the scrub. This might currently (crc32) end up to be about 1MB 3090 */ 3091 blk_start_plug(&plug); 3092 3093 /* 3094 * now find all extents for each stripe and scrub them 3095 */ 3096 ret = 0; 3097 while (physical < physical_end) { 3098 /* for raid56, we skip parity stripe */ 3099 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3100 ret = get_raid56_logic_offset(physical, num, 3101 map, &logical, &stripe_logical); 3102 logical += base; 3103 if (ret) { 3104 stripe_logical += base; 3105 stripe_end = stripe_logical + increment - 1; 3106 ret = scrub_raid56_parity(sctx, map, scrub_dev, 3107 ppath, stripe_logical, 3108 stripe_end); 3109 if (ret) 3110 goto out; 3111 goto skip; 3112 } 3113 } 3114 /* 3115 * canceled? 
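 * Both a cancellation of all scrubs on this filesystem
 * (scrub_cancel_req) and a cancellation of this scrub context
 * (cancel_req) end the stripe walk with -ECANCELED.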
3116 */ 3117 if (atomic_read(&fs_info->scrub_cancel_req) || 3118 atomic_read(&sctx->cancel_req)) { 3119 ret = -ECANCELED; 3120 goto out; 3121 } 3122 /* 3123 * check to see if we have to pause 3124 */ 3125 if (atomic_read(&fs_info->scrub_pause_req)) { 3126 /* push queued extents */ 3127 atomic_set(&sctx->wr_ctx.flush_all_writes, 1); 3128 scrub_submit(sctx); 3129 mutex_lock(&sctx->wr_ctx.wr_lock); 3130 scrub_wr_submit(sctx); 3131 mutex_unlock(&sctx->wr_ctx.wr_lock); 3132 wait_event(sctx->list_wait, 3133 atomic_read(&sctx->bios_in_flight) == 0); 3134 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 3135 scrub_blocked_if_needed(fs_info); 3136 } 3137 3138 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) 3139 key.type = BTRFS_METADATA_ITEM_KEY; 3140 else 3141 key.type = BTRFS_EXTENT_ITEM_KEY; 3142 key.objectid = logical; 3143 key.offset = (u64)-1; 3144 3145 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3146 if (ret < 0) 3147 goto out; 3148 3149 if (ret > 0) { 3150 ret = btrfs_previous_extent_item(root, path, 0); 3151 if (ret < 0) 3152 goto out; 3153 if (ret > 0) { 3154 /* there's no smaller item, so stick with the 3155 * larger one */ 3156 btrfs_release_path(path); 3157 ret = btrfs_search_slot(NULL, root, &key, 3158 path, 0, 0); 3159 if (ret < 0) 3160 goto out; 3161 } 3162 } 3163 3164 stop_loop = 0; 3165 while (1) { 3166 u64 bytes; 3167 3168 l = path->nodes[0]; 3169 slot = path->slots[0]; 3170 if (slot >= btrfs_header_nritems(l)) { 3171 ret = btrfs_next_leaf(root, path); 3172 if (ret == 0) 3173 continue; 3174 if (ret < 0) 3175 goto out; 3176 3177 stop_loop = 1; 3178 break; 3179 } 3180 btrfs_item_key_to_cpu(l, &key, slot); 3181 3182 if (key.type == BTRFS_METADATA_ITEM_KEY) 3183 bytes = root->nodesize; 3184 else 3185 bytes = key.offset; 3186 3187 if (key.objectid + bytes <= logical) 3188 goto next; 3189 3190 if (key.type != BTRFS_EXTENT_ITEM_KEY && 3191 key.type != BTRFS_METADATA_ITEM_KEY) 3192 goto next; 3193 3194 if (key.objectid >= logical + map->stripe_len) { 3195 /* out of this device extent */ 3196 if (key.objectid >= logic_end) 3197 stop_loop = 1; 3198 break; 3199 } 3200 3201 extent = btrfs_item_ptr(l, slot, 3202 struct btrfs_extent_item); 3203 flags = btrfs_extent_flags(l, extent); 3204 generation = btrfs_extent_generation(l, extent); 3205 3206 if (key.objectid < logical && 3207 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { 3208 btrfs_err(fs_info, 3209 "scrub: tree block %llu spanning " 3210 "stripes, ignored. 
logical=%llu", 3211 key.objectid, logical); 3212 goto next; 3213 } 3214 3215 again: 3216 extent_logical = key.objectid; 3217 extent_len = bytes; 3218 3219 /* 3220 * trim extent to this stripe 3221 */ 3222 if (extent_logical < logical) { 3223 extent_len -= logical - extent_logical; 3224 extent_logical = logical; 3225 } 3226 if (extent_logical + extent_len > 3227 logical + map->stripe_len) { 3228 extent_len = logical + map->stripe_len - 3229 extent_logical; 3230 } 3231 3232 extent_physical = extent_logical - logical + physical; 3233 extent_dev = scrub_dev; 3234 extent_mirror_num = mirror_num; 3235 if (is_dev_replace) 3236 scrub_remap_extent(fs_info, extent_logical, 3237 extent_len, &extent_physical, 3238 &extent_dev, 3239 &extent_mirror_num); 3240 3241 ret = btrfs_lookup_csums_range(csum_root, logical, 3242 logical + map->stripe_len - 1, 3243 &sctx->csum_list, 1); 3244 if (ret) 3245 goto out; 3246 3247 ret = scrub_extent(sctx, extent_logical, extent_len, 3248 extent_physical, extent_dev, flags, 3249 generation, extent_mirror_num, 3250 extent_logical - logical + physical); 3251 if (ret) 3252 goto out; 3253 3254 scrub_free_csums(sctx); 3255 if (extent_logical + extent_len < 3256 key.objectid + bytes) { 3257 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 3258 /* 3259 * loop until we find next data stripe 3260 * or we have finished all stripes. 3261 */ 3262 loop: 3263 physical += map->stripe_len; 3264 ret = get_raid56_logic_offset(physical, 3265 num, map, &logical, 3266 &stripe_logical); 3267 logical += base; 3268 3269 if (ret && physical < physical_end) { 3270 stripe_logical += base; 3271 stripe_end = stripe_logical + 3272 increment - 1; 3273 ret = scrub_raid56_parity(sctx, 3274 map, scrub_dev, ppath, 3275 stripe_logical, 3276 stripe_end); 3277 if (ret) 3278 goto out; 3279 goto loop; 3280 } 3281 } else { 3282 physical += map->stripe_len; 3283 logical += increment; 3284 } 3285 if (logical < key.objectid + bytes) { 3286 cond_resched(); 3287 goto again; 3288 } 3289 3290 if (physical >= physical_end) { 3291 stop_loop = 1; 3292 break; 3293 } 3294 } 3295 next: 3296 path->slots[0]++; 3297 } 3298 btrfs_release_path(path); 3299 skip: 3300 logical += increment; 3301 physical += map->stripe_len; 3302 spin_lock(&sctx->stat_lock); 3303 if (stop_loop) 3304 sctx->stat.last_physical = map->stripes[num].physical + 3305 length; 3306 else 3307 sctx->stat.last_physical = physical; 3308 spin_unlock(&sctx->stat_lock); 3309 if (stop_loop) 3310 break; 3311 } 3312 out: 3313 /* push queued extents */ 3314 scrub_submit(sctx); 3315 mutex_lock(&sctx->wr_ctx.wr_lock); 3316 scrub_wr_submit(sctx); 3317 mutex_unlock(&sctx->wr_ctx.wr_lock); 3318 3319 blk_finish_plug(&plug); 3320 btrfs_free_path(path); 3321 btrfs_free_path(ppath); 3322 return ret < 0 ? 
ret : 0; 3323 } 3324 3325 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, 3326 struct btrfs_device *scrub_dev, 3327 u64 chunk_tree, u64 chunk_objectid, 3328 u64 chunk_offset, u64 length, 3329 u64 dev_offset, int is_dev_replace) 3330 { 3331 struct btrfs_mapping_tree *map_tree = 3332 &sctx->dev_root->fs_info->mapping_tree; 3333 struct map_lookup *map; 3334 struct extent_map *em; 3335 int i; 3336 int ret = 0; 3337 3338 read_lock(&map_tree->map_tree.lock); 3339 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); 3340 read_unlock(&map_tree->map_tree.lock); 3341 3342 if (!em) 3343 return -EINVAL; 3344 3345 map = (struct map_lookup *)em->bdev; 3346 if (em->start != chunk_offset) 3347 goto out; 3348 3349 if (em->len < length) 3350 goto out; 3351 3352 for (i = 0; i < map->num_stripes; ++i) { 3353 if (map->stripes[i].dev->bdev == scrub_dev->bdev && 3354 map->stripes[i].physical == dev_offset) { 3355 ret = scrub_stripe(sctx, map, scrub_dev, i, 3356 chunk_offset, length, 3357 is_dev_replace); 3358 if (ret) 3359 goto out; 3360 } 3361 } 3362 out: 3363 free_extent_map(em); 3364 3365 return ret; 3366 } 3367 3368 static noinline_for_stack 3369 int scrub_enumerate_chunks(struct scrub_ctx *sctx, 3370 struct btrfs_device *scrub_dev, u64 start, u64 end, 3371 int is_dev_replace) 3372 { 3373 struct btrfs_dev_extent *dev_extent = NULL; 3374 struct btrfs_path *path; 3375 struct btrfs_root *root = sctx->dev_root; 3376 struct btrfs_fs_info *fs_info = root->fs_info; 3377 u64 length; 3378 u64 chunk_tree; 3379 u64 chunk_objectid; 3380 u64 chunk_offset; 3381 int ret; 3382 int slot; 3383 struct extent_buffer *l; 3384 struct btrfs_key key; 3385 struct btrfs_key found_key; 3386 struct btrfs_block_group_cache *cache; 3387 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 3388 3389 path = btrfs_alloc_path(); 3390 if (!path) 3391 return -ENOMEM; 3392 3393 path->reada = 2; 3394 path->search_commit_root = 1; 3395 path->skip_locking = 1; 3396 3397 key.objectid = scrub_dev->devid; 3398 key.offset = 0ull; 3399 key.type = BTRFS_DEV_EXTENT_KEY; 3400 3401 while (1) { 3402 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3403 if (ret < 0) 3404 break; 3405 if (ret > 0) { 3406 if (path->slots[0] >= 3407 btrfs_header_nritems(path->nodes[0])) { 3408 ret = btrfs_next_leaf(root, path); 3409 if (ret) 3410 break; 3411 } 3412 } 3413 3414 l = path->nodes[0]; 3415 slot = path->slots[0]; 3416 3417 btrfs_item_key_to_cpu(l, &found_key, slot); 3418 3419 if (found_key.objectid != scrub_dev->devid) 3420 break; 3421 3422 if (found_key.type != BTRFS_DEV_EXTENT_KEY) 3423 break; 3424 3425 if (found_key.offset >= end) 3426 break; 3427 3428 if (found_key.offset < key.offset) 3429 break; 3430 3431 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 3432 length = btrfs_dev_extent_length(l, dev_extent); 3433 3434 if (found_key.offset + length <= start) 3435 goto skip; 3436 3437 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 3438 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 3439 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 3440 3441 /* 3442 * get a reference on the corresponding block group to prevent 3443 * the chunk from going away while we scrub it 3444 */ 3445 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3446 3447 /* some chunks are removed but not committed to disk yet, 3448 * continue scrubbing */ 3449 if (!cache) 3450 goto skip; 3451 3452 dev_replace->cursor_right = found_key.offset + length; 3453 dev_replace->cursor_left = 
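/*
 * Each dev extent found in this loop maps back to one chunk; the
 * block group reference taken above keeps that chunk from being
 * removed while scrub_chunk() walks its stripes.  For dev-replace the
 * cursors published here record how far the copy has progressed, so
 * that ongoing writes to the already-copied range are also applied to
 * the replace target.
 */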
found_key.offset; 3454 dev_replace->item_needs_writeback = 1; 3455 ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid, 3456 chunk_offset, length, found_key.offset, 3457 is_dev_replace); 3458 3459 /* 3460 * flush, submit all pending read and write bios, afterwards 3461 * wait for them. 3462 * Note that in the dev replace case, a read request causes 3463 * write requests that are submitted in the read completion 3464 * worker. Therefore in the current situation, it is required 3465 * that all write requests are flushed, so that all read and 3466 * write requests are really completed when bios_in_flight 3467 * changes to 0. 3468 */ 3469 atomic_set(&sctx->wr_ctx.flush_all_writes, 1); 3470 scrub_submit(sctx); 3471 mutex_lock(&sctx->wr_ctx.wr_lock); 3472 scrub_wr_submit(sctx); 3473 mutex_unlock(&sctx->wr_ctx.wr_lock); 3474 3475 wait_event(sctx->list_wait, 3476 atomic_read(&sctx->bios_in_flight) == 0); 3477 atomic_inc(&fs_info->scrubs_paused); 3478 wake_up(&fs_info->scrub_pause_wait); 3479 3480 /* 3481 * must be called before we decrease @scrub_paused. 3482 * make sure we don't block transaction commit while 3483 * we are waiting pending workers finished. 3484 */ 3485 wait_event(sctx->list_wait, 3486 atomic_read(&sctx->workers_pending) == 0); 3487 atomic_set(&sctx->wr_ctx.flush_all_writes, 0); 3488 3489 mutex_lock(&fs_info->scrub_lock); 3490 __scrub_blocked_if_needed(fs_info); 3491 atomic_dec(&fs_info->scrubs_paused); 3492 mutex_unlock(&fs_info->scrub_lock); 3493 wake_up(&fs_info->scrub_pause_wait); 3494 3495 btrfs_put_block_group(cache); 3496 if (ret) 3497 break; 3498 if (is_dev_replace && 3499 atomic64_read(&dev_replace->num_write_errors) > 0) { 3500 ret = -EIO; 3501 break; 3502 } 3503 if (sctx->stat.malloc_errors > 0) { 3504 ret = -ENOMEM; 3505 break; 3506 } 3507 3508 dev_replace->cursor_left = dev_replace->cursor_right; 3509 dev_replace->item_needs_writeback = 1; 3510 skip: 3511 key.offset = found_key.offset + length; 3512 btrfs_release_path(path); 3513 } 3514 3515 btrfs_free_path(path); 3516 3517 /* 3518 * ret can still be 1 from search_slot or next_leaf, 3519 * that's not an error 3520 */ 3521 return ret < 0 ? ret : 0; 3522 } 3523 3524 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, 3525 struct btrfs_device *scrub_dev) 3526 { 3527 int i; 3528 u64 bytenr; 3529 u64 gen; 3530 int ret; 3531 struct btrfs_root *root = sctx->dev_root; 3532 3533 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 3534 return -EIO; 3535 3536 /* Seed devices of a new filesystem has their own generation. */ 3537 if (scrub_dev->fs_devices != root->fs_info->fs_devices) 3538 gen = scrub_dev->generation; 3539 else 3540 gen = root->fs_info->last_trans_committed; 3541 3542 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 3543 bytenr = btrfs_sb_offset(i); 3544 if (bytenr + BTRFS_SUPER_INFO_SIZE > 3545 scrub_dev->commit_total_bytes) 3546 break; 3547 3548 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 3549 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, 3550 NULL, 1, bytenr); 3551 if (ret) 3552 return ret; 3553 } 3554 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 3555 3556 return 0; 3557 } 3558 3559 /* 3560 * get a reference count on fs_info->scrub_workers. 
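 * The scrub workqueues (scrub_workers, scrub_wr_completion_workers
 * and scrub_nocow_workers) are created on first use, shared by all
 * concurrently running scrubs and destroyed again when the last user
 * calls scrub_workers_put().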
start worker if necessary 3561 */ 3562 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, 3563 int is_dev_replace) 3564 { 3565 int ret = 0; 3566 int flags = WQ_FREEZABLE | WQ_UNBOUND; 3567 int max_active = fs_info->thread_pool_size; 3568 3569 if (fs_info->scrub_workers_refcnt == 0) { 3570 if (is_dev_replace) 3571 fs_info->scrub_workers = 3572 btrfs_alloc_workqueue("btrfs-scrub", flags, 3573 1, 4); 3574 else 3575 fs_info->scrub_workers = 3576 btrfs_alloc_workqueue("btrfs-scrub", flags, 3577 max_active, 4); 3578 if (!fs_info->scrub_workers) { 3579 ret = -ENOMEM; 3580 goto out; 3581 } 3582 fs_info->scrub_wr_completion_workers = 3583 btrfs_alloc_workqueue("btrfs-scrubwrc", flags, 3584 max_active, 2); 3585 if (!fs_info->scrub_wr_completion_workers) { 3586 ret = -ENOMEM; 3587 goto out; 3588 } 3589 fs_info->scrub_nocow_workers = 3590 btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); 3591 if (!fs_info->scrub_nocow_workers) { 3592 ret = -ENOMEM; 3593 goto out; 3594 } 3595 } 3596 ++fs_info->scrub_workers_refcnt; 3597 out: 3598 return ret; 3599 } 3600 3601 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) 3602 { 3603 if (--fs_info->scrub_workers_refcnt == 0) { 3604 btrfs_destroy_workqueue(fs_info->scrub_workers); 3605 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); 3606 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); 3607 } 3608 WARN_ON(fs_info->scrub_workers_refcnt < 0); 3609 } 3610 3611 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, 3612 u64 end, struct btrfs_scrub_progress *progress, 3613 int readonly, int is_dev_replace) 3614 { 3615 struct scrub_ctx *sctx; 3616 int ret; 3617 struct btrfs_device *dev; 3618 struct rcu_string *name; 3619 3620 if (btrfs_fs_closing(fs_info)) 3621 return -EINVAL; 3622 3623 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { 3624 /* 3625 * in this case scrub is unable to calculate the checksum 3626 * the way scrub is implemented. Do not handle this 3627 * situation at all because it won't ever happen. 
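 * This and the following checks enforce scrub's size assumptions:
 * a node must not be larger than BTRFS_STRIPE_LEN, sectorsize must
 * equal PAGE_SIZE, and both nodesize and sectorsize must fit into a
 * single scrub_block (at most SCRUB_MAX_PAGES_PER_BLOCK pages,
 * i.e. 64K with 4K pages).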
3628 */ 3629 btrfs_err(fs_info, 3630 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails", 3631 fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); 3632 return -EINVAL; 3633 } 3634 3635 if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { 3636 /* not supported for data w/o checksums */ 3637 btrfs_err(fs_info, 3638 "scrub: size assumption sectorsize != PAGE_SIZE " 3639 "(%d != %lu) fails", 3640 fs_info->chunk_root->sectorsize, PAGE_SIZE); 3641 return -EINVAL; 3642 } 3643 3644 if (fs_info->chunk_root->nodesize > 3645 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || 3646 fs_info->chunk_root->sectorsize > 3647 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { 3648 /* 3649 * would exhaust the array bounds of pagev member in 3650 * struct scrub_block 3651 */ 3652 btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize " 3653 "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", 3654 fs_info->chunk_root->nodesize, 3655 SCRUB_MAX_PAGES_PER_BLOCK, 3656 fs_info->chunk_root->sectorsize, 3657 SCRUB_MAX_PAGES_PER_BLOCK); 3658 return -EINVAL; 3659 } 3660 3661 3662 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3663 dev = btrfs_find_device(fs_info, devid, NULL, NULL); 3664 if (!dev || (dev->missing && !is_dev_replace)) { 3665 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3666 return -ENODEV; 3667 } 3668 3669 if (!is_dev_replace && !readonly && !dev->writeable) { 3670 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3671 rcu_read_lock(); 3672 name = rcu_dereference(dev->name); 3673 btrfs_err(fs_info, "scrub: device %s is not writable", 3674 name->str); 3675 rcu_read_unlock(); 3676 return -EROFS; 3677 } 3678 3679 mutex_lock(&fs_info->scrub_lock); 3680 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { 3681 mutex_unlock(&fs_info->scrub_lock); 3682 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3683 return -EIO; 3684 } 3685 3686 btrfs_dev_replace_lock(&fs_info->dev_replace); 3687 if (dev->scrub_device || 3688 (!is_dev_replace && 3689 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) { 3690 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3691 mutex_unlock(&fs_info->scrub_lock); 3692 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3693 return -EINPROGRESS; 3694 } 3695 btrfs_dev_replace_unlock(&fs_info->dev_replace); 3696 3697 ret = scrub_workers_get(fs_info, is_dev_replace); 3698 if (ret) { 3699 mutex_unlock(&fs_info->scrub_lock); 3700 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3701 return ret; 3702 } 3703 3704 sctx = scrub_setup_ctx(dev, is_dev_replace); 3705 if (IS_ERR(sctx)) { 3706 mutex_unlock(&fs_info->scrub_lock); 3707 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3708 scrub_workers_put(fs_info); 3709 return PTR_ERR(sctx); 3710 } 3711 sctx->readonly = readonly; 3712 dev->scrub_device = sctx; 3713 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3714 3715 /* 3716 * checking @scrub_pause_req here, we can avoid 3717 * race between committing transaction and scrubbing. 3718 */ 3719 __scrub_blocked_if_needed(fs_info); 3720 atomic_inc(&fs_info->scrubs_running); 3721 mutex_unlock(&fs_info->scrub_lock); 3722 3723 if (!is_dev_replace) { 3724 /* 3725 * by holding device list mutex, we can 3726 * kick off writing super in log tree sync. 
3727 */ 3728 mutex_lock(&fs_info->fs_devices->device_list_mutex); 3729 ret = scrub_supers(sctx, dev); 3730 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3731 } 3732 3733 if (!ret) 3734 ret = scrub_enumerate_chunks(sctx, dev, start, end, 3735 is_dev_replace); 3736 3737 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); 3738 atomic_dec(&fs_info->scrubs_running); 3739 wake_up(&fs_info->scrub_pause_wait); 3740 3741 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); 3742 3743 if (progress) 3744 memcpy(progress, &sctx->stat, sizeof(*progress)); 3745 3746 mutex_lock(&fs_info->scrub_lock); 3747 dev->scrub_device = NULL; 3748 scrub_workers_put(fs_info); 3749 mutex_unlock(&fs_info->scrub_lock); 3750 3751 scrub_put_ctx(sctx); 3752 3753 return ret; 3754 } 3755 3756 void btrfs_scrub_pause(struct btrfs_root *root) 3757 { 3758 struct btrfs_fs_info *fs_info = root->fs_info; 3759 3760 mutex_lock(&fs_info->scrub_lock); 3761 atomic_inc(&fs_info->scrub_pause_req); 3762 while (atomic_read(&fs_info->scrubs_paused) != 3763 atomic_read(&fs_info->scrubs_running)) { 3764 mutex_unlock(&fs_info->scrub_lock); 3765 wait_event(fs_info->scrub_pause_wait, 3766 atomic_read(&fs_info->scrubs_paused) == 3767 atomic_read(&fs_info->scrubs_running)); 3768 mutex_lock(&fs_info->scrub_lock); 3769 } 3770 mutex_unlock(&fs_info->scrub_lock); 3771 } 3772 3773 void btrfs_scrub_continue(struct btrfs_root *root) 3774 { 3775 struct btrfs_fs_info *fs_info = root->fs_info; 3776 3777 atomic_dec(&fs_info->scrub_pause_req); 3778 wake_up(&fs_info->scrub_pause_wait); 3779 } 3780 3781 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info) 3782 { 3783 mutex_lock(&fs_info->scrub_lock); 3784 if (!atomic_read(&fs_info->scrubs_running)) { 3785 mutex_unlock(&fs_info->scrub_lock); 3786 return -ENOTCONN; 3787 } 3788 3789 atomic_inc(&fs_info->scrub_cancel_req); 3790 while (atomic_read(&fs_info->scrubs_running)) { 3791 mutex_unlock(&fs_info->scrub_lock); 3792 wait_event(fs_info->scrub_pause_wait, 3793 atomic_read(&fs_info->scrubs_running) == 0); 3794 mutex_lock(&fs_info->scrub_lock); 3795 } 3796 atomic_dec(&fs_info->scrub_cancel_req); 3797 mutex_unlock(&fs_info->scrub_lock); 3798 3799 return 0; 3800 } 3801 3802 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, 3803 struct btrfs_device *dev) 3804 { 3805 struct scrub_ctx *sctx; 3806 3807 mutex_lock(&fs_info->scrub_lock); 3808 sctx = dev->scrub_device; 3809 if (!sctx) { 3810 mutex_unlock(&fs_info->scrub_lock); 3811 return -ENOTCONN; 3812 } 3813 atomic_inc(&sctx->cancel_req); 3814 while (dev->scrub_device) { 3815 mutex_unlock(&fs_info->scrub_lock); 3816 wait_event(fs_info->scrub_pause_wait, 3817 dev->scrub_device == NULL); 3818 mutex_lock(&fs_info->scrub_lock); 3819 } 3820 mutex_unlock(&fs_info->scrub_lock); 3821 3822 return 0; 3823 } 3824 3825 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 3826 struct btrfs_scrub_progress *progress) 3827 { 3828 struct btrfs_device *dev; 3829 struct scrub_ctx *sctx = NULL; 3830 3831 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3832 dev = btrfs_find_device(root->fs_info, devid, NULL, NULL); 3833 if (dev) 3834 sctx = dev->scrub_device; 3835 if (sctx) 3836 memcpy(progress, &sctx->stat, sizeof(*progress)); 3837 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3838 3839 return dev ? (sctx ? 
0 : -ENOTCONN) : -ENODEV; 3840 } 3841 3842 static void scrub_remap_extent(struct btrfs_fs_info *fs_info, 3843 u64 extent_logical, u64 extent_len, 3844 u64 *extent_physical, 3845 struct btrfs_device **extent_dev, 3846 int *extent_mirror_num) 3847 { 3848 u64 mapped_length; 3849 struct btrfs_bio *bbio = NULL; 3850 int ret; 3851 3852 mapped_length = extent_len; 3853 ret = btrfs_map_block(fs_info, READ, extent_logical, 3854 &mapped_length, &bbio, 0); 3855 if (ret || !bbio || mapped_length < extent_len || 3856 !bbio->stripes[0].dev->bdev) { 3857 btrfs_put_bbio(bbio); 3858 return; 3859 } 3860 3861 *extent_physical = bbio->stripes[0].physical; 3862 *extent_mirror_num = bbio->mirror_num; 3863 *extent_dev = bbio->stripes[0].dev; 3864 btrfs_put_bbio(bbio); 3865 } 3866 3867 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, 3868 struct scrub_wr_ctx *wr_ctx, 3869 struct btrfs_fs_info *fs_info, 3870 struct btrfs_device *dev, 3871 int is_dev_replace) 3872 { 3873 WARN_ON(wr_ctx->wr_curr_bio != NULL); 3874 3875 mutex_init(&wr_ctx->wr_lock); 3876 wr_ctx->wr_curr_bio = NULL; 3877 if (!is_dev_replace) 3878 return 0; 3879 3880 WARN_ON(!dev->bdev); 3881 wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO, 3882 bio_get_nr_vecs(dev->bdev)); 3883 wr_ctx->tgtdev = dev; 3884 atomic_set(&wr_ctx->flush_all_writes, 0); 3885 return 0; 3886 } 3887 3888 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx) 3889 { 3890 mutex_lock(&wr_ctx->wr_lock); 3891 kfree(wr_ctx->wr_curr_bio); 3892 wr_ctx->wr_curr_bio = NULL; 3893 mutex_unlock(&wr_ctx->wr_lock); 3894 } 3895 3896 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, 3897 int mirror_num, u64 physical_for_dev_replace) 3898 { 3899 struct scrub_copy_nocow_ctx *nocow_ctx; 3900 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 3901 3902 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS); 3903 if (!nocow_ctx) { 3904 spin_lock(&sctx->stat_lock); 3905 sctx->stat.malloc_errors++; 3906 spin_unlock(&sctx->stat_lock); 3907 return -ENOMEM; 3908 } 3909 3910 scrub_pending_trans_workers_inc(sctx); 3911 3912 nocow_ctx->sctx = sctx; 3913 nocow_ctx->logical = logical; 3914 nocow_ctx->len = len; 3915 nocow_ctx->mirror_num = mirror_num; 3916 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 3917 btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper, 3918 copy_nocow_pages_worker, NULL, NULL); 3919 INIT_LIST_HEAD(&nocow_ctx->inodes); 3920 btrfs_queue_work(fs_info->scrub_nocow_workers, 3921 &nocow_ctx->work); 3922 3923 return 0; 3924 } 3925 3926 static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx) 3927 { 3928 struct scrub_copy_nocow_ctx *nocow_ctx = ctx; 3929 struct scrub_nocow_inode *nocow_inode; 3930 3931 nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS); 3932 if (!nocow_inode) 3933 return -ENOMEM; 3934 nocow_inode->inum = inum; 3935 nocow_inode->offset = offset; 3936 nocow_inode->root = root; 3937 list_add_tail(&nocow_inode->list, &nocow_ctx->inodes); 3938 return 0; 3939 } 3940 3941 #define COPY_COMPLETE 1 3942 3943 static void copy_nocow_pages_worker(struct btrfs_work *work) 3944 { 3945 struct scrub_copy_nocow_ctx *nocow_ctx = 3946 container_of(work, struct scrub_copy_nocow_ctx, work); 3947 struct scrub_ctx *sctx = nocow_ctx->sctx; 3948 u64 logical = nocow_ctx->logical; 3949 u64 len = nocow_ctx->len; 3950 int mirror_num = nocow_ctx->mirror_num; 3951 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 3952 int ret; 3953 struct btrfs_trans_handle *trans = NULL; 3954 struct btrfs_fs_info 
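/*
 * This worker handles the dev-replace case for data without a
 * checksum (nodatasum/nocow files): such an extent may be rewritten
 * in place at any time, so it cannot simply be read from disk and
 * copied.  Instead, iterate_inodes_from_logical() finds every inode
 * that references the logical extent and copy_nocow_pages_for_inode()
 * copies the data through the page cache, re-checking under the
 * extent lock that the pages still map to this extent before writing
 * them to the replace target.
 */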
*fs_info; 3955 struct btrfs_path *path; 3956 struct btrfs_root *root; 3957 int not_written = 0; 3958 3959 fs_info = sctx->dev_root->fs_info; 3960 root = fs_info->extent_root; 3961 3962 path = btrfs_alloc_path(); 3963 if (!path) { 3964 spin_lock(&sctx->stat_lock); 3965 sctx->stat.malloc_errors++; 3966 spin_unlock(&sctx->stat_lock); 3967 not_written = 1; 3968 goto out; 3969 } 3970 3971 trans = btrfs_join_transaction(root); 3972 if (IS_ERR(trans)) { 3973 not_written = 1; 3974 goto out; 3975 } 3976 3977 ret = iterate_inodes_from_logical(logical, fs_info, path, 3978 record_inode_for_nocow, nocow_ctx); 3979 if (ret != 0 && ret != -ENOENT) { 3980 btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, " 3981 "phys %llu, len %llu, mir %u, ret %d", 3982 logical, physical_for_dev_replace, len, mirror_num, 3983 ret); 3984 not_written = 1; 3985 goto out; 3986 } 3987 3988 btrfs_end_transaction(trans, root); 3989 trans = NULL; 3990 while (!list_empty(&nocow_ctx->inodes)) { 3991 struct scrub_nocow_inode *entry; 3992 entry = list_first_entry(&nocow_ctx->inodes, 3993 struct scrub_nocow_inode, 3994 list); 3995 list_del_init(&entry->list); 3996 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset, 3997 entry->root, nocow_ctx); 3998 kfree(entry); 3999 if (ret == COPY_COMPLETE) { 4000 ret = 0; 4001 break; 4002 } else if (ret) { 4003 break; 4004 } 4005 } 4006 out: 4007 while (!list_empty(&nocow_ctx->inodes)) { 4008 struct scrub_nocow_inode *entry; 4009 entry = list_first_entry(&nocow_ctx->inodes, 4010 struct scrub_nocow_inode, 4011 list); 4012 list_del_init(&entry->list); 4013 kfree(entry); 4014 } 4015 if (trans && !IS_ERR(trans)) 4016 btrfs_end_transaction(trans, root); 4017 if (not_written) 4018 btrfs_dev_replace_stats_inc(&fs_info->dev_replace. 4019 num_uncorrectable_read_errors); 4020 4021 btrfs_free_path(path); 4022 kfree(nocow_ctx); 4023 4024 scrub_pending_trans_workers_dec(sctx); 4025 } 4026 4027 static int check_extent_to_block(struct inode *inode, u64 start, u64 len, 4028 u64 logical) 4029 { 4030 struct extent_state *cached_state = NULL; 4031 struct btrfs_ordered_extent *ordered; 4032 struct extent_io_tree *io_tree; 4033 struct extent_map *em; 4034 u64 lockstart = start, lockend = start + len - 1; 4035 int ret = 0; 4036 4037 io_tree = &BTRFS_I(inode)->io_tree; 4038 4039 lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); 4040 ordered = btrfs_lookup_ordered_range(inode, lockstart, len); 4041 if (ordered) { 4042 btrfs_put_ordered_extent(ordered); 4043 ret = 1; 4044 goto out_unlock; 4045 } 4046 4047 em = btrfs_get_extent(inode, NULL, 0, start, len, 0); 4048 if (IS_ERR(em)) { 4049 ret = PTR_ERR(em); 4050 goto out_unlock; 4051 } 4052 4053 /* 4054 * This extent does not actually cover the logical extent anymore, 4055 * move on to the next inode. 
4056 */ 4057 if (em->block_start > logical || 4058 em->block_start + em->block_len < logical + len) { 4059 free_extent_map(em); 4060 ret = 1; 4061 goto out_unlock; 4062 } 4063 free_extent_map(em); 4064 4065 out_unlock: 4066 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, 4067 GFP_NOFS); 4068 return ret; 4069 } 4070 4071 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, 4072 struct scrub_copy_nocow_ctx *nocow_ctx) 4073 { 4074 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; 4075 struct btrfs_key key; 4076 struct inode *inode; 4077 struct page *page; 4078 struct btrfs_root *local_root; 4079 struct extent_io_tree *io_tree; 4080 u64 physical_for_dev_replace; 4081 u64 nocow_ctx_logical; 4082 u64 len = nocow_ctx->len; 4083 unsigned long index; 4084 int srcu_index; 4085 int ret = 0; 4086 int err = 0; 4087 4088 key.objectid = root; 4089 key.type = BTRFS_ROOT_ITEM_KEY; 4090 key.offset = (u64)-1; 4091 4092 srcu_index = srcu_read_lock(&fs_info->subvol_srcu); 4093 4094 local_root = btrfs_read_fs_root_no_name(fs_info, &key); 4095 if (IS_ERR(local_root)) { 4096 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); 4097 return PTR_ERR(local_root); 4098 } 4099 4100 key.type = BTRFS_INODE_ITEM_KEY; 4101 key.objectid = inum; 4102 key.offset = 0; 4103 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); 4104 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); 4105 if (IS_ERR(inode)) 4106 return PTR_ERR(inode); 4107 4108 /* Avoid truncate/dio/punch hole.. */ 4109 mutex_lock(&inode->i_mutex); 4110 inode_dio_wait(inode); 4111 4112 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; 4113 io_tree = &BTRFS_I(inode)->io_tree; 4114 nocow_ctx_logical = nocow_ctx->logical; 4115 4116 ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical); 4117 if (ret) { 4118 ret = ret > 0 ? 0 : ret; 4119 goto out; 4120 } 4121 4122 while (len >= PAGE_CACHE_SIZE) { 4123 index = offset >> PAGE_CACHE_SHIFT; 4124 again: 4125 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 4126 if (!page) { 4127 btrfs_err(fs_info, "find_or_create_page() failed"); 4128 ret = -ENOMEM; 4129 goto out; 4130 } 4131 4132 if (PageUptodate(page)) { 4133 if (PageDirty(page)) 4134 goto next_page; 4135 } else { 4136 ClearPageError(page); 4137 err = extent_read_full_page(io_tree, page, 4138 btrfs_get_extent, 4139 nocow_ctx->mirror_num); 4140 if (err) { 4141 ret = err; 4142 goto next_page; 4143 } 4144 4145 lock_page(page); 4146 /* 4147 * If the page has been removed from the page cache, the data 4148 * on it is meaningless: it may be the stale old copy, while 4149 * the new data may have been written into a new page in the 4150 * page cache. 4151 */ 4152 if (page->mapping != inode->i_mapping) { 4153 unlock_page(page); 4154 page_cache_release(page); 4155 goto again; 4156 } 4157 if (!PageUptodate(page)) { 4158 ret = -EIO; 4159 goto next_page; 4160 } 4161 } 4162 4163 ret = check_extent_to_block(inode, offset, len, 4164 nocow_ctx_logical); 4165 if (ret) { 4166 ret = ret > 0 ?
0 : ret; 4167 goto next_page; 4168 } 4169 4170 err = write_page_nocow(nocow_ctx->sctx, 4171 physical_for_dev_replace, page); 4172 if (err) 4173 ret = err; 4174 next_page: 4175 unlock_page(page); 4176 page_cache_release(page); 4177 4178 if (ret) 4179 break; 4180 4181 offset += PAGE_CACHE_SIZE; 4182 physical_for_dev_replace += PAGE_CACHE_SIZE; 4183 nocow_ctx_logical += PAGE_CACHE_SIZE; 4184 len -= PAGE_CACHE_SIZE; 4185 } 4186 ret = COPY_COMPLETE; 4187 out: 4188 mutex_unlock(&inode->i_mutex); 4189 iput(inode); 4190 return ret; 4191 } 4192 4193 static int write_page_nocow(struct scrub_ctx *sctx, 4194 u64 physical_for_dev_replace, struct page *page) 4195 { 4196 struct bio *bio; 4197 struct btrfs_device *dev; 4198 int ret; 4199 4200 dev = sctx->wr_ctx.tgtdev; 4201 if (!dev) 4202 return -EIO; 4203 if (!dev->bdev) { 4204 printk_ratelimited(KERN_WARNING 4205 "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); 4206 return -EIO; 4207 } 4208 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 4209 if (!bio) { 4210 spin_lock(&sctx->stat_lock); 4211 sctx->stat.malloc_errors++; 4212 spin_unlock(&sctx->stat_lock); 4213 return -ENOMEM; 4214 } 4215 bio->bi_iter.bi_size = 0; 4216 bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; 4217 bio->bi_bdev = dev->bdev; 4218 ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); 4219 if (ret != PAGE_CACHE_SIZE) { 4220 leave_with_eio: 4221 bio_put(bio); 4222 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 4223 return -EIO; 4224 } 4225 4226 if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) 4227 goto leave_with_eio; 4228 4229 bio_put(bio); 4230 return 0; 4231 } 4232
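/*
 * Illustrative sketch only, not part of the scrub implementation above:
 * how a caller is expected to bracket work that must not run concurrently
 * with scrub I/O using the btrfs_scrub_pause()/btrfs_scrub_continue() pair
 * defined earlier in this file. The helper name and the callback parameter
 * are hypothetical and exist purely for illustration.
 */
static void __maybe_unused example_with_scrub_paused(struct btrfs_root *root,
			void (*critical_section)(struct btrfs_root *root))
{
	/*
	 * Raise scrub_pause_req and wait until every running scrub has
	 * parked itself (scrubs_paused == scrubs_running).
	 */
	btrfs_scrub_pause(root);

	/* Scrub bios are quiesced here; do the work that must not race. */
	critical_section(root);

	/* Drop the pause request and wake the parked scrub tasks. */
	btrfs_scrub_continue(root);
}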
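/*
 * Illustrative sketch only: one way a caller might interpret the return
 * convention of btrfs_scrub_progress() above, where -ENODEV means the
 * devid was not found and -ENOTCONN means the device exists but has no
 * scrub running on it. The helper name and the message strings are
 * hypothetical.
 */
static void __maybe_unused example_report_scrub_progress(struct btrfs_root *root,
							  u64 devid)
{
	struct btrfs_scrub_progress progress;
	int ret;

	ret = btrfs_scrub_progress(root, devid, &progress);
	if (ret == -ENODEV)
		pr_info("scrub example: devid %llu not found\n", devid);
	else if (ret == -ENOTCONN)
		pr_info("scrub example: no scrub running on devid %llu\n",
			devid);
	else
		pr_info("scrub example: devid %llu, %llu data bytes scrubbed\n",
			devid, progress.data_bytes_scrubbed);
}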