/*
 * Copyright (C) 2011 STRATO.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include "ctree.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "check-integrity.h"
#include "rcu-string.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_block;
struct scrub_dev;

#define SCRUB_PAGES_PER_BIO		16	/* 64k per bio */
#define SCRUB_BIOS_PER_DEV		16	/* 1 MB per device in flight */
#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */

struct scrub_page {
	struct scrub_block	*sblock;
	struct page		*page;
	struct btrfs_device	*dev;
	u64			flags;  /* extent flags */
	u64			generation;
	u64			logical;
	u64			physical;
	struct {
		unsigned int	mirror_num:8;
		unsigned int	have_csum:1;
		unsigned int	io_error:1;
	};
	u8			csum[BTRFS_CSUM_SIZE];
};

struct scrub_bio {
	int			index;
	struct scrub_dev	*sdev;
	struct bio		*bio;
	int			err;
	u64			logical;
	u64			physical;
	struct scrub_page	*pagev[SCRUB_PAGES_PER_BIO];
	int			page_count;
	int			next_free;
	struct btrfs_work	work;
};

struct scrub_block {
	struct scrub_page	pagev[SCRUB_MAX_PAGES_PER_BLOCK];
	int			page_count;
	atomic_t		outstanding_pages;
	atomic_t		ref_count; /* free mem on transition to zero */
	struct scrub_dev	*sdev;
	struct {
		unsigned int	header_error:1;
		unsigned int	checksum_error:1;
		unsigned int	no_io_error_seen:1;
		unsigned int	generation_error:1; /* also sets header_error */
	};
};

struct scrub_dev {
	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
	struct btrfs_device	*dev;
	int			first_free;
	int			curr;
	atomic_t		in_flight;
	atomic_t		fixup_cnt;
	spinlock_t		list_lock;
	wait_queue_head_t	list_wait;
	u16			csum_size;
	struct list_head	csum_list;
	atomic_t		cancel_req;
	int			readonly;
	int			pages_per_bio; /* <= SCRUB_PAGES_PER_BIO */
	u32			sectorsize;
	u32			nodesize;
	u32			leafsize;
	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t		stat_lock;
};

struct scrub_fixup_nodatasum {
	struct scrub_dev	*sdev;
	u64			logical;
	struct btrfs_root	*root;
	struct btrfs_work	work;
	int			mirror_num;
};

struct scrub_warning {
	struct btrfs_path	*path;
	u64			extent_item_size;
	char			*scratch_buf;
	char			*msg_buf;
	const char		*errstr;
	sector_t		sector;
	u64			logical;
	struct btrfs_device	*dev;
	int			msg_bufsize;
	int			scratch_bufsize;
};
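
/*
 * Note on how these structures relate (summarizing the definitions above):
 * a scrub_block describes one logical unit to be verified (a data sector or
 * a tree node/leaf) and owns up to SCRUB_MAX_PAGES_PER_BLOCK scrub_pages.
 * The same scrub_pages are also queued, page by page, into a scrub_bio,
 * which batches up to SCRUB_PAGES_PER_BIO physically contiguous pages into
 * one bio. Assuming 4k pages, that is 64k per bio and, with
 * SCRUB_BIOS_PER_DEV bios, roughly 1 MB in flight per device (matching the
 * comments on the defines above). scrub_block::ref_count frees the block
 * memory on its transition to zero, once the last page I/O has completed.
 */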

static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_dev *sdev,
				     struct btrfs_mapping_tree *map_tree,
				     u64 length, u64 logical,
				     struct scrub_block *sblock);
static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
			       struct scrub_block *sblock, int is_metadata,
			       int have_csum, u8 *csum, u64 generation,
			       u16 csum_size);
static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
					 struct scrub_block *sblock,
					 int is_metadata, int have_csum,
					 const u8 *csum, u64 generation,
					 u16 csum_size);
static void scrub_complete_bio_end_io(struct bio *bio, int err);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good,
					     int force_write);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static int scrub_add_page_to_bio(struct scrub_dev *sdev,
				 struct scrub_page *spage);
static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
		       u64 physical, u64 flags, u64 gen, int mirror_num,
		       u8 *csum, int force);
static void scrub_bio_end_io(struct bio *bio, int err);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);


static void scrub_free_csums(struct scrub_dev *sdev)
{
	while (!list_empty(&sdev->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sdev->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
{
	int i;

	if (!sdev)
		return;

	/* this can happen when scrub is cancelled */
	if (sdev->curr != -1) {
		struct scrub_bio *sbio = sdev->bios[sdev->curr];

		for (i = 0; i < sbio->page_count; i++) {
			BUG_ON(!sbio->pagev[i]);
			BUG_ON(!sbio->pagev[i]->page);
			scrub_block_put(sbio->pagev[i]->sblock);
		}
		bio_put(sbio->bio);
	}

	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio = sdev->bios[i];

		if (!sbio)
			break;
		kfree(sbio);
	}

	scrub_free_csums(sdev);
	kfree(sdev);
}

static noinline_for_stack
struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
{
	struct scrub_dev *sdev;
	int i;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
	int pages_per_bio;

	pages_per_bio = min_t(int, SCRUB_PAGES_PER_BIO,
			      bio_get_nr_vecs(dev->bdev));
	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
	if (!sdev)
		goto nomem;
	sdev->dev = dev;
	sdev->pages_per_bio = pages_per_bio;
	sdev->curr = -1;
	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
		if (!sbio)
			goto nomem;
		sdev->bios[i] = sbio;

		sbio->index = i;
		sbio->sdev = sdev;
		sbio->page_count = 0;
		sbio->work.func = scrub_bio_end_io_worker;

		if (i != SCRUB_BIOS_PER_DEV-1)
			sdev->bios[i]->next_free = i + 1;
		else
			sdev->bios[i]->next_free = -1;
	}
	sdev->first_free = 0;
	sdev->nodesize = dev->dev_root->nodesize;
	sdev->leafsize = dev->dev_root->leafsize;
	sdev->sectorsize = dev->dev_root->sectorsize;
	atomic_set(&sdev->in_flight, 0);
	atomic_set(&sdev->fixup_cnt, 0);
	atomic_set(&sdev->cancel_req, 0);
	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
	INIT_LIST_HEAD(&sdev->csum_list);

	spin_lock_init(&sdev->list_lock);
	spin_lock_init(&sdev->stat_lock);
	init_waitqueue_head(&sdev->list_wait);
	return sdev;

nomem:
	scrub_free_dev(sdev);
	return ERR_PTR(-ENOMEM);
}
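
/*
 * Note: the bios of a scrub_dev form a simple free list that is indexed by
 * array position rather than by pointers: first_free names the first
 * available scrub_bio, each bio's next_free names the next one, and -1
 * terminates the list. scrub_setup_dev() above chains all
 * SCRUB_BIOS_PER_DEV bios (0 -> 1 -> ... -> 15 -> -1);
 * scrub_add_page_to_bio() pops from the head under list_lock and
 * scrub_bio_end_io_worker() pushes completed bios back onto it.
 */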

static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
{
	u64 isize;
	u32 nlink;
	int ret;
	int i;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key root_key;

	root_key.objectid = root;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	ret = inode_item_info(inum, 0, local_root, swarn->path);
	if (ret) {
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				    struct btrfs_inode_item);
	isize = btrfs_inode_size(eb, inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	ipath = init_ipath(4096, local_root, swarn->path);
	if (IS_ERR(ipath)) {
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
			"length %llu, links %u (path: %s)\n", swarn->errstr,
			swarn->logical, rcu_str_deref(swarn->dev->name),
			(unsigned long long)swarn->sector, root, inum, offset,
			min(isize - offset, (u64)PAGE_SIZE), nlink,
			(char *)(unsigned long)ipath->fspath->val[i]);

	free_ipath(ipath);
	return 0;

err:
	printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
		"resolving failed with ret=%d\n", swarn->errstr,
		swarn->logical, rcu_str_deref(swarn->dev->name),
		(unsigned long long)swarn->sector, root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}

static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
	struct btrfs_device *dev = sblock->sdev->dev;
	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	unsigned long ptr = 0;
	u64 extent_item_pos;
	u64 flags = 0;
	u64 ref_root;
	u32 item_size;
	u8 ref_level;
	const int bufsize = 4096;
	int ret;

	path = btrfs_alloc_path();

	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
	BUG_ON(sblock->page_count < 1);
	swarn.sector = (sblock->pagev[0].physical) >> 9;
	swarn.logical = sblock->pagev[0].logical;
	swarn.errstr = errstr;
	swarn.dev = dev;
	swarn.msg_bufsize = bufsize;
	swarn.scratch_bufsize = bufsize;

	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
		goto out;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size_nr(eb, path->slots[0]);
	btrfs_release_path(path);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		do {
			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
						      &ref_root, &ref_level);
			printk_in_rcu(KERN_WARNING
				"btrfs: %s at logical %llu on dev %s, "
				"sector %llu: metadata %s (level %d) in tree "
				"%llu\n", errstr, swarn.logical,
				rcu_str_deref(dev->name),
				(unsigned long long)swarn.sector,
				ref_level ? "node" : "leaf",
				ret < 0 ? -1 : ref_level,
				ret < 0 ? -1 : ref_root);
		} while (ret != 1);
	} else {
		swarn.path = path;
		iterate_extent_inodes(fs_info, found_key.objectid,
				      extent_item_pos, 1,
				      scrub_print_warning_inode, &swarn);
	}

out:
	btrfs_free_path(path);
	kfree(swarn.scratch_buf);
	kfree(swarn.msg_buf);
}
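
/*
 * Summary of the reporting path above: extent_from_logical() finds the
 * extent item that covers the bad logical address. For a tree block,
 * tree_backref_for_extent() is looped over all backrefs and one line per
 * owning tree is printed. For a data extent, iterate_extent_inodes()
 * resolves every (root, inode, offset) that references the extent and
 * scrub_print_warning_inode() prints the affected file paths.
 */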

static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
{
	struct page *page = NULL;
	unsigned long index;
	struct scrub_fixup_nodatasum *fixup = ctx;
	int ret;
	int corrected = 0;
	struct btrfs_key key;
	struct inode *inode = NULL;
	u64 end = offset + PAGE_SIZE - 1;
	struct btrfs_root *local_root;

	key.objectid = root;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
	if (IS_ERR(local_root))
		return PTR_ERR(local_root);

	key.type = BTRFS_INODE_ITEM_KEY;
	key.objectid = inum;
	key.offset = 0;
	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	index = offset >> PAGE_CACHE_SHIFT;

	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	if (PageUptodate(page)) {
		struct btrfs_mapping_tree *map_tree;
		if (PageDirty(page)) {
			/*
			 * we need to write the data to the defect sector. the
			 * data that was in that sector is not in memory,
			 * because the page was modified. we must not write the
			 * modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 *       runner to write out that page (might involve
			 *       COW) and see whether the sector is still
			 *       referenced afterwards.
			 *
			 * For the time being, we'll treat this error as
			 * uncorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * there's no dirty page in memory, then.
			 */
			ret = -EIO;
			goto out;
		}
		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
		ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
					fixup->logical, page,
					fixup->mirror_num);
		unlock_page(page);
		corrected = !ret;
	} else {
		/*
		 * we need to get good data first. the general readpage path
		 * will call repair_io_failure for us, we just have to make
		 * sure we read the bad mirror.
		 */
		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
				      EXTENT_DAMAGED, GFP_NOFS);
		if (ret) {
			/* set_extent_bits should give proper error */
			WARN_ON(ret > 0);
			if (ret > 0)
				ret = -EFAULT;
			goto out;
		}

		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
					    btrfs_get_extent,
					    fixup->mirror_num);
		wait_on_page_locked(page);

		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
					    end, EXTENT_DAMAGED, 0, NULL);
		if (!corrected)
			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
					  EXTENT_DAMAGED, GFP_NOFS);
	}

out:
	if (page)
		put_page(page);
	if (inode)
		iput(inode);

	if (ret < 0)
		return ret;

	if (ret == 0 && corrected) {
		/*
		 * we only need to call readpage for one of the inodes belonging
		 * to this extent. so make iterate_extent_inodes stop
		 */
		return 1;
	}

	return -EIO;
}
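
/*
 * Note on the EXTENT_DAMAGED bit used above: the range is marked damaged
 * before the forced read of the bad mirror is submitted; the generic
 * read/repair path is expected to clear the bit once it has rewritten the
 * sector from a good copy. Testing the bit afterwards therefore tells
 * whether the fixup actually happened; if not, the bit is cleared again
 * here so it does not linger in the io_tree.
 */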

static void scrub_fixup_nodatasum(struct btrfs_work *work)
{
	int ret;
	struct scrub_fixup_nodatasum *fixup;
	struct scrub_dev *sdev;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	int uncorrectable = 0;

	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
	sdev = fixup->sdev;
	fs_info = fixup->root->fs_info;

	path = btrfs_alloc_path();
	if (!path) {
		spin_lock(&sdev->stat_lock);
		++sdev->stat.malloc_errors;
		spin_unlock(&sdev->stat_lock);
		uncorrectable = 1;
		goto out;
	}

	trans = btrfs_join_transaction(fixup->root);
	if (IS_ERR(trans)) {
		uncorrectable = 1;
		goto out;
	}

	/*
	 * the idea is to trigger a regular read through the standard path. we
	 * read a page from the (failed) logical address by specifying the
	 * corresponding copynum of the failed sector. thus, that readpage is
	 * expected to fail.
	 * that is the point where on-the-fly error correction will kick in
	 * (once it's finished) and rewrite the failed sector if a good copy
	 * can be found.
	 */
	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
					  path, scrub_fixup_readpage,
					  fixup);
	if (ret < 0) {
		uncorrectable = 1;
		goto out;
	}
	WARN_ON(ret != 1);

	spin_lock(&sdev->stat_lock);
	++sdev->stat.corrected_errors;
	spin_unlock(&sdev->stat_lock);

out:
	if (trans && !IS_ERR(trans))
		btrfs_end_transaction(trans, fixup->root);
	if (uncorrectable) {
		spin_lock(&sdev->stat_lock);
		++sdev->stat.uncorrectable_errors;
		spin_unlock(&sdev->stat_lock);

		printk_ratelimited_in_rcu(KERN_ERR
			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
			(unsigned long long)fixup->logical,
			rcu_str_deref(sdev->dev->name));
	}

	btrfs_free_path(path);
	kfree(fixup);

	/* see the caller for why we're pretending to be paused in the scrub counters */
	mutex_lock(&fs_info->scrub_lock);
	atomic_dec(&fs_info->scrubs_running);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	atomic_dec(&sdev->fixup_cnt);
	wake_up(&fs_info->scrub_pause_wait);
	wake_up(&sdev->list_wait);
}
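
/*
 * The counter dance at the end of scrub_fixup_nodatasum() mirrors what the
 * caller did when it queued this work item: scrubs_running was incremented
 * so that a cancel request keeps waiting for the fixup to finish, and
 * scrubs_paused was incremented because the worker joins a transaction and
 * must not hold up a pause request issued for a transaction commit (see the
 * comment in scrub_handle_errored_block() below).
 */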

/*
 * scrub_handle_errored_block gets called when either verification of the
 * pages failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all pages in the bio, even though only one
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the mirrors.
 */
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
	struct scrub_dev *sdev = sblock_to_check->sdev;
	struct btrfs_fs_info *fs_info;
	u64 length;
	u64 logical;
	u64 generation;
	unsigned int failed_mirror_index;
	unsigned int is_metadata;
	unsigned int have_csum;
	u8 *csum;
	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
	struct scrub_block *sblock_bad;
	int ret;
	int mirror_index;
	int page_num;
	int success;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	BUG_ON(sblock_to_check->page_count < 1);
	fs_info = sdev->dev->dev_root->fs_info;
	length = sblock_to_check->page_count * PAGE_SIZE;
	logical = sblock_to_check->pagev[0].logical;
	generation = sblock_to_check->pagev[0].generation;
	BUG_ON(sblock_to_check->pagev[0].mirror_num < 1);
	failed_mirror_index = sblock_to_check->pagev[0].mirror_num - 1;
	is_metadata = !(sblock_to_check->pagev[0].flags &
			BTRFS_EXTENT_FLAG_DATA);
	have_csum = sblock_to_check->pagev[0].have_csum;
	csum = sblock_to_check->pagev[0].csum;

	/*
	 * read all mirrors one after the other. This includes re-reading
	 * the extent or metadata block that failed (that was
	 * the cause that this fixup code is called) another time,
	 * page by page this time in order to know which pages
	 * caused I/O errors and which ones are good (for all mirrors).
	 * It is the goal to handle the situation when more than one
	 * mirror contains I/O errors, but the errors do not
	 * overlap, i.e. the data can be repaired by selecting the
	 * pages from those mirrors without I/O error on the
	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
	 * would be that mirror #1 has an I/O error on the first page,
	 * the second page is good, and mirror #2 has an I/O error on
	 * the second page, but the first page is good.
	 * Then the first page of the first mirror can be repaired by
	 * taking the first page of the second mirror, and the
	 * second page of the second mirror can be repaired by
	 * copying the contents of the 2nd page of the 1st mirror.
	 * One more note: if the pages of one mirror contain I/O
	 * errors, the checksum cannot be verified. In order to get
	 * the best data for repairing, the first attempt is to find
	 * a mirror without I/O errors and with a validated checksum.
	 * Only if this is not possible, the pages are picked from
	 * mirrors with I/O errors without considering the checksum.
	 * If the latter is the case, at the end, the checksum of the
	 * repaired area is verified in order to correctly maintain
	 * the statistics.
	 */
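
	/*
	 * To illustrate the non-overlapping example from the comment above,
	 * with a block made of two PAGE_SIZE pages:
	 *
	 *               page 0   page 1
	 *   mirror #1    EIO      good     ->  contributes page 1
	 *   mirror #2    good     EIO      ->  contributes page 0
	 *
	 * Neither mirror is usable on its own, but combining the good pages
	 * yields a block that can be checksum-verified and written back.
	 */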

	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
				      sizeof(*sblocks_for_recheck),
				      GFP_NOFS);
	if (!sblocks_for_recheck) {
		spin_lock(&sdev->stat_lock);
		sdev->stat.malloc_errors++;
		sdev->stat.read_errors++;
		sdev->stat.uncorrectable_errors++;
		spin_unlock(&sdev->stat_lock);
		btrfs_dev_stat_inc_and_print(sdev->dev,
					     BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}

	/* setup the context, map the logical blocks and alloc the pages */
	ret = scrub_setup_recheck_block(sdev, &fs_info->mapping_tree, length,
					logical, sblocks_for_recheck);
	if (ret) {
		spin_lock(&sdev->stat_lock);
		sdev->stat.read_errors++;
		sdev->stat.uncorrectable_errors++;
		spin_unlock(&sdev->stat_lock);
		btrfs_dev_stat_inc_and_print(sdev->dev,
					     BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}
	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
	sblock_bad = sblocks_for_recheck + failed_mirror_index;

	/* build and submit the bios for the failed mirror, check checksums */
	ret = scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
				  csum, generation, sdev->csum_size);
	if (ret) {
		spin_lock(&sdev->stat_lock);
		sdev->stat.read_errors++;
		sdev->stat.uncorrectable_errors++;
		spin_unlock(&sdev->stat_lock);
		btrfs_dev_stat_inc_and_print(sdev->dev,
					     BTRFS_DEV_STAT_READ_ERRS);
		goto out;
	}

	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
	    sblock_bad->no_io_error_seen) {
		/*
		 * the error disappeared after reading page by page, or
		 * the area was part of a huge bio and other parts of the
		 * bio caused I/O errors, or the block layer merged several
		 * read requests into one and the error is caused by a
		 * different bio (usually one of the two latter cases is
		 * the cause)
		 */
		spin_lock(&sdev->stat_lock);
		sdev->stat.unverified_errors++;
		spin_unlock(&sdev->stat_lock);

		goto out;
	}

	if (!sblock_bad->no_io_error_seen) {
		spin_lock(&sdev->stat_lock);
		sdev->stat.read_errors++;
		spin_unlock(&sdev->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("i/o error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(sdev->dev,
					     BTRFS_DEV_STAT_READ_ERRS);
	} else if (sblock_bad->checksum_error) {
		spin_lock(&sdev->stat_lock);
		sdev->stat.csum_errors++;
		spin_unlock(&sdev->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum error", sblock_to_check);
		btrfs_dev_stat_inc_and_print(sdev->dev,
					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
	} else if (sblock_bad->header_error) {
		spin_lock(&sdev->stat_lock);
		sdev->stat.verify_errors++;
		spin_unlock(&sdev->stat_lock);
		if (__ratelimit(&_rs))
			scrub_print_warning("checksum/header error",
					    sblock_to_check);
		if (sblock_bad->generation_error)
			btrfs_dev_stat_inc_and_print(sdev->dev,
				BTRFS_DEV_STAT_GENERATION_ERRS);
		else
			btrfs_dev_stat_inc_and_print(sdev->dev,
				BTRFS_DEV_STAT_CORRUPTION_ERRS);
	}

	if (sdev->readonly)
		goto did_not_correct_error;

	if (!is_metadata && !have_csum) {
		struct scrub_fixup_nodatasum *fixup_nodatasum;

		/*
		 * !is_metadata and !have_csum, this means that the data
		 * might not be COW'ed, that it might be modified
		 * concurrently. The general strategy to work on the
		 * commit root does not help in the case when COW is not
		 * used.
		 */
		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
		if (!fixup_nodatasum)
			goto did_not_correct_error;
		fixup_nodatasum->sdev = sdev;
		fixup_nodatasum->logical = logical;
		fixup_nodatasum->root = fs_info->extent_root;
		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
		/*
		 * increment scrubs_running to prevent cancel requests from
		 * completing as long as a fixup worker is running. we must
		 * also increment scrubs_paused to prevent deadlocking on
		 * pause requests used for transaction commits (as the worker
		 * uses a transaction context). it is safe to regard the fixup
		 * worker as paused for all practical matters. effectively, we
		 * only avoid cancellation requests from completing.
		 */
		mutex_lock(&fs_info->scrub_lock);
		atomic_inc(&fs_info->scrubs_running);
		atomic_inc(&fs_info->scrubs_paused);
		mutex_unlock(&fs_info->scrub_lock);
		atomic_inc(&sdev->fixup_cnt);
		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
		btrfs_queue_worker(&fs_info->scrub_workers,
				   &fixup_nodatasum->work);
		goto out;
	}
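
	/*
	 * Repair strategy from here on, in order of preference:
	 *  1. find a mirror that has neither I/O nor checksum/header errors
	 *     and copy the whole block from it (force_write when metadata or
	 *     a checksum is available);
	 *  2. otherwise, replace only the pages with I/O errors, taking each
	 *     page from any mirror that could read it;
	 *  3. finally re-verify the checksum of the repaired block so that
	 *     the corrected/uncorrectable statistics stay accurate.
	 */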

	/*
	 * now build and submit the bios for the other mirrors, check
	 * checksums
	 */
	for (mirror_index = 0;
	     mirror_index < BTRFS_MAX_MIRRORS &&
	     sblocks_for_recheck[mirror_index].page_count > 0;
	     mirror_index++) {
		if (mirror_index == failed_mirror_index)
			continue;

		/* build and submit the bios, check checksums */
		ret = scrub_recheck_block(fs_info,
					  sblocks_for_recheck + mirror_index,
					  is_metadata, have_csum, csum,
					  generation, sdev->csum_size);
		if (ret)
			goto did_not_correct_error;
	}

	/*
	 * first try to pick the mirror which is completely without I/O
	 * errors and also does not have a checksum error.
	 * If one is found, and if a checksum is present, the full block
	 * that is known to contain an error is rewritten. Afterwards
	 * the block is known to be corrected.
	 * If a mirror is found which is completely correct, and no
	 * checksum is present, only those pages are rewritten that had
	 * an I/O error in the block to be repaired, since it cannot be
	 * determined which copy of the other pages is better (and it
	 * could happen otherwise that a correct page would be
	 * overwritten by a bad one).
	 */
	for (mirror_index = 0;
	     mirror_index < BTRFS_MAX_MIRRORS &&
	     sblocks_for_recheck[mirror_index].page_count > 0;
	     mirror_index++) {
		struct scrub_block *sblock_other = sblocks_for_recheck +
						   mirror_index;

		if (!sblock_other->header_error &&
		    !sblock_other->checksum_error &&
		    sblock_other->no_io_error_seen) {
			int force_write = is_metadata || have_csum;

			ret = scrub_repair_block_from_good_copy(sblock_bad,
								sblock_other,
								force_write);
			if (0 == ret)
				goto corrected_error;
		}
	}

	/*
	 * in case of I/O errors in the area that is supposed to be
	 * repaired, continue by picking good copies of those pages.
	 * Select the good pages from mirrors to rewrite bad pages from
	 * the area to fix. Afterwards verify the checksum of the block
	 * that is supposed to be repaired. This verification step is
	 * only done for the purpose of statistics counting and for the
	 * final scrub report, whether errors remain.
	 * A perfect algorithm could make use of the checksum and try
	 * all possible combinations of pages from the different mirrors
	 * until the checksum verification succeeds. For example, when
	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
	 * of mirror #2 is readable but the final checksum test fails,
	 * then the 2nd page of mirror #3 could be tried, to see whether
	 * the final checksum then succeeds. But this would be a rare
	 * exception and is therefore not implemented. At least it is
	 * avoided that the good copy is overwritten.
	 * A more useful improvement would be to pick the sectors
	 * without I/O error based on sector sizes (512 bytes on legacy
	 * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
	 * mirror could be repaired by taking 512 byte of a different
	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
	 * area are unreadable.
	 */

	/* can only fix I/O errors from here on */
	if (sblock_bad->no_io_error_seen)
		goto did_not_correct_error;

	success = 1;
	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
		struct scrub_page *page_bad = sblock_bad->pagev + page_num;

		if (!page_bad->io_error)
			continue;

		for (mirror_index = 0;
		     mirror_index < BTRFS_MAX_MIRRORS &&
		     sblocks_for_recheck[mirror_index].page_count > 0;
		     mirror_index++) {
			struct scrub_block *sblock_other = sblocks_for_recheck +
							   mirror_index;
			struct scrub_page *page_other = sblock_other->pagev +
							page_num;

			if (!page_other->io_error) {
				ret = scrub_repair_page_from_good_copy(
					sblock_bad, sblock_other, page_num, 0);
				if (0 == ret) {
					page_bad->io_error = 0;
					break; /* succeeded for this page */
				}
			}
		}

		if (page_bad->io_error) {
			/* did not find a mirror to copy the page from */
			success = 0;
		}
	}

	if (success) {
		if (is_metadata || have_csum) {
			/*
			 * need to verify the checksum now that all
			 * sectors on disk are repaired (the write
			 * request for data to be repaired is on its way).
			 * Just be lazy and use scrub_recheck_block()
			 * which re-reads the data before the checksum
			 * is verified, but most likely the data comes out
			 * of the page cache.
			 */
			ret = scrub_recheck_block(fs_info, sblock_bad,
						  is_metadata, have_csum, csum,
						  generation, sdev->csum_size);
			if (!ret && !sblock_bad->header_error &&
			    !sblock_bad->checksum_error &&
			    sblock_bad->no_io_error_seen)
				goto corrected_error;
			else
				goto did_not_correct_error;
		} else {
corrected_error:
			spin_lock(&sdev->stat_lock);
			sdev->stat.corrected_errors++;
			spin_unlock(&sdev->stat_lock);
			printk_ratelimited_in_rcu(KERN_ERR
				"btrfs: fixed up error at logical %llu on dev %s\n",
				(unsigned long long)logical,
				rcu_str_deref(sdev->dev->name));
		}
	} else {
did_not_correct_error:
		spin_lock(&sdev->stat_lock);
		sdev->stat.uncorrectable_errors++;
		spin_unlock(&sdev->stat_lock);
		printk_ratelimited_in_rcu(KERN_ERR
			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
			(unsigned long long)logical,
			rcu_str_deref(sdev->dev->name));
	}

out:
	if (sblocks_for_recheck) {
		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
		     mirror_index++) {
			struct scrub_block *sblock = sblocks_for_recheck +
						     mirror_index;
			int page_index;

			for (page_index = 0; page_index < SCRUB_PAGES_PER_BIO;
			     page_index++)
				if (sblock->pagev[page_index].page)
					__free_page(
						sblock->pagev[page_index].page);
		}
		kfree(sblocks_for_recheck);
	}

	return 0;
}

static int scrub_setup_recheck_block(struct scrub_dev *sdev,
				     struct btrfs_mapping_tree *map_tree,
				     u64 length, u64 logical,
				     struct scrub_block *sblocks_for_recheck)
{
	int page_index;
	int mirror_index;
	int ret;

	/*
	 * note: the three members sdev, ref_count and outstanding_pages
	 * are not used (and not set) in the blocks that are used for
	 * the recheck procedure
	 */

	page_index = 0;
	while (length > 0) {
		u64 sublen = min_t(u64, length, PAGE_SIZE);
		u64 mapped_length = sublen;
		struct btrfs_bio *bbio = NULL;

		/*
		 * with a length of PAGE_SIZE, each returned stripe
		 * represents one mirror
		 */
		ret = btrfs_map_block(map_tree, WRITE, logical, &mapped_length,
				      &bbio, 0);
		if (ret || !bbio || mapped_length < sublen) {
			kfree(bbio);
			return -EIO;
		}

		BUG_ON(page_index >= SCRUB_PAGES_PER_BIO);
		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
		     mirror_index++) {
			struct scrub_block *sblock;
			struct scrub_page *page;

			if (mirror_index >= BTRFS_MAX_MIRRORS)
				continue;

			sblock = sblocks_for_recheck + mirror_index;
			page = sblock->pagev + page_index;
			page->logical = logical;
			page->physical = bbio->stripes[mirror_index].physical;
			/* for missing devices, dev->bdev is NULL */
			page->dev = bbio->stripes[mirror_index].dev;
			page->mirror_num = mirror_index + 1;
			page->page = alloc_page(GFP_NOFS);
			if (!page->page) {
				spin_lock(&sdev->stat_lock);
				sdev->stat.malloc_errors++;
				spin_unlock(&sdev->stat_lock);
				kfree(bbio);
				return -ENOMEM;
			}
			sblock->page_count++;
		}
		kfree(bbio);
		length -= sublen;
		logical += sublen;
		page_index++;
	}

	return 0;
}

/*
 * this function will check the on disk data for checksum errors, header
 * errors and read I/O errors. If any I/O errors happen, the exact pages
 * which are errored are marked as being bad.
The goal is to enable scrub 1052 * to take those pages that are not errored from all the mirrors so that 1053 * the pages that are errored in the just handled mirror can be repaired. 1054 */ 1055 static int scrub_recheck_block(struct btrfs_fs_info *fs_info, 1056 struct scrub_block *sblock, int is_metadata, 1057 int have_csum, u8 *csum, u64 generation, 1058 u16 csum_size) 1059 { 1060 int page_num; 1061 1062 sblock->no_io_error_seen = 1; 1063 sblock->header_error = 0; 1064 sblock->checksum_error = 0; 1065 1066 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1067 struct bio *bio; 1068 int ret; 1069 struct scrub_page *page = sblock->pagev + page_num; 1070 DECLARE_COMPLETION_ONSTACK(complete); 1071 1072 if (page->dev->bdev == NULL) { 1073 page->io_error = 1; 1074 sblock->no_io_error_seen = 0; 1075 continue; 1076 } 1077 1078 BUG_ON(!page->page); 1079 bio = bio_alloc(GFP_NOFS, 1); 1080 if (!bio) 1081 return -EIO; 1082 bio->bi_bdev = page->dev->bdev; 1083 bio->bi_sector = page->physical >> 9; 1084 bio->bi_end_io = scrub_complete_bio_end_io; 1085 bio->bi_private = &complete; 1086 1087 ret = bio_add_page(bio, page->page, PAGE_SIZE, 0); 1088 if (PAGE_SIZE != ret) { 1089 bio_put(bio); 1090 return -EIO; 1091 } 1092 btrfsic_submit_bio(READ, bio); 1093 1094 /* this will also unplug the queue */ 1095 wait_for_completion(&complete); 1096 1097 page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags); 1098 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1099 sblock->no_io_error_seen = 0; 1100 bio_put(bio); 1101 } 1102 1103 if (sblock->no_io_error_seen) 1104 scrub_recheck_block_checksum(fs_info, sblock, is_metadata, 1105 have_csum, csum, generation, 1106 csum_size); 1107 1108 return 0; 1109 } 1110 1111 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 1112 struct scrub_block *sblock, 1113 int is_metadata, int have_csum, 1114 const u8 *csum, u64 generation, 1115 u16 csum_size) 1116 { 1117 int page_num; 1118 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1119 u32 crc = ~(u32)0; 1120 struct btrfs_root *root = fs_info->extent_root; 1121 void *mapped_buffer; 1122 1123 BUG_ON(!sblock->pagev[0].page); 1124 if (is_metadata) { 1125 struct btrfs_header *h; 1126 1127 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1128 h = (struct btrfs_header *)mapped_buffer; 1129 1130 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) || 1131 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1132 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1133 BTRFS_UUID_SIZE)) { 1134 sblock->header_error = 1; 1135 } else if (generation != le64_to_cpu(h->generation)) { 1136 sblock->header_error = 1; 1137 sblock->generation_error = 1; 1138 } 1139 csum = h->csum; 1140 } else { 1141 if (!have_csum) 1142 return; 1143 1144 mapped_buffer = kmap_atomic(sblock->pagev[0].page); 1145 } 1146 1147 for (page_num = 0;;) { 1148 if (page_num == 0 && is_metadata) 1149 crc = btrfs_csum_data(root, 1150 ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE, 1151 crc, PAGE_SIZE - BTRFS_CSUM_SIZE); 1152 else 1153 crc = btrfs_csum_data(root, mapped_buffer, crc, 1154 PAGE_SIZE); 1155 1156 kunmap_atomic(mapped_buffer); 1157 page_num++; 1158 if (page_num >= sblock->page_count) 1159 break; 1160 BUG_ON(!sblock->pagev[page_num].page); 1161 1162 mapped_buffer = kmap_atomic(sblock->pagev[page_num].page); 1163 } 1164 1165 btrfs_csum_final(crc, calculated_csum); 1166 if (memcmp(calculated_csum, csum, csum_size)) 1167 sblock->checksum_error = 1; 1168 } 1169 1170 static void scrub_complete_bio_end_io(struct bio *bio, int err) 1171 { 1172 complete((struct 
completion *)bio->bi_private); 1173 } 1174 1175 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 1176 struct scrub_block *sblock_good, 1177 int force_write) 1178 { 1179 int page_num; 1180 int ret = 0; 1181 1182 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1183 int ret_sub; 1184 1185 ret_sub = scrub_repair_page_from_good_copy(sblock_bad, 1186 sblock_good, 1187 page_num, 1188 force_write); 1189 if (ret_sub) 1190 ret = ret_sub; 1191 } 1192 1193 return ret; 1194 } 1195 1196 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 1197 struct scrub_block *sblock_good, 1198 int page_num, int force_write) 1199 { 1200 struct scrub_page *page_bad = sblock_bad->pagev + page_num; 1201 struct scrub_page *page_good = sblock_good->pagev + page_num; 1202 1203 BUG_ON(sblock_bad->pagev[page_num].page == NULL); 1204 BUG_ON(sblock_good->pagev[page_num].page == NULL); 1205 if (force_write || sblock_bad->header_error || 1206 sblock_bad->checksum_error || page_bad->io_error) { 1207 struct bio *bio; 1208 int ret; 1209 DECLARE_COMPLETION_ONSTACK(complete); 1210 1211 bio = bio_alloc(GFP_NOFS, 1); 1212 if (!bio) 1213 return -EIO; 1214 bio->bi_bdev = page_bad->dev->bdev; 1215 bio->bi_sector = page_bad->physical >> 9; 1216 bio->bi_end_io = scrub_complete_bio_end_io; 1217 bio->bi_private = &complete; 1218 1219 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); 1220 if (PAGE_SIZE != ret) { 1221 bio_put(bio); 1222 return -EIO; 1223 } 1224 btrfsic_submit_bio(WRITE, bio); 1225 1226 /* this will also unplug the queue */ 1227 wait_for_completion(&complete); 1228 if (!bio_flagged(bio, BIO_UPTODATE)) { 1229 btrfs_dev_stat_inc_and_print(page_bad->dev, 1230 BTRFS_DEV_STAT_WRITE_ERRS); 1231 bio_put(bio); 1232 return -EIO; 1233 } 1234 bio_put(bio); 1235 } 1236 1237 return 0; 1238 } 1239 1240 static void scrub_checksum(struct scrub_block *sblock) 1241 { 1242 u64 flags; 1243 int ret; 1244 1245 BUG_ON(sblock->page_count < 1); 1246 flags = sblock->pagev[0].flags; 1247 ret = 0; 1248 if (flags & BTRFS_EXTENT_FLAG_DATA) 1249 ret = scrub_checksum_data(sblock); 1250 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1251 ret = scrub_checksum_tree_block(sblock); 1252 else if (flags & BTRFS_EXTENT_FLAG_SUPER) 1253 (void)scrub_checksum_super(sblock); 1254 else 1255 WARN_ON(1); 1256 if (ret) 1257 scrub_handle_errored_block(sblock); 1258 } 1259 1260 static int scrub_checksum_data(struct scrub_block *sblock) 1261 { 1262 struct scrub_dev *sdev = sblock->sdev; 1263 u8 csum[BTRFS_CSUM_SIZE]; 1264 u8 *on_disk_csum; 1265 struct page *page; 1266 void *buffer; 1267 u32 crc = ~(u32)0; 1268 int fail = 0; 1269 struct btrfs_root *root = sdev->dev->dev_root; 1270 u64 len; 1271 int index; 1272 1273 BUG_ON(sblock->page_count < 1); 1274 if (!sblock->pagev[0].have_csum) 1275 return 0; 1276 1277 on_disk_csum = sblock->pagev[0].csum; 1278 page = sblock->pagev[0].page; 1279 buffer = kmap_atomic(page); 1280 1281 len = sdev->sectorsize; 1282 index = 0; 1283 for (;;) { 1284 u64 l = min_t(u64, len, PAGE_SIZE); 1285 1286 crc = btrfs_csum_data(root, buffer, crc, l); 1287 kunmap_atomic(buffer); 1288 len -= l; 1289 if (len == 0) 1290 break; 1291 index++; 1292 BUG_ON(index >= sblock->page_count); 1293 BUG_ON(!sblock->pagev[index].page); 1294 page = sblock->pagev[index].page; 1295 buffer = kmap_atomic(page); 1296 } 1297 1298 btrfs_csum_final(crc, csum); 1299 if (memcmp(csum, on_disk_csum, sdev->csum_size)) 1300 fail = 1; 1301 1302 return fail; 1303 } 1304 1305 static int scrub_checksum_tree_block(struct 
scrub_block *sblock) 1306 { 1307 struct scrub_dev *sdev = sblock->sdev; 1308 struct btrfs_header *h; 1309 struct btrfs_root *root = sdev->dev->dev_root; 1310 struct btrfs_fs_info *fs_info = root->fs_info; 1311 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1312 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1313 struct page *page; 1314 void *mapped_buffer; 1315 u64 mapped_size; 1316 void *p; 1317 u32 crc = ~(u32)0; 1318 int fail = 0; 1319 int crc_fail = 0; 1320 u64 len; 1321 int index; 1322 1323 BUG_ON(sblock->page_count < 1); 1324 page = sblock->pagev[0].page; 1325 mapped_buffer = kmap_atomic(page); 1326 h = (struct btrfs_header *)mapped_buffer; 1327 memcpy(on_disk_csum, h->csum, sdev->csum_size); 1328 1329 /* 1330 * we don't use the getter functions here, as we 1331 * a) don't have an extent buffer and 1332 * b) the page is already kmapped 1333 */ 1334 1335 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr)) 1336 ++fail; 1337 1338 if (sblock->pagev[0].generation != le64_to_cpu(h->generation)) 1339 ++fail; 1340 1341 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1342 ++fail; 1343 1344 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1345 BTRFS_UUID_SIZE)) 1346 ++fail; 1347 1348 BUG_ON(sdev->nodesize != sdev->leafsize); 1349 len = sdev->nodesize - BTRFS_CSUM_SIZE; 1350 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1351 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1352 index = 0; 1353 for (;;) { 1354 u64 l = min_t(u64, len, mapped_size); 1355 1356 crc = btrfs_csum_data(root, p, crc, l); 1357 kunmap_atomic(mapped_buffer); 1358 len -= l; 1359 if (len == 0) 1360 break; 1361 index++; 1362 BUG_ON(index >= sblock->page_count); 1363 BUG_ON(!sblock->pagev[index].page); 1364 page = sblock->pagev[index].page; 1365 mapped_buffer = kmap_atomic(page); 1366 mapped_size = PAGE_SIZE; 1367 p = mapped_buffer; 1368 } 1369 1370 btrfs_csum_final(crc, calculated_csum); 1371 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1372 ++crc_fail; 1373 1374 return fail || crc_fail; 1375 } 1376 1377 static int scrub_checksum_super(struct scrub_block *sblock) 1378 { 1379 struct btrfs_super_block *s; 1380 struct scrub_dev *sdev = sblock->sdev; 1381 struct btrfs_root *root = sdev->dev->dev_root; 1382 struct btrfs_fs_info *fs_info = root->fs_info; 1383 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1384 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1385 struct page *page; 1386 void *mapped_buffer; 1387 u64 mapped_size; 1388 void *p; 1389 u32 crc = ~(u32)0; 1390 int fail_gen = 0; 1391 int fail_cor = 0; 1392 u64 len; 1393 int index; 1394 1395 BUG_ON(sblock->page_count < 1); 1396 page = sblock->pagev[0].page; 1397 mapped_buffer = kmap_atomic(page); 1398 s = (struct btrfs_super_block *)mapped_buffer; 1399 memcpy(on_disk_csum, s->csum, sdev->csum_size); 1400 1401 if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr)) 1402 ++fail_cor; 1403 1404 if (sblock->pagev[0].generation != le64_to_cpu(s->generation)) 1405 ++fail_gen; 1406 1407 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1408 ++fail_cor; 1409 1410 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; 1411 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1412 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1413 index = 0; 1414 for (;;) { 1415 u64 l = min_t(u64, len, mapped_size); 1416 1417 crc = btrfs_csum_data(root, p, crc, l); 1418 kunmap_atomic(mapped_buffer); 1419 len -= l; 1420 if (len == 0) 1421 break; 1422 index++; 1423 BUG_ON(index >= sblock->page_count); 1424 BUG_ON(!sblock->pagev[index].page); 1425 page = sblock->pagev[index].page; 1426 mapped_buffer = kmap_atomic(page); 1427 mapped_size = 
PAGE_SIZE; 1428 p = mapped_buffer; 1429 } 1430 1431 btrfs_csum_final(crc, calculated_csum); 1432 if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) 1433 ++fail_cor; 1434 1435 if (fail_cor + fail_gen) { 1436 /* 1437 * if we find an error in a super block, we just report it. 1438 * They will get written with the next transaction commit 1439 * anyway 1440 */ 1441 spin_lock(&sdev->stat_lock); 1442 ++sdev->stat.super_errors; 1443 spin_unlock(&sdev->stat_lock); 1444 if (fail_cor) 1445 btrfs_dev_stat_inc_and_print(sdev->dev, 1446 BTRFS_DEV_STAT_CORRUPTION_ERRS); 1447 else 1448 btrfs_dev_stat_inc_and_print(sdev->dev, 1449 BTRFS_DEV_STAT_GENERATION_ERRS); 1450 } 1451 1452 return fail_cor + fail_gen; 1453 } 1454 1455 static void scrub_block_get(struct scrub_block *sblock) 1456 { 1457 atomic_inc(&sblock->ref_count); 1458 } 1459 1460 static void scrub_block_put(struct scrub_block *sblock) 1461 { 1462 if (atomic_dec_and_test(&sblock->ref_count)) { 1463 int i; 1464 1465 for (i = 0; i < sblock->page_count; i++) 1466 if (sblock->pagev[i].page) 1467 __free_page(sblock->pagev[i].page); 1468 kfree(sblock); 1469 } 1470 } 1471 1472 static void scrub_submit(struct scrub_dev *sdev) 1473 { 1474 struct scrub_bio *sbio; 1475 1476 if (sdev->curr == -1) 1477 return; 1478 1479 sbio = sdev->bios[sdev->curr]; 1480 sdev->curr = -1; 1481 atomic_inc(&sdev->in_flight); 1482 1483 btrfsic_submit_bio(READ, sbio->bio); 1484 } 1485 1486 static int scrub_add_page_to_bio(struct scrub_dev *sdev, 1487 struct scrub_page *spage) 1488 { 1489 struct scrub_block *sblock = spage->sblock; 1490 struct scrub_bio *sbio; 1491 int ret; 1492 1493 again: 1494 /* 1495 * grab a fresh bio or wait for one to become available 1496 */ 1497 while (sdev->curr == -1) { 1498 spin_lock(&sdev->list_lock); 1499 sdev->curr = sdev->first_free; 1500 if (sdev->curr != -1) { 1501 sdev->first_free = sdev->bios[sdev->curr]->next_free; 1502 sdev->bios[sdev->curr]->next_free = -1; 1503 sdev->bios[sdev->curr]->page_count = 0; 1504 spin_unlock(&sdev->list_lock); 1505 } else { 1506 spin_unlock(&sdev->list_lock); 1507 wait_event(sdev->list_wait, sdev->first_free != -1); 1508 } 1509 } 1510 sbio = sdev->bios[sdev->curr]; 1511 if (sbio->page_count == 0) { 1512 struct bio *bio; 1513 1514 sbio->physical = spage->physical; 1515 sbio->logical = spage->logical; 1516 bio = sbio->bio; 1517 if (!bio) { 1518 bio = bio_alloc(GFP_NOFS, sdev->pages_per_bio); 1519 if (!bio) 1520 return -ENOMEM; 1521 sbio->bio = bio; 1522 } 1523 1524 bio->bi_private = sbio; 1525 bio->bi_end_io = scrub_bio_end_io; 1526 bio->bi_bdev = sdev->dev->bdev; 1527 bio->bi_sector = spage->physical >> 9; 1528 sbio->err = 0; 1529 } else if (sbio->physical + sbio->page_count * PAGE_SIZE != 1530 spage->physical || 1531 sbio->logical + sbio->page_count * PAGE_SIZE != 1532 spage->logical) { 1533 scrub_submit(sdev); 1534 goto again; 1535 } 1536 1537 sbio->pagev[sbio->page_count] = spage; 1538 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0); 1539 if (ret != PAGE_SIZE) { 1540 if (sbio->page_count < 1) { 1541 bio_put(sbio->bio); 1542 sbio->bio = NULL; 1543 return -EIO; 1544 } 1545 scrub_submit(sdev); 1546 goto again; 1547 } 1548 1549 scrub_block_get(sblock); /* one for the added page */ 1550 atomic_inc(&sblock->outstanding_pages); 1551 sbio->page_count++; 1552 if (sbio->page_count == sdev->pages_per_bio) 1553 scrub_submit(sdev); 1554 1555 return 0; 1556 } 1557 1558 static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len, 1559 u64 physical, u64 flags, u64 gen, int mirror_num, 1560 u8 *csum, int 
force) 1561 { 1562 struct scrub_block *sblock; 1563 int index; 1564 1565 sblock = kzalloc(sizeof(*sblock), GFP_NOFS); 1566 if (!sblock) { 1567 spin_lock(&sdev->stat_lock); 1568 sdev->stat.malloc_errors++; 1569 spin_unlock(&sdev->stat_lock); 1570 return -ENOMEM; 1571 } 1572 1573 /* one ref inside this function, plus one for each page later on */ 1574 atomic_set(&sblock->ref_count, 1); 1575 sblock->sdev = sdev; 1576 sblock->no_io_error_seen = 1; 1577 1578 for (index = 0; len > 0; index++) { 1579 struct scrub_page *spage = sblock->pagev + index; 1580 u64 l = min_t(u64, len, PAGE_SIZE); 1581 1582 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); 1583 spage->page = alloc_page(GFP_NOFS); 1584 if (!spage->page) { 1585 spin_lock(&sdev->stat_lock); 1586 sdev->stat.malloc_errors++; 1587 spin_unlock(&sdev->stat_lock); 1588 while (index > 0) { 1589 index--; 1590 __free_page(sblock->pagev[index].page); 1591 } 1592 kfree(sblock); 1593 return -ENOMEM; 1594 } 1595 spage->sblock = sblock; 1596 spage->dev = sdev->dev; 1597 spage->flags = flags; 1598 spage->generation = gen; 1599 spage->logical = logical; 1600 spage->physical = physical; 1601 spage->mirror_num = mirror_num; 1602 if (csum) { 1603 spage->have_csum = 1; 1604 memcpy(spage->csum, csum, sdev->csum_size); 1605 } else { 1606 spage->have_csum = 0; 1607 } 1608 sblock->page_count++; 1609 len -= l; 1610 logical += l; 1611 physical += l; 1612 } 1613 1614 BUG_ON(sblock->page_count == 0); 1615 for (index = 0; index < sblock->page_count; index++) { 1616 struct scrub_page *spage = sblock->pagev + index; 1617 int ret; 1618 1619 ret = scrub_add_page_to_bio(sdev, spage); 1620 if (ret) { 1621 scrub_block_put(sblock); 1622 return ret; 1623 } 1624 } 1625 1626 if (force) 1627 scrub_submit(sdev); 1628 1629 /* last one frees, either here or in bio completion for last page */ 1630 scrub_block_put(sblock); 1631 return 0; 1632 } 1633 1634 static void scrub_bio_end_io(struct bio *bio, int err) 1635 { 1636 struct scrub_bio *sbio = bio->bi_private; 1637 struct scrub_dev *sdev = sbio->sdev; 1638 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 1639 1640 sbio->err = err; 1641 sbio->bio = bio; 1642 1643 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); 1644 } 1645 1646 static void scrub_bio_end_io_worker(struct btrfs_work *work) 1647 { 1648 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); 1649 struct scrub_dev *sdev = sbio->sdev; 1650 int i; 1651 1652 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_BIO); 1653 if (sbio->err) { 1654 for (i = 0; i < sbio->page_count; i++) { 1655 struct scrub_page *spage = sbio->pagev[i]; 1656 1657 spage->io_error = 1; 1658 spage->sblock->no_io_error_seen = 0; 1659 } 1660 } 1661 1662 /* now complete the scrub_block items that have all pages completed */ 1663 for (i = 0; i < sbio->page_count; i++) { 1664 struct scrub_page *spage = sbio->pagev[i]; 1665 struct scrub_block *sblock = spage->sblock; 1666 1667 if (atomic_dec_and_test(&sblock->outstanding_pages)) 1668 scrub_block_complete(sblock); 1669 scrub_block_put(sblock); 1670 } 1671 1672 bio_put(sbio->bio); 1673 sbio->bio = NULL; 1674 spin_lock(&sdev->list_lock); 1675 sbio->next_free = sdev->first_free; 1676 sdev->first_free = sbio->index; 1677 spin_unlock(&sdev->list_lock); 1678 atomic_dec(&sdev->in_flight); 1679 wake_up(&sdev->list_wait); 1680 } 1681 1682 static void scrub_block_complete(struct scrub_block *sblock) 1683 { 1684 if (!sblock->no_io_error_seen) 1685 scrub_handle_errored_block(sblock); 1686 else 1687 scrub_checksum(sblock); 1688 } 1689 1690 static int 
scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, 1691 u8 *csum) 1692 { 1693 struct btrfs_ordered_sum *sum = NULL; 1694 int ret = 0; 1695 unsigned long i; 1696 unsigned long num_sectors; 1697 1698 while (!list_empty(&sdev->csum_list)) { 1699 sum = list_first_entry(&sdev->csum_list, 1700 struct btrfs_ordered_sum, list); 1701 if (sum->bytenr > logical) 1702 return 0; 1703 if (sum->bytenr + sum->len > logical) 1704 break; 1705 1706 ++sdev->stat.csum_discards; 1707 list_del(&sum->list); 1708 kfree(sum); 1709 sum = NULL; 1710 } 1711 if (!sum) 1712 return 0; 1713 1714 num_sectors = sum->len / sdev->sectorsize; 1715 for (i = 0; i < num_sectors; ++i) { 1716 if (sum->sums[i].bytenr == logical) { 1717 memcpy(csum, &sum->sums[i].sum, sdev->csum_size); 1718 ret = 1; 1719 break; 1720 } 1721 } 1722 if (ret && i == num_sectors - 1) { 1723 list_del(&sum->list); 1724 kfree(sum); 1725 } 1726 return ret; 1727 } 1728 1729 /* scrub extent tries to collect up to 64 kB for each bio */ 1730 static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 1731 u64 physical, u64 flags, u64 gen, int mirror_num) 1732 { 1733 int ret; 1734 u8 csum[BTRFS_CSUM_SIZE]; 1735 u32 blocksize; 1736 1737 if (flags & BTRFS_EXTENT_FLAG_DATA) { 1738 blocksize = sdev->sectorsize; 1739 spin_lock(&sdev->stat_lock); 1740 sdev->stat.data_extents_scrubbed++; 1741 sdev->stat.data_bytes_scrubbed += len; 1742 spin_unlock(&sdev->stat_lock); 1743 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 1744 BUG_ON(sdev->nodesize != sdev->leafsize); 1745 blocksize = sdev->nodesize; 1746 spin_lock(&sdev->stat_lock); 1747 sdev->stat.tree_extents_scrubbed++; 1748 sdev->stat.tree_bytes_scrubbed += len; 1749 spin_unlock(&sdev->stat_lock); 1750 } else { 1751 blocksize = sdev->sectorsize; 1752 BUG_ON(1); 1753 } 1754 1755 while (len) { 1756 u64 l = min_t(u64, len, blocksize); 1757 int have_csum = 0; 1758 1759 if (flags & BTRFS_EXTENT_FLAG_DATA) { 1760 /* push csums to sbio */ 1761 have_csum = scrub_find_csum(sdev, logical, l, csum); 1762 if (have_csum == 0) 1763 ++sdev->stat.no_csum; 1764 } 1765 ret = scrub_pages(sdev, logical, l, physical, flags, gen, 1766 mirror_num, have_csum ? 
csum : NULL, 0); 1767 if (ret) 1768 return ret; 1769 len -= l; 1770 logical += l; 1771 physical += l; 1772 } 1773 return 0; 1774 } 1775 1776 static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, 1777 struct map_lookup *map, int num, u64 base, u64 length) 1778 { 1779 struct btrfs_path *path; 1780 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 1781 struct btrfs_root *root = fs_info->extent_root; 1782 struct btrfs_root *csum_root = fs_info->csum_root; 1783 struct btrfs_extent_item *extent; 1784 struct blk_plug plug; 1785 u64 flags; 1786 int ret; 1787 int slot; 1788 int i; 1789 u64 nstripes; 1790 struct extent_buffer *l; 1791 struct btrfs_key key; 1792 u64 physical; 1793 u64 logical; 1794 u64 generation; 1795 int mirror_num; 1796 struct reada_control *reada1; 1797 struct reada_control *reada2; 1798 struct btrfs_key key_start; 1799 struct btrfs_key key_end; 1800 1801 u64 increment = map->stripe_len; 1802 u64 offset; 1803 1804 nstripes = length; 1805 offset = 0; 1806 do_div(nstripes, map->stripe_len); 1807 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 1808 offset = map->stripe_len * num; 1809 increment = map->stripe_len * map->num_stripes; 1810 mirror_num = 1; 1811 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 1812 int factor = map->num_stripes / map->sub_stripes; 1813 offset = map->stripe_len * (num / map->sub_stripes); 1814 increment = map->stripe_len * factor; 1815 mirror_num = num % map->sub_stripes + 1; 1816 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 1817 increment = map->stripe_len; 1818 mirror_num = num % map->num_stripes + 1; 1819 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 1820 increment = map->stripe_len; 1821 mirror_num = num % map->num_stripes + 1; 1822 } else { 1823 increment = map->stripe_len; 1824 mirror_num = 1; 1825 } 1826 1827 path = btrfs_alloc_path(); 1828 if (!path) 1829 return -ENOMEM; 1830 1831 /* 1832 * work on commit root. The related disk blocks are static as 1833 * long as COW is applied. This means, it is save to rewrite 1834 * them to repair disk errors without any race conditions 1835 */ 1836 path->search_commit_root = 1; 1837 path->skip_locking = 1; 1838 1839 /* 1840 * trigger the readahead for extent tree csum tree and wait for 1841 * completion. 

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * work on the commit root. The related disk blocks are static as
	 * long as COW is applied. This means it is safe to rewrite them
	 * to repair disk errors without any race conditions.
	 */
	path->search_commit_root = 1;
	path->skip_locking = 1;

	/*
	 * trigger the readahead for the extent tree and the csum tree and
	 * wait for completion. During readahead, the scrub is officially
	 * paused to not hold off transaction commits.
	 */
	logical = base + offset;

	wait_event(sdev->list_wait,
		   atomic_read(&sdev->in_flight) == 0);
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);

	/* FIXME it might be better to start readahead at commit root */
	key_start.objectid = logical;
	key_start.type = BTRFS_EXTENT_ITEM_KEY;
	key_start.offset = (u64)0;
	key_end.objectid = base + offset + nstripes * increment;
	key_end.type = BTRFS_EXTENT_ITEM_KEY;
	key_end.offset = (u64)0;
	reada1 = btrfs_reada_add(root, &key_start, &key_end);

	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	key_start.type = BTRFS_EXTENT_CSUM_KEY;
	key_start.offset = logical;
	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
	key_end.type = BTRFS_EXTENT_CSUM_KEY;
	key_end.offset = base + offset + nstripes * increment;
	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);

	if (!IS_ERR(reada1))
		btrfs_reada_wait(reada1);
	if (!IS_ERR(reada2))
		btrfs_reada_wait(reada2);

	mutex_lock(&fs_info->scrub_lock);
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);
	wake_up(&fs_info->scrub_pause_wait);

	/*
	 * collect all data csums for the stripe to avoid seeking during
	 * the scrub. This might currently (crc32) end up being about 1MB.
	 */
	blk_start_plug(&plug);

	/*
	 * now find all extents for each stripe and scrub them
	 */
	logical = base + offset;
	physical = map->stripes[num].physical;
	ret = 0;
	for (i = 0; i < nstripes; ++i) {
		/*
		 * canceled?
		 */
		if (atomic_read(&fs_info->scrub_cancel_req) ||
		    atomic_read(&sdev->cancel_req)) {
			ret = -ECANCELED;
			goto out;
		}
		/*
		 * check to see if we have to pause
		 */
		if (atomic_read(&fs_info->scrub_pause_req)) {
			/* push queued extents */
			scrub_submit(sdev);
			wait_event(sdev->list_wait,
				   atomic_read(&sdev->in_flight) == 0);
			atomic_inc(&fs_info->scrubs_paused);
			wake_up(&fs_info->scrub_pause_wait);
			mutex_lock(&fs_info->scrub_lock);
			while (atomic_read(&fs_info->scrub_pause_req)) {
				mutex_unlock(&fs_info->scrub_lock);
				wait_event(fs_info->scrub_pause_wait,
					   atomic_read(&fs_info->scrub_pause_req) == 0);
				mutex_lock(&fs_info->scrub_lock);
			}
			atomic_dec(&fs_info->scrubs_paused);
			mutex_unlock(&fs_info->scrub_lock);
			wake_up(&fs_info->scrub_pause_wait);
		}

		ret = btrfs_lookup_csums_range(csum_root, logical,
					       logical + map->stripe_len - 1,
					       &sdev->csum_list, 1);
		if (ret)
			goto out;

		key.objectid = logical;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = (u64)0;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = btrfs_previous_item(root, path, 0,
						  BTRFS_EXTENT_ITEM_KEY);
			if (ret < 0)
				goto out;
			if (ret > 0) {
				/* there's no smaller item, so stick with the
				 * larger one */
				btrfs_release_path(path);
				ret = btrfs_search_slot(NULL, root, &key,
							path, 0, 0);
				if (ret < 0)
					goto out;
			}
		}

		while (1) {
			l = path->nodes[0];
			slot = path->slots[0];
			if (slot >= btrfs_header_nritems(l)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				if (ret < 0)
					goto out;

				break;
			}
			btrfs_item_key_to_cpu(l, &key, slot);

			if (key.objectid + key.offset <= logical)
				goto next;

			if (key.objectid >= logical + map->stripe_len)
				break;

			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
				goto next;

			extent = btrfs_item_ptr(l, slot,
						struct btrfs_extent_item);
			flags = btrfs_extent_flags(l, extent);
			generation = btrfs_extent_generation(l, extent);

			if (key.objectid < logical &&
			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
				printk(KERN_ERR
				       "btrfs scrub: tree block %llu spanning "
				       "stripes, ignored. logical=%llu\n",
				       (unsigned long long)key.objectid,
				       (unsigned long long)logical);
				goto next;
			}

			/*
			 * trim extent to this stripe
			 */
			if (key.objectid < logical) {
				key.offset -= logical - key.objectid;
				key.objectid = logical;
			}
			if (key.objectid + key.offset >
			    logical + map->stripe_len) {
				key.offset = logical + map->stripe_len -
					     key.objectid;
			}

			ret = scrub_extent(sdev, key.objectid, key.offset,
					   key.objectid - logical + physical,
					   flags, generation, mirror_num);
			if (ret)
				goto out;

next:
			path->slots[0]++;
		}
		btrfs_release_path(path);
		logical += increment;
		physical += map->stripe_len;
		spin_lock(&sdev->stat_lock);
		sdev->stat.last_physical = physical;
		spin_unlock(&sdev->stat_lock);
	}
	/* push queued extents */
	scrub_submit(sdev);

out:
	blk_finish_plug(&plug);
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}
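
/*
 * Editor's note (summary added for clarity, not in the original source):
 * the pause handshake used in scrub_stripe() above works in pairs of
 * counters. A waiter (e.g. a transaction commit via btrfs_scrub_pause()
 * below) raises scrub_pause_req and waits until scrubs_paused equals
 * scrubs_running; each scrubber pushes its queued bios, waits for
 * in_flight to reach zero, bumps scrubs_paused and then blocks until
 * scrub_pause_req drops back to zero before decrementing scrubs_paused
 * and continuing.
 */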

static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length,
	u64 dev_offset)
{
	struct btrfs_mapping_tree *map_tree =
		&sdev->dev->dev_root->fs_info->mapping_tree;
	struct map_lookup *map;
	struct extent_map *em;
	int i;
	int ret = -EINVAL;

	read_lock(&map_tree->map_tree.lock);
	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
	read_unlock(&map_tree->map_tree.lock);

	if (!em)
		return -EINVAL;

	map = (struct map_lookup *)em->bdev;
	if (em->start != chunk_offset)
		goto out;

	if (em->len < length)
		goto out;

	for (i = 0; i < map->num_stripes; ++i) {
		if (map->stripes[i].dev == sdev->dev &&
		    map->stripes[i].physical == dev_offset) {
			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
			if (ret)
				goto out;
		}
	}
out:
	free_extent_map(em);

	return ret;
}

static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
{
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = sdev->dev->dev_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 length;
	u64 chunk_tree;
	u64 chunk_objectid;
	u64 chunk_offset;
	int ret;
	int slot;
	struct extent_buffer *l;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_block_group_cache *cache;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = 2;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = sdev->dev->devid;
	key.offset = 0ull;
	key.type = BTRFS_DEV_EXTENT_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			break;
		if (ret > 0) {
			if (path->slots[0] >=
			    btrfs_header_nritems(path->nodes[0])) {
				ret = btrfs_next_leaf(root, path);
				if (ret)
					break;
			}
		}

		l = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.objectid != sdev->dev->devid)
			break;

		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
			break;

		if (found_key.offset >= end)
			break;

		if (found_key.offset < key.offset)
			break;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

		if (found_key.offset + length <= start) {
			key.offset = found_key.offset + length;
			btrfs_release_path(path);
			continue;
		}

		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

		/*
		 * get a reference on the corresponding block group to prevent
		 * the chunk from going away while we scrub it
		 */
		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
		if (!cache) {
			ret = -ENOENT;
			break;
		}
		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
				  chunk_offset, length, found_key.offset);
		btrfs_put_block_group(cache);
		if (ret)
			break;

		key.offset = found_key.offset + length;
		btrfs_release_path(path);
	}

	btrfs_free_path(path);

	/*
	 * ret can still be 1 from search_slot or next_leaf,
	 * that's not an error
	 */
	return ret < 0 ? ret : 0;
}
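
/*
 * Editor's note (illustrative summary, not in the original source): the
 * walk above is driven purely by this device's DEV_EXTENT items. Each dev
 * extent records which chunk it backs (chunk_offset) and where on the
 * device it starts (found_key.offset); scrub_chunk() maps that back to a
 * stripe index by matching device and physical offset, so only the stripes
 * that actually live on this device get scrubbed.
 */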

static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
{
	int i;
	u64 bytenr;
	u64 gen;
	int ret;
	struct btrfs_device *device = sdev->dev;
	struct btrfs_root *root = device->dev_root;

	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
		return -EIO;

	gen = root->fs_info->last_trans_committed;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
			break;

		ret = scrub_pages(sdev, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
				  BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
		if (ret)
			return ret;
	}
	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);

	return 0;
}

/*
 * get a reference count on fs_info->scrub_workers. start the workers
 * if necessary
 */
static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	mutex_lock(&fs_info->scrub_lock);
	if (fs_info->scrub_workers_refcnt == 0) {
		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
			   fs_info->thread_pool_size, &fs_info->generic_worker);
		fs_info->scrub_workers.idle_thresh = 4;
		ret = btrfs_start_workers(&fs_info->scrub_workers);
		if (ret)
			goto out;
	}
	++fs_info->scrub_workers_refcnt;
out:
	mutex_unlock(&fs_info->scrub_lock);

	return ret;
}

static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	if (--fs_info->scrub_workers_refcnt == 0)
		btrfs_stop_workers(&fs_info->scrub_workers);
	WARN_ON(fs_info->scrub_workers_refcnt < 0);
	mutex_unlock(&fs_info->scrub_lock);
}

int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
		    struct btrfs_scrub_progress *progress, int readonly)
{
	struct scrub_dev *sdev;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	struct btrfs_device *dev;

	if (btrfs_fs_closing(root->fs_info))
		return -EINVAL;

	/*
	 * check some assumptions
	 */
	if (root->nodesize != root->leafsize) {
		printk(KERN_ERR
		       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
		       root->nodesize, root->leafsize);
		return -EINVAL;
	}

	if (root->nodesize > BTRFS_STRIPE_LEN) {
		/*
		 * in this case scrub is unable to calculate the checksum
		 * the way it is implemented. Do not handle this
		 * situation at all because it won't ever happen.
		 */
		printk(KERN_ERR
		       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
		       root->nodesize, BTRFS_STRIPE_LEN);
		return -EINVAL;
	}

	if (root->sectorsize != PAGE_SIZE) {
		/* not supported for data w/o checksums */
		printk(KERN_ERR
		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
		       root->sectorsize, (unsigned long long)PAGE_SIZE);
		return -EINVAL;
	}

	ret = scrub_workers_get(root);
	if (ret)
		return ret;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev || dev->missing) {
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}
	mutex_lock(&fs_info->scrub_lock);

	if (!dev->in_fs_metadata) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -ENODEV;
	}

	if (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return -EINPROGRESS;
	}
	sdev = scrub_setup_dev(dev);
	if (IS_ERR(sdev)) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
		scrub_workers_put(root);
		return PTR_ERR(sdev);
	}
	sdev->readonly = readonly;
	dev->scrub_device = sdev;

	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	down_read(&fs_info->scrub_super_lock);
	ret = scrub_supers(sdev);
	up_read(&fs_info->scrub_super_lock);

	if (!ret)
		ret = scrub_enumerate_chunks(sdev, start, end);

	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);

	if (progress)
		memcpy(progress, &sdev->stat, sizeof(*progress));

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_device = NULL;
	mutex_unlock(&fs_info->scrub_lock);

	scrub_free_dev(sdev);
	scrub_workers_put(root);

	return ret;
}

void btrfs_scrub_pause(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->scrub_lock);
	atomic_inc(&fs_info->scrub_pause_req);
	while (atomic_read(&fs_info->scrubs_paused) !=
	       atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_paused) ==
			   atomic_read(&fs_info->scrubs_running));
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);
}

void btrfs_scrub_continue(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	atomic_dec(&fs_info->scrub_pause_req);
	wake_up(&fs_info->scrub_pause_wait);
}

void btrfs_scrub_pause_super(struct btrfs_root *root)
{
	down_write(&root->fs_info->scrub_super_lock);
}

void btrfs_scrub_continue_super(struct btrfs_root *root)
{
	up_write(&root->fs_info->scrub_super_lock);
}

int __btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	if (!atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}

	atomic_inc(&fs_info->scrub_cancel_req);
	while (atomic_read(&fs_info->scrubs_running)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrubs_running) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
	atomic_dec(&fs_info->scrub_cancel_req);
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_cancel(struct btrfs_root *root)
{
	return __btrfs_scrub_cancel(root->fs_info);
}

int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct scrub_dev *sdev;

	mutex_lock(&fs_info->scrub_lock);
	sdev = dev->scrub_device;
	if (!sdev) {
		mutex_unlock(&fs_info->scrub_lock);
		return -ENOTCONN;
	}
	atomic_inc(&sdev->cancel_req);
	while (dev->scrub_device) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   dev->scrub_device == NULL);
		mutex_lock(&fs_info->scrub_lock);
	}
	mutex_unlock(&fs_info->scrub_lock);

	return 0;
}

int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_device *dev;
	int ret;

	/*
	 * we have to hold the device_list_mutex here so the device
	 * does not go away in cancel_dev. FIXME: find a better solution
	 */
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (!dev) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return -ENODEV;
	}
	ret = btrfs_scrub_cancel_dev(root, dev);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return ret;
}

int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_device *dev;
	struct scrub_dev *sdev = NULL;

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(root, devid, NULL, NULL);
	if (dev)
		sdev = dev->scrub_device;
	if (sdev)
		memcpy(progress, &sdev->stat, sizeof(*progress));
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
}
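
/*
 * Editor's note (illustrative sketch, not part of the original source):
 * the entry points above are intended to be driven from the scrub ioctls,
 * roughly along these lines, assuming a full-device, read-write scrub and
 * a caller-provided progress struct:
 *
 *	struct btrfs_scrub_progress progress = {0};
 *	int err = btrfs_scrub_dev(root, devid, 0, (u64)-1, &progress, 0);
 *	if (err == -EINPROGRESS)
 *		;	// a scrub is already running on this device
 *
 * btrfs_scrub_progress() and btrfs_scrub_cancel_devid() can then be used
 * to poll or abort a running scrub for the same devid.
 */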