1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) STRATO AG 2012. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/bio.h> 8 #include <linux/slab.h> 9 #include <linux/blkdev.h> 10 #include <linux/kthread.h> 11 #include <linux/math64.h> 12 #include "misc.h" 13 #include "ctree.h" 14 #include "extent_map.h" 15 #include "disk-io.h" 16 #include "transaction.h" 17 #include "print-tree.h" 18 #include "volumes.h" 19 #include "async-thread.h" 20 #include "check-integrity.h" 21 #include "rcu-string.h" 22 #include "dev-replace.h" 23 #include "sysfs.h" 24 #include "zoned.h" 25 #include "block-group.h" 26 #include "fs.h" 27 28 /* 29 * Device replace overview 30 * 31 * [Objective] 32 * To copy all extents (both new and on-disk) from source device to target 33 * device, while still keeping the filesystem read-write. 34 * 35 * [Method] 36 * There are two main methods involved: 37 * 38 * - Write duplication 39 * 40 * All new writes will be written to both target and source devices, so even 41 * if replace gets canceled, sources device still contains up-to-date data. 42 * 43 * Location: handle_ops_on_dev_replace() from __btrfs_map_block() 44 * Start: btrfs_dev_replace_start() 45 * End: btrfs_dev_replace_finishing() 46 * Content: Latest data/metadata 47 * 48 * - Copy existing extents 49 * 50 * This happens by re-using scrub facility, as scrub also iterates through 51 * existing extents from commit root. 52 * 53 * Location: scrub_write_block_to_dev_replace() from 54 * scrub_block_complete() 55 * Content: Data/meta from commit root. 56 * 57 * Due to the content difference, we need to avoid nocow write when dev-replace 58 * is happening. This is done by marking the block group read-only and waiting 59 * for NOCOW writes. 60 * 61 * After replace is done, the finishing part is done by swapping the target and 62 * source devices. 63 * 64 * Location: btrfs_dev_replace_update_device_in_mapping_tree() from 65 * btrfs_dev_replace_finishing() 66 */ 67 68 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 69 int scrub_ret); 70 static int btrfs_dev_replace_kthread(void *data); 71 72 int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) 73 { 74 struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID }; 75 struct btrfs_key key; 76 struct btrfs_root *dev_root = fs_info->dev_root; 77 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 78 struct extent_buffer *eb; 79 int slot; 80 int ret = 0; 81 struct btrfs_path *path = NULL; 82 int item_size; 83 struct btrfs_dev_replace_item *ptr; 84 u64 src_devid; 85 86 if (!dev_root) 87 return 0; 88 89 path = btrfs_alloc_path(); 90 if (!path) { 91 ret = -ENOMEM; 92 goto out; 93 } 94 95 key.objectid = 0; 96 key.type = BTRFS_DEV_REPLACE_KEY; 97 key.offset = 0; 98 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 99 if (ret) { 100 no_valid_dev_replace_entry_found: 101 /* 102 * We don't have a replace item or it's corrupted. If there is 103 * a replace target, fail the mount. 104 */ 105 if (btrfs_find_device(fs_info->fs_devices, &args)) { 106 btrfs_err(fs_info, 107 "found replace target device without a valid replace item"); 108 ret = -EUCLEAN; 109 goto out; 110 } 111 ret = 0; 112 dev_replace->replace_state = 113 BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; 114 dev_replace->cont_reading_from_srcdev_mode = 115 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; 116 dev_replace->time_started = 0; 117 dev_replace->time_stopped = 0; 118 atomic64_set(&dev_replace->num_write_errors, 0); 119 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); 120 dev_replace->cursor_left = 0; 121 dev_replace->committed_cursor_left = 0; 122 dev_replace->cursor_left_last_write_of_item = 0; 123 dev_replace->cursor_right = 0; 124 dev_replace->srcdev = NULL; 125 dev_replace->tgtdev = NULL; 126 dev_replace->is_valid = 0; 127 dev_replace->item_needs_writeback = 0; 128 goto out; 129 } 130 slot = path->slots[0]; 131 eb = path->nodes[0]; 132 item_size = btrfs_item_size(eb, slot); 133 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); 134 135 if (item_size != sizeof(struct btrfs_dev_replace_item)) { 136 btrfs_warn(fs_info, 137 "dev_replace entry found has unexpected size, ignore entry"); 138 goto no_valid_dev_replace_entry_found; 139 } 140 141 src_devid = btrfs_dev_replace_src_devid(eb, ptr); 142 dev_replace->cont_reading_from_srcdev_mode = 143 btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); 144 dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); 145 dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); 146 dev_replace->time_stopped = 147 btrfs_dev_replace_time_stopped(eb, ptr); 148 atomic64_set(&dev_replace->num_write_errors, 149 btrfs_dev_replace_num_write_errors(eb, ptr)); 150 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 151 btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); 152 dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr); 153 dev_replace->committed_cursor_left = dev_replace->cursor_left; 154 dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; 155 dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); 156 dev_replace->is_valid = 1; 157 158 dev_replace->item_needs_writeback = 0; 159 switch (dev_replace->replace_state) { 160 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 161 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 162 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 163 /* 164 * We don't have an active replace item but if there is a 165 * replace target, fail the mount. 166 */ 167 if (btrfs_find_device(fs_info->fs_devices, &args)) { 168 btrfs_err(fs_info, 169 "replace without active item, run 'device scan --forget' on the target device"); 170 ret = -EUCLEAN; 171 } else { 172 dev_replace->srcdev = NULL; 173 dev_replace->tgtdev = NULL; 174 } 175 break; 176 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 177 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 178 dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args); 179 args.devid = src_devid; 180 dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args); 181 182 /* 183 * allow 'btrfs dev replace_cancel' if src/tgt device is 184 * missing 185 */ 186 if (!dev_replace->srcdev && 187 !btrfs_test_opt(fs_info, DEGRADED)) { 188 ret = -EIO; 189 btrfs_warn(fs_info, 190 "cannot mount because device replace operation is ongoing and"); 191 btrfs_warn(fs_info, 192 "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", 193 src_devid); 194 } 195 if (!dev_replace->tgtdev && 196 !btrfs_test_opt(fs_info, DEGRADED)) { 197 ret = -EIO; 198 btrfs_warn(fs_info, 199 "cannot mount because device replace operation is ongoing and"); 200 btrfs_warn(fs_info, 201 "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?", 202 BTRFS_DEV_REPLACE_DEVID); 203 } 204 if (dev_replace->tgtdev) { 205 if (dev_replace->srcdev) { 206 dev_replace->tgtdev->total_bytes = 207 dev_replace->srcdev->total_bytes; 208 dev_replace->tgtdev->disk_total_bytes = 209 dev_replace->srcdev->disk_total_bytes; 210 dev_replace->tgtdev->commit_total_bytes = 211 dev_replace->srcdev->commit_total_bytes; 212 dev_replace->tgtdev->bytes_used = 213 dev_replace->srcdev->bytes_used; 214 dev_replace->tgtdev->commit_bytes_used = 215 dev_replace->srcdev->commit_bytes_used; 216 } 217 set_bit(BTRFS_DEV_STATE_REPLACE_TGT, 218 &dev_replace->tgtdev->dev_state); 219 220 WARN_ON(fs_info->fs_devices->rw_devices == 0); 221 dev_replace->tgtdev->io_width = fs_info->sectorsize; 222 dev_replace->tgtdev->io_align = fs_info->sectorsize; 223 dev_replace->tgtdev->sector_size = fs_info->sectorsize; 224 dev_replace->tgtdev->fs_info = fs_info; 225 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 226 &dev_replace->tgtdev->dev_state); 227 } 228 break; 229 } 230 231 out: 232 btrfs_free_path(path); 233 return ret; 234 } 235 236 /* 237 * Initialize a new device for device replace target from a given source dev 238 * and path. 239 * 240 * Return 0 and new device in @device_out, otherwise return < 0 241 */ 242 static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 243 const char *device_path, 244 struct btrfs_device *srcdev, 245 struct btrfs_device **device_out) 246 { 247 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 248 struct btrfs_device *device; 249 struct block_device *bdev; 250 struct rcu_string *name; 251 u64 devid = BTRFS_DEV_REPLACE_DEVID; 252 int ret = 0; 253 254 *device_out = NULL; 255 if (srcdev->fs_devices->seeding) { 256 btrfs_err(fs_info, "the filesystem is a seed filesystem!"); 257 return -EINVAL; 258 } 259 260 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 261 fs_info->bdev_holder); 262 if (IS_ERR(bdev)) { 263 btrfs_err(fs_info, "target device %s is invalid!", device_path); 264 return PTR_ERR(bdev); 265 } 266 267 if (!btrfs_check_device_zone_type(fs_info, bdev)) { 268 btrfs_err(fs_info, 269 "dev-replace: zoned type of target device mismatch with filesystem"); 270 ret = -EINVAL; 271 goto error; 272 } 273 274 sync_blockdev(bdev); 275 276 list_for_each_entry(device, &fs_devices->devices, dev_list) { 277 if (device->bdev == bdev) { 278 btrfs_err(fs_info, 279 "target device is in the filesystem!"); 280 ret = -EEXIST; 281 goto error; 282 } 283 } 284 285 286 if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) { 287 btrfs_err(fs_info, 288 "target device is smaller than source device!"); 289 ret = -EINVAL; 290 goto error; 291 } 292 293 294 device = btrfs_alloc_device(NULL, &devid, NULL); 295 if (IS_ERR(device)) { 296 ret = PTR_ERR(device); 297 goto error; 298 } 299 300 name = rcu_string_strdup(device_path, GFP_KERNEL); 301 if (!name) { 302 btrfs_free_device(device); 303 ret = -ENOMEM; 304 goto error; 305 } 306 rcu_assign_pointer(device->name, name); 307 ret = lookup_bdev(device_path, &device->devt); 308 if (ret) 309 goto error; 310 311 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 312 device->generation = 0; 313 device->io_width = fs_info->sectorsize; 314 device->io_align = fs_info->sectorsize; 315 device->sector_size = fs_info->sectorsize; 316 device->total_bytes = btrfs_device_get_total_bytes(srcdev); 317 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); 318 device->bytes_used = btrfs_device_get_bytes_used(srcdev); 319 device->commit_total_bytes = srcdev->commit_total_bytes; 320 device->commit_bytes_used = device->bytes_used; 321 device->fs_info = fs_info; 322 device->bdev = bdev; 323 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 324 set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 325 device->mode = FMODE_EXCL; 326 device->dev_stats_valid = 1; 327 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 328 device->fs_devices = fs_devices; 329 330 ret = btrfs_get_dev_zone_info(device, false); 331 if (ret) 332 goto error; 333 334 mutex_lock(&fs_devices->device_list_mutex); 335 list_add(&device->dev_list, &fs_devices->devices); 336 fs_devices->num_devices++; 337 fs_devices->open_devices++; 338 mutex_unlock(&fs_devices->device_list_mutex); 339 340 *device_out = device; 341 return 0; 342 343 error: 344 blkdev_put(bdev, FMODE_EXCL); 345 return ret; 346 } 347 348 /* 349 * called from commit_transaction. Writes changed device replace state to 350 * disk. 351 */ 352 int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) 353 { 354 struct btrfs_fs_info *fs_info = trans->fs_info; 355 int ret; 356 struct btrfs_root *dev_root = fs_info->dev_root; 357 struct btrfs_path *path; 358 struct btrfs_key key; 359 struct extent_buffer *eb; 360 struct btrfs_dev_replace_item *ptr; 361 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 362 363 down_read(&dev_replace->rwsem); 364 if (!dev_replace->is_valid || 365 !dev_replace->item_needs_writeback) { 366 up_read(&dev_replace->rwsem); 367 return 0; 368 } 369 up_read(&dev_replace->rwsem); 370 371 key.objectid = 0; 372 key.type = BTRFS_DEV_REPLACE_KEY; 373 key.offset = 0; 374 375 path = btrfs_alloc_path(); 376 if (!path) { 377 ret = -ENOMEM; 378 goto out; 379 } 380 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 381 if (ret < 0) { 382 btrfs_warn(fs_info, 383 "error %d while searching for dev_replace item!", 384 ret); 385 goto out; 386 } 387 388 if (ret == 0 && 389 btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 390 /* 391 * need to delete old one and insert a new one. 392 * Since no attempt is made to recover any old state, if the 393 * dev_replace state is 'running', the data on the target 394 * drive is lost. 395 * It would be possible to recover the state: just make sure 396 * that the beginning of the item is never changed and always 397 * contains all the essential information. Then read this 398 * minimal set of information and use it as a base for the 399 * new state. 400 */ 401 ret = btrfs_del_item(trans, dev_root, path); 402 if (ret != 0) { 403 btrfs_warn(fs_info, 404 "delete too small dev_replace item failed %d!", 405 ret); 406 goto out; 407 } 408 ret = 1; 409 } 410 411 if (ret == 1) { 412 /* need to insert a new item */ 413 btrfs_release_path(path); 414 ret = btrfs_insert_empty_item(trans, dev_root, path, 415 &key, sizeof(*ptr)); 416 if (ret < 0) { 417 btrfs_warn(fs_info, 418 "insert dev_replace item failed %d!", ret); 419 goto out; 420 } 421 } 422 423 eb = path->nodes[0]; 424 ptr = btrfs_item_ptr(eb, path->slots[0], 425 struct btrfs_dev_replace_item); 426 427 down_write(&dev_replace->rwsem); 428 if (dev_replace->srcdev) 429 btrfs_set_dev_replace_src_devid(eb, ptr, 430 dev_replace->srcdev->devid); 431 else 432 btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); 433 btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, 434 dev_replace->cont_reading_from_srcdev_mode); 435 btrfs_set_dev_replace_replace_state(eb, ptr, 436 dev_replace->replace_state); 437 btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); 438 btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); 439 btrfs_set_dev_replace_num_write_errors(eb, ptr, 440 atomic64_read(&dev_replace->num_write_errors)); 441 btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, 442 atomic64_read(&dev_replace->num_uncorrectable_read_errors)); 443 dev_replace->cursor_left_last_write_of_item = 444 dev_replace->cursor_left; 445 btrfs_set_dev_replace_cursor_left(eb, ptr, 446 dev_replace->cursor_left_last_write_of_item); 447 btrfs_set_dev_replace_cursor_right(eb, ptr, 448 dev_replace->cursor_right); 449 dev_replace->item_needs_writeback = 0; 450 up_write(&dev_replace->rwsem); 451 452 btrfs_mark_buffer_dirty(eb); 453 454 out: 455 btrfs_free_path(path); 456 457 return ret; 458 } 459 460 static char* btrfs_dev_name(struct btrfs_device *device) 461 { 462 if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 463 return "<missing disk>"; 464 else 465 return rcu_str_deref(device->name); 466 } 467 468 static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, 469 struct btrfs_device *src_dev) 470 { 471 struct btrfs_path *path; 472 struct btrfs_key key; 473 struct btrfs_key found_key; 474 struct btrfs_root *root = fs_info->dev_root; 475 struct btrfs_dev_extent *dev_extent = NULL; 476 struct btrfs_block_group *cache; 477 struct btrfs_trans_handle *trans; 478 int iter_ret = 0; 479 int ret = 0; 480 u64 chunk_offset; 481 482 /* Do not use "to_copy" on non zoned filesystem for now */ 483 if (!btrfs_is_zoned(fs_info)) 484 return 0; 485 486 mutex_lock(&fs_info->chunk_mutex); 487 488 /* Ensure we don't have pending new block group */ 489 spin_lock(&fs_info->trans_lock); 490 while (fs_info->running_transaction && 491 !list_empty(&fs_info->running_transaction->dev_update_list)) { 492 spin_unlock(&fs_info->trans_lock); 493 mutex_unlock(&fs_info->chunk_mutex); 494 trans = btrfs_attach_transaction(root); 495 if (IS_ERR(trans)) { 496 ret = PTR_ERR(trans); 497 mutex_lock(&fs_info->chunk_mutex); 498 if (ret == -ENOENT) { 499 spin_lock(&fs_info->trans_lock); 500 continue; 501 } else { 502 goto unlock; 503 } 504 } 505 506 ret = btrfs_commit_transaction(trans); 507 mutex_lock(&fs_info->chunk_mutex); 508 if (ret) 509 goto unlock; 510 511 spin_lock(&fs_info->trans_lock); 512 } 513 spin_unlock(&fs_info->trans_lock); 514 515 path = btrfs_alloc_path(); 516 if (!path) { 517 ret = -ENOMEM; 518 goto unlock; 519 } 520 521 path->reada = READA_FORWARD; 522 path->search_commit_root = 1; 523 path->skip_locking = 1; 524 525 key.objectid = src_dev->devid; 526 key.type = BTRFS_DEV_EXTENT_KEY; 527 key.offset = 0; 528 529 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 530 struct extent_buffer *leaf = path->nodes[0]; 531 532 if (found_key.objectid != src_dev->devid) 533 break; 534 535 if (found_key.type != BTRFS_DEV_EXTENT_KEY) 536 break; 537 538 if (found_key.offset < key.offset) 539 break; 540 541 dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); 542 543 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent); 544 545 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 546 if (!cache) 547 continue; 548 549 set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); 550 btrfs_put_block_group(cache); 551 } 552 if (iter_ret < 0) 553 ret = iter_ret; 554 555 btrfs_free_path(path); 556 unlock: 557 mutex_unlock(&fs_info->chunk_mutex); 558 559 return ret; 560 } 561 562 bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, 563 struct btrfs_block_group *cache, 564 u64 physical) 565 { 566 struct btrfs_fs_info *fs_info = cache->fs_info; 567 struct extent_map *em; 568 struct map_lookup *map; 569 u64 chunk_offset = cache->start; 570 int num_extents, cur_extent; 571 int i; 572 573 /* Do not use "to_copy" on non zoned filesystem for now */ 574 if (!btrfs_is_zoned(fs_info)) 575 return true; 576 577 spin_lock(&cache->lock); 578 if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) { 579 spin_unlock(&cache->lock); 580 return true; 581 } 582 spin_unlock(&cache->lock); 583 584 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 585 ASSERT(!IS_ERR(em)); 586 map = em->map_lookup; 587 588 num_extents = 0; 589 cur_extent = 0; 590 for (i = 0; i < map->num_stripes; i++) { 591 /* We have more device extent to copy */ 592 if (srcdev != map->stripes[i].dev) 593 continue; 594 595 num_extents++; 596 if (physical == map->stripes[i].physical) 597 cur_extent = i; 598 } 599 600 free_extent_map(em); 601 602 if (num_extents > 1 && cur_extent < num_extents - 1) { 603 /* 604 * Has more stripes on this device. Keep this block group 605 * readonly until we finish all the stripes. 606 */ 607 return false; 608 } 609 610 /* Last stripe on this device */ 611 clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); 612 613 return true; 614 } 615 616 static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, 617 const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, 618 int read_src) 619 { 620 struct btrfs_root *root = fs_info->dev_root; 621 struct btrfs_trans_handle *trans; 622 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 623 int ret; 624 struct btrfs_device *tgt_device = NULL; 625 struct btrfs_device *src_device = NULL; 626 627 src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, 628 srcdev_name); 629 if (IS_ERR(src_device)) 630 return PTR_ERR(src_device); 631 632 if (btrfs_pinned_by_swapfile(fs_info, src_device)) { 633 btrfs_warn_in_rcu(fs_info, 634 "cannot replace device %s (devid %llu) due to active swapfile", 635 btrfs_dev_name(src_device), src_device->devid); 636 return -ETXTBSY; 637 } 638 639 /* 640 * Here we commit the transaction to make sure commit_total_bytes 641 * of all the devices are updated. 642 */ 643 trans = btrfs_attach_transaction(root); 644 if (!IS_ERR(trans)) { 645 ret = btrfs_commit_transaction(trans); 646 if (ret) 647 return ret; 648 } else if (PTR_ERR(trans) != -ENOENT) { 649 return PTR_ERR(trans); 650 } 651 652 ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, 653 src_device, &tgt_device); 654 if (ret) 655 return ret; 656 657 ret = mark_block_group_to_copy(fs_info, src_device); 658 if (ret) 659 return ret; 660 661 down_write(&dev_replace->rwsem); 662 switch (dev_replace->replace_state) { 663 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 664 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 665 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 666 break; 667 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 668 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 669 ASSERT(0); 670 ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; 671 up_write(&dev_replace->rwsem); 672 goto leave; 673 } 674 675 dev_replace->cont_reading_from_srcdev_mode = read_src; 676 dev_replace->srcdev = src_device; 677 dev_replace->tgtdev = tgt_device; 678 679 btrfs_info_in_rcu(fs_info, 680 "dev_replace from %s (devid %llu) to %s started", 681 btrfs_dev_name(src_device), 682 src_device->devid, 683 rcu_str_deref(tgt_device->name)); 684 685 /* 686 * from now on, the writes to the srcdev are all duplicated to 687 * go to the tgtdev as well (refer to btrfs_map_block()). 688 */ 689 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; 690 dev_replace->time_started = ktime_get_real_seconds(); 691 dev_replace->cursor_left = 0; 692 dev_replace->committed_cursor_left = 0; 693 dev_replace->cursor_left_last_write_of_item = 0; 694 dev_replace->cursor_right = 0; 695 dev_replace->is_valid = 1; 696 dev_replace->item_needs_writeback = 1; 697 atomic64_set(&dev_replace->num_write_errors, 0); 698 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); 699 up_write(&dev_replace->rwsem); 700 701 ret = btrfs_sysfs_add_device(tgt_device); 702 if (ret) 703 btrfs_err(fs_info, "kobj add dev failed %d", ret); 704 705 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); 706 707 /* 708 * Commit dev_replace state and reserve 1 item for it. 709 * This is crucial to ensure we won't miss copying extents for new block 710 * groups that are allocated after we started the device replace, and 711 * must be done after setting up the device replace state. 712 */ 713 trans = btrfs_start_transaction(root, 1); 714 if (IS_ERR(trans)) { 715 ret = PTR_ERR(trans); 716 down_write(&dev_replace->rwsem); 717 dev_replace->replace_state = 718 BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; 719 dev_replace->srcdev = NULL; 720 dev_replace->tgtdev = NULL; 721 up_write(&dev_replace->rwsem); 722 goto leave; 723 } 724 725 ret = btrfs_commit_transaction(trans); 726 WARN_ON(ret); 727 728 /* the disk copy procedure reuses the scrub code */ 729 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, 730 btrfs_device_get_total_bytes(src_device), 731 &dev_replace->scrub_progress, 0, 1); 732 733 ret = btrfs_dev_replace_finishing(fs_info, ret); 734 if (ret == -EINPROGRESS) 735 ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; 736 737 return ret; 738 739 leave: 740 btrfs_destroy_dev_replace_tgtdev(tgt_device); 741 return ret; 742 } 743 744 int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, 745 struct btrfs_ioctl_dev_replace_args *args) 746 { 747 int ret; 748 749 switch (args->start.cont_reading_from_srcdev_mode) { 750 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: 751 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: 752 break; 753 default: 754 return -EINVAL; 755 } 756 757 if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || 758 args->start.tgtdev_name[0] == '\0') 759 return -EINVAL; 760 761 ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name, 762 args->start.srcdevid, 763 args->start.srcdev_name, 764 args->start.cont_reading_from_srcdev_mode); 765 args->result = ret; 766 /* don't warn if EINPROGRESS, someone else might be running scrub */ 767 if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS || 768 ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR) 769 return 0; 770 771 return ret; 772 } 773 774 /* 775 * blocked until all in-flight bios operations are finished. 776 */ 777 static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) 778 { 779 set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); 780 wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum( 781 &fs_info->dev_replace.bio_counter)); 782 } 783 784 /* 785 * we have removed target device, it is safe to allow new bios request. 786 */ 787 static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) 788 { 789 clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); 790 wake_up(&fs_info->dev_replace.replace_wait); 791 } 792 793 /* 794 * When finishing the device replace, before swapping the source device with the 795 * target device we must update the chunk allocation state in the target device, 796 * as it is empty because replace works by directly copying the chunks and not 797 * through the normal chunk allocation path. 798 */ 799 static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, 800 struct btrfs_device *tgtdev) 801 { 802 struct extent_state *cached_state = NULL; 803 u64 start = 0; 804 u64 found_start; 805 u64 found_end; 806 int ret = 0; 807 808 lockdep_assert_held(&srcdev->fs_info->chunk_mutex); 809 810 while (!find_first_extent_bit(&srcdev->alloc_state, start, 811 &found_start, &found_end, 812 CHUNK_ALLOCATED, &cached_state)) { 813 ret = set_extent_bits(&tgtdev->alloc_state, found_start, 814 found_end, CHUNK_ALLOCATED); 815 if (ret) 816 break; 817 start = found_end + 1; 818 } 819 820 free_extent_state(cached_state); 821 return ret; 822 } 823 824 static void btrfs_dev_replace_update_device_in_mapping_tree( 825 struct btrfs_fs_info *fs_info, 826 struct btrfs_device *srcdev, 827 struct btrfs_device *tgtdev) 828 { 829 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 830 struct extent_map *em; 831 struct map_lookup *map; 832 u64 start = 0; 833 int i; 834 835 write_lock(&em_tree->lock); 836 do { 837 em = lookup_extent_mapping(em_tree, start, (u64)-1); 838 if (!em) 839 break; 840 map = em->map_lookup; 841 for (i = 0; i < map->num_stripes; i++) 842 if (srcdev == map->stripes[i].dev) 843 map->stripes[i].dev = tgtdev; 844 start = em->start + em->len; 845 free_extent_map(em); 846 } while (start); 847 write_unlock(&em_tree->lock); 848 } 849 850 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 851 int scrub_ret) 852 { 853 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 854 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 855 struct btrfs_device *tgt_device; 856 struct btrfs_device *src_device; 857 struct btrfs_root *root = fs_info->tree_root; 858 u8 uuid_tmp[BTRFS_UUID_SIZE]; 859 struct btrfs_trans_handle *trans; 860 int ret = 0; 861 862 /* don't allow cancel or unmount to disturb the finishing procedure */ 863 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 864 865 down_read(&dev_replace->rwsem); 866 /* was the operation canceled, or is it finished? */ 867 if (dev_replace->replace_state != 868 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) { 869 up_read(&dev_replace->rwsem); 870 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 871 return 0; 872 } 873 874 tgt_device = dev_replace->tgtdev; 875 src_device = dev_replace->srcdev; 876 up_read(&dev_replace->rwsem); 877 878 /* 879 * flush all outstanding I/O and inode extent mappings before the 880 * copy operation is declared as being finished 881 */ 882 ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); 883 if (ret) { 884 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 885 return ret; 886 } 887 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); 888 889 /* 890 * We have to use this loop approach because at this point src_device 891 * has to be available for transaction commit to complete, yet new 892 * chunks shouldn't be allocated on the device. 893 */ 894 while (1) { 895 trans = btrfs_start_transaction(root, 0); 896 if (IS_ERR(trans)) { 897 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 898 return PTR_ERR(trans); 899 } 900 ret = btrfs_commit_transaction(trans); 901 WARN_ON(ret); 902 903 /* Prevent write_all_supers() during the finishing procedure */ 904 mutex_lock(&fs_devices->device_list_mutex); 905 /* Prevent new chunks being allocated on the source device */ 906 mutex_lock(&fs_info->chunk_mutex); 907 908 if (!list_empty(&src_device->post_commit_list)) { 909 mutex_unlock(&fs_devices->device_list_mutex); 910 mutex_unlock(&fs_info->chunk_mutex); 911 } else { 912 break; 913 } 914 } 915 916 down_write(&dev_replace->rwsem); 917 dev_replace->replace_state = 918 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 919 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED; 920 dev_replace->tgtdev = NULL; 921 dev_replace->srcdev = NULL; 922 dev_replace->time_stopped = ktime_get_real_seconds(); 923 dev_replace->item_needs_writeback = 1; 924 925 /* 926 * Update allocation state in the new device and replace the old device 927 * with the new one in the mapping tree. 928 */ 929 if (!scrub_ret) { 930 scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device); 931 if (scrub_ret) 932 goto error; 933 btrfs_dev_replace_update_device_in_mapping_tree(fs_info, 934 src_device, 935 tgt_device); 936 } else { 937 if (scrub_ret != -ECANCELED) 938 btrfs_err_in_rcu(fs_info, 939 "btrfs_scrub_dev(%s, %llu, %s) failed %d", 940 btrfs_dev_name(src_device), 941 src_device->devid, 942 rcu_str_deref(tgt_device->name), scrub_ret); 943 error: 944 up_write(&dev_replace->rwsem); 945 mutex_unlock(&fs_info->chunk_mutex); 946 mutex_unlock(&fs_devices->device_list_mutex); 947 btrfs_rm_dev_replace_blocked(fs_info); 948 if (tgt_device) 949 btrfs_destroy_dev_replace_tgtdev(tgt_device); 950 btrfs_rm_dev_replace_unblocked(fs_info); 951 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 952 953 return scrub_ret; 954 } 955 956 btrfs_info_in_rcu(fs_info, 957 "dev_replace from %s (devid %llu) to %s finished", 958 btrfs_dev_name(src_device), 959 src_device->devid, 960 rcu_str_deref(tgt_device->name)); 961 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state); 962 tgt_device->devid = src_device->devid; 963 src_device->devid = BTRFS_DEV_REPLACE_DEVID; 964 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); 965 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); 966 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); 967 btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes); 968 btrfs_device_set_disk_total_bytes(tgt_device, 969 src_device->disk_total_bytes); 970 btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); 971 tgt_device->commit_bytes_used = src_device->bytes_used; 972 973 btrfs_assign_next_active_device(src_device, tgt_device); 974 975 list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); 976 fs_devices->rw_devices++; 977 978 up_write(&dev_replace->rwsem); 979 btrfs_rm_dev_replace_blocked(fs_info); 980 981 btrfs_rm_dev_replace_remove_srcdev(src_device); 982 983 btrfs_rm_dev_replace_unblocked(fs_info); 984 985 /* 986 * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will 987 * update on-disk dev stats value during commit transaction 988 */ 989 atomic_inc(&tgt_device->dev_stats_ccnt); 990 991 /* 992 * this is again a consistent state where no dev_replace procedure 993 * is running, the target device is part of the filesystem, the 994 * source device is not part of the filesystem anymore and its 1st 995 * superblock is scratched out so that it is no longer marked to 996 * belong to this filesystem. 997 */ 998 mutex_unlock(&fs_info->chunk_mutex); 999 mutex_unlock(&fs_devices->device_list_mutex); 1000 1001 /* replace the sysfs entry */ 1002 btrfs_sysfs_remove_device(src_device); 1003 btrfs_sysfs_update_devid(tgt_device); 1004 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) 1005 btrfs_scratch_superblocks(fs_info, src_device->bdev, 1006 src_device->name->str); 1007 1008 /* write back the superblocks */ 1009 trans = btrfs_start_transaction(root, 0); 1010 if (!IS_ERR(trans)) 1011 btrfs_commit_transaction(trans); 1012 1013 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 1014 1015 btrfs_rm_dev_replace_free_srcdev(src_device); 1016 1017 return 0; 1018 } 1019 1020 /* 1021 * Read progress of device replace status according to the state and last 1022 * stored position. The value format is the same as for 1023 * btrfs_dev_replace::progress_1000 1024 */ 1025 static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info) 1026 { 1027 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1028 u64 ret = 0; 1029 1030 switch (dev_replace->replace_state) { 1031 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 1032 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 1033 ret = 0; 1034 break; 1035 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 1036 ret = 1000; 1037 break; 1038 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 1039 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 1040 ret = div64_u64(dev_replace->cursor_left, 1041 div_u64(btrfs_device_get_total_bytes( 1042 dev_replace->srcdev), 1000)); 1043 break; 1044 } 1045 1046 return ret; 1047 } 1048 1049 void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, 1050 struct btrfs_ioctl_dev_replace_args *args) 1051 { 1052 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1053 1054 down_read(&dev_replace->rwsem); 1055 /* even if !dev_replace_is_valid, the values are good enough for 1056 * the replace_status ioctl */ 1057 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 1058 args->status.replace_state = dev_replace->replace_state; 1059 args->status.time_started = dev_replace->time_started; 1060 args->status.time_stopped = dev_replace->time_stopped; 1061 args->status.num_write_errors = 1062 atomic64_read(&dev_replace->num_write_errors); 1063 args->status.num_uncorrectable_read_errors = 1064 atomic64_read(&dev_replace->num_uncorrectable_read_errors); 1065 args->status.progress_1000 = btrfs_dev_replace_progress(fs_info); 1066 up_read(&dev_replace->rwsem); 1067 } 1068 1069 int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) 1070 { 1071 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1072 struct btrfs_device *tgt_device = NULL; 1073 struct btrfs_device *src_device = NULL; 1074 struct btrfs_trans_handle *trans; 1075 struct btrfs_root *root = fs_info->tree_root; 1076 int result; 1077 int ret; 1078 1079 if (sb_rdonly(fs_info->sb)) 1080 return -EROFS; 1081 1082 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 1083 down_write(&dev_replace->rwsem); 1084 switch (dev_replace->replace_state) { 1085 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 1086 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 1087 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 1088 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; 1089 up_write(&dev_replace->rwsem); 1090 break; 1091 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 1092 tgt_device = dev_replace->tgtdev; 1093 src_device = dev_replace->srcdev; 1094 up_write(&dev_replace->rwsem); 1095 ret = btrfs_scrub_cancel(fs_info); 1096 if (ret < 0) { 1097 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; 1098 } else { 1099 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 1100 /* 1101 * btrfs_dev_replace_finishing() will handle the 1102 * cleanup part 1103 */ 1104 btrfs_info_in_rcu(fs_info, 1105 "dev_replace from %s (devid %llu) to %s canceled", 1106 btrfs_dev_name(src_device), src_device->devid, 1107 btrfs_dev_name(tgt_device)); 1108 } 1109 break; 1110 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 1111 /* 1112 * Scrub doing the replace isn't running so we need to do the 1113 * cleanup step of btrfs_dev_replace_finishing() here 1114 */ 1115 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 1116 tgt_device = dev_replace->tgtdev; 1117 src_device = dev_replace->srcdev; 1118 dev_replace->tgtdev = NULL; 1119 dev_replace->srcdev = NULL; 1120 dev_replace->replace_state = 1121 BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; 1122 dev_replace->time_stopped = ktime_get_real_seconds(); 1123 dev_replace->item_needs_writeback = 1; 1124 1125 up_write(&dev_replace->rwsem); 1126 1127 /* Scrub for replace must not be running in suspended state */ 1128 btrfs_scrub_cancel(fs_info); 1129 1130 trans = btrfs_start_transaction(root, 0); 1131 if (IS_ERR(trans)) { 1132 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 1133 return PTR_ERR(trans); 1134 } 1135 ret = btrfs_commit_transaction(trans); 1136 WARN_ON(ret); 1137 1138 btrfs_info_in_rcu(fs_info, 1139 "suspended dev_replace from %s (devid %llu) to %s canceled", 1140 btrfs_dev_name(src_device), src_device->devid, 1141 btrfs_dev_name(tgt_device)); 1142 1143 if (tgt_device) 1144 btrfs_destroy_dev_replace_tgtdev(tgt_device); 1145 break; 1146 default: 1147 up_write(&dev_replace->rwsem); 1148 result = -EINVAL; 1149 } 1150 1151 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 1152 return result; 1153 } 1154 1155 void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) 1156 { 1157 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1158 1159 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 1160 down_write(&dev_replace->rwsem); 1161 1162 switch (dev_replace->replace_state) { 1163 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 1164 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 1165 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 1166 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 1167 break; 1168 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 1169 dev_replace->replace_state = 1170 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; 1171 dev_replace->time_stopped = ktime_get_real_seconds(); 1172 dev_replace->item_needs_writeback = 1; 1173 btrfs_info(fs_info, "suspending dev_replace for unmount"); 1174 break; 1175 } 1176 1177 up_write(&dev_replace->rwsem); 1178 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 1179 } 1180 1181 /* resume dev_replace procedure that was interrupted by unmount */ 1182 int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) 1183 { 1184 struct task_struct *task; 1185 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1186 1187 down_write(&dev_replace->rwsem); 1188 1189 switch (dev_replace->replace_state) { 1190 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 1191 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 1192 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 1193 up_write(&dev_replace->rwsem); 1194 return 0; 1195 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 1196 break; 1197 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 1198 dev_replace->replace_state = 1199 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; 1200 break; 1201 } 1202 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { 1203 btrfs_info(fs_info, 1204 "cannot continue dev_replace, tgtdev is missing"); 1205 btrfs_info(fs_info, 1206 "you may cancel the operation after 'mount -o degraded'"); 1207 dev_replace->replace_state = 1208 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; 1209 up_write(&dev_replace->rwsem); 1210 return 0; 1211 } 1212 up_write(&dev_replace->rwsem); 1213 1214 /* 1215 * This could collide with a paused balance, but the exclusive op logic 1216 * should never allow both to start and pause. We don't want to allow 1217 * dev-replace to start anyway. 1218 */ 1219 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { 1220 down_write(&dev_replace->rwsem); 1221 dev_replace->replace_state = 1222 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; 1223 up_write(&dev_replace->rwsem); 1224 btrfs_info(fs_info, 1225 "cannot resume dev-replace, other exclusive operation running"); 1226 return 0; 1227 } 1228 1229 task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); 1230 return PTR_ERR_OR_ZERO(task); 1231 } 1232 1233 static int btrfs_dev_replace_kthread(void *data) 1234 { 1235 struct btrfs_fs_info *fs_info = data; 1236 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1237 u64 progress; 1238 int ret; 1239 1240 progress = btrfs_dev_replace_progress(fs_info); 1241 progress = div_u64(progress, 10); 1242 btrfs_info_in_rcu(fs_info, 1243 "continuing dev_replace from %s (devid %llu) to target %s @%u%%", 1244 btrfs_dev_name(dev_replace->srcdev), 1245 dev_replace->srcdev->devid, 1246 btrfs_dev_name(dev_replace->tgtdev), 1247 (unsigned int)progress); 1248 1249 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, 1250 dev_replace->committed_cursor_left, 1251 btrfs_device_get_total_bytes(dev_replace->srcdev), 1252 &dev_replace->scrub_progress, 0, 1); 1253 ret = btrfs_dev_replace_finishing(fs_info, ret); 1254 WARN_ON(ret && ret != -ECANCELED); 1255 1256 btrfs_exclop_finish(fs_info); 1257 return 0; 1258 } 1259 1260 int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) 1261 { 1262 if (!dev_replace->is_valid) 1263 return 0; 1264 1265 switch (dev_replace->replace_state) { 1266 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 1267 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 1268 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 1269 return 0; 1270 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 1271 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 1272 /* 1273 * return true even if tgtdev is missing (this is 1274 * something that can happen if the dev_replace 1275 * procedure is suspended by an umount and then 1276 * the tgtdev is missing (or "btrfs dev scan") was 1277 * not called and the filesystem is remounted 1278 * in degraded state. This does not stop the 1279 * dev_replace procedure. It needs to be canceled 1280 * manually if the cancellation is wanted. 1281 */ 1282 break; 1283 } 1284 return 1; 1285 } 1286 1287 void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) 1288 { 1289 percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount); 1290 cond_wake_up_nomb(&fs_info->dev_replace.replace_wait); 1291 } 1292 1293 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) 1294 { 1295 while (1) { 1296 percpu_counter_inc(&fs_info->dev_replace.bio_counter); 1297 if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING, 1298 &fs_info->fs_state))) 1299 break; 1300 1301 btrfs_bio_counter_dec(fs_info); 1302 wait_event(fs_info->dev_replace.replace_wait, 1303 !test_bit(BTRFS_FS_STATE_DEV_REPLACING, 1304 &fs_info->fs_state)); 1305 } 1306 } 1307