1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) STRATO AG 2012. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/bio.h> 8 #include <linux/slab.h> 9 #include <linux/blkdev.h> 10 #include <linux/kthread.h> 11 #include <linux/math64.h> 12 #include "misc.h" 13 #include "ctree.h" 14 #include "extent_map.h" 15 #include "disk-io.h" 16 #include "transaction.h" 17 #include "print-tree.h" 18 #include "volumes.h" 19 #include "async-thread.h" 20 #include "check-integrity.h" 21 #include "rcu-string.h" 22 #include "dev-replace.h" 23 #include "sysfs.h" 24 #include "zoned.h" 25 26 /* 27 * Device replace overview 28 * 29 * [Objective] 30 * To copy all extents (both new and on-disk) from source device to target 31 * device, while still keeping the filesystem read-write. 32 * 33 * [Method] 34 * There are two main methods involved: 35 * 36 * - Write duplication 37 * 38 * All new writes will be written to both target and source devices, so even 39 * if replace gets canceled, source device still contains up-to-date data. 40 * 41 * Location: handle_ops_on_dev_replace() from __btrfs_map_block() 42 * Start: btrfs_dev_replace_start() 43 * End: btrfs_dev_replace_finishing() 44 * Content: Latest data/metadata 45 * 46 * - Copy existing extents 47 * 48 * This happens by re-using scrub facility, as scrub also iterates through 49 * existing extents from commit root. 50 * 51 * Location: scrub_write_block_to_dev_replace() from 52 * scrub_block_complete() 53 * Content: Data/meta from commit root. 54 * 55 * Due to the content difference, we need to avoid nocow write when dev-replace 56 * is happening. This is done by marking the block group read-only and waiting 57 * for NOCOW writes. 58 * 59 * After replace is done, the finishing part is done by swapping the target and 60 * source devices. 
61 * 62 * Location: btrfs_dev_replace_update_device_in_mapping_tree() from 63 * btrfs_dev_replace_finishing() 64 */ 65 66 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, 67 int scrub_ret); 68 static int btrfs_dev_replace_kthread(void *data); 69 70 int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) 71 { 72 struct btrfs_key key; 73 struct btrfs_root *dev_root = fs_info->dev_root; 74 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 75 struct extent_buffer *eb; 76 int slot; 77 int ret = 0; 78 struct btrfs_path *path = NULL; 79 int item_size; 80 struct btrfs_dev_replace_item *ptr; 81 u64 src_devid; 82 83 path = btrfs_alloc_path(); 84 if (!path) { 85 ret = -ENOMEM; 86 goto out; 87 } 88 89 key.objectid = 0; 90 key.type = BTRFS_DEV_REPLACE_KEY; 91 key.offset = 0; 92 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); 93 if (ret) { 94 no_valid_dev_replace_entry_found: 95 /* 96 * We don't have a replace item or it's corrupted. If there is 97 * a replace target, fail the mount. 
98 */ 99 if (btrfs_find_device(fs_info->fs_devices, 100 BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { 101 btrfs_err(fs_info, 102 "found replace target device without a valid replace item"); 103 ret = -EUCLEAN; 104 goto out; 105 } 106 ret = 0; 107 dev_replace->replace_state = 108 BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; 109 dev_replace->cont_reading_from_srcdev_mode = 110 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS; 111 dev_replace->time_started = 0; 112 dev_replace->time_stopped = 0; 113 atomic64_set(&dev_replace->num_write_errors, 0); 114 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); 115 dev_replace->cursor_left = 0; 116 dev_replace->committed_cursor_left = 0; 117 dev_replace->cursor_left_last_write_of_item = 0; 118 dev_replace->cursor_right = 0; 119 dev_replace->srcdev = NULL; 120 dev_replace->tgtdev = NULL; 121 dev_replace->is_valid = 0; 122 dev_replace->item_needs_writeback = 0; 123 goto out; 124 } 125 slot = path->slots[0]; 126 eb = path->nodes[0]; 127 item_size = btrfs_item_size_nr(eb, slot); 128 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item); 129 130 if (item_size != sizeof(struct btrfs_dev_replace_item)) { 131 btrfs_warn(fs_info, 132 "dev_replace entry found has unexpected size, ignore entry"); 133 goto no_valid_dev_replace_entry_found; 134 } 135 136 src_devid = btrfs_dev_replace_src_devid(eb, ptr); 137 dev_replace->cont_reading_from_srcdev_mode = 138 btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr); 139 dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr); 140 dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr); 141 dev_replace->time_stopped = 142 btrfs_dev_replace_time_stopped(eb, ptr); 143 atomic64_set(&dev_replace->num_write_errors, 144 btrfs_dev_replace_num_write_errors(eb, ptr)); 145 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 146 btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr)); 147 dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, 
ptr); 148 dev_replace->committed_cursor_left = dev_replace->cursor_left; 149 dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left; 150 dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr); 151 dev_replace->is_valid = 1; 152 153 dev_replace->item_needs_writeback = 0; 154 switch (dev_replace->replace_state) { 155 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 156 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 157 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 158 /* 159 * We don't have an active replace item but if there is a 160 * replace target, fail the mount. 161 */ 162 if (btrfs_find_device(fs_info->fs_devices, 163 BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { 164 btrfs_err(fs_info, 165 "replace devid present without an active replace item"); 166 ret = -EUCLEAN; 167 } else { 168 dev_replace->srcdev = NULL; 169 dev_replace->tgtdev = NULL; 170 } 171 break; 172 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 173 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 174 dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, 175 src_devid, NULL, NULL); 176 dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, 177 BTRFS_DEV_REPLACE_DEVID, 178 NULL, NULL); 179 /* 180 * allow 'btrfs dev replace_cancel' if src/tgt device is 181 * missing 182 */ 183 if (!dev_replace->srcdev && 184 !btrfs_test_opt(fs_info, DEGRADED)) { 185 ret = -EIO; 186 btrfs_warn(fs_info, 187 "cannot mount because device replace operation is ongoing and"); 188 btrfs_warn(fs_info, 189 "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?", 190 src_devid); 191 } 192 if (!dev_replace->tgtdev && 193 !btrfs_test_opt(fs_info, DEGRADED)) { 194 ret = -EIO; 195 btrfs_warn(fs_info, 196 "cannot mount because device replace operation is ongoing and"); 197 btrfs_warn(fs_info, 198 "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?", 199 BTRFS_DEV_REPLACE_DEVID); 200 } 201 if (dev_replace->tgtdev) { 202 if (dev_replace->srcdev) { 203 dev_replace->tgtdev->total_bytes = 
204 dev_replace->srcdev->total_bytes; 205 dev_replace->tgtdev->disk_total_bytes = 206 dev_replace->srcdev->disk_total_bytes; 207 dev_replace->tgtdev->commit_total_bytes = 208 dev_replace->srcdev->commit_total_bytes; 209 dev_replace->tgtdev->bytes_used = 210 dev_replace->srcdev->bytes_used; 211 dev_replace->tgtdev->commit_bytes_used = 212 dev_replace->srcdev->commit_bytes_used; 213 } 214 set_bit(BTRFS_DEV_STATE_REPLACE_TGT, 215 &dev_replace->tgtdev->dev_state); 216 217 WARN_ON(fs_info->fs_devices->rw_devices == 0); 218 dev_replace->tgtdev->io_width = fs_info->sectorsize; 219 dev_replace->tgtdev->io_align = fs_info->sectorsize; 220 dev_replace->tgtdev->sector_size = fs_info->sectorsize; 221 dev_replace->tgtdev->fs_info = fs_info; 222 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 223 &dev_replace->tgtdev->dev_state); 224 } 225 break; 226 } 227 228 out: 229 btrfs_free_path(path); 230 return ret; 231 } 232 233 /* 234 * Initialize a new device for device replace target from a given source dev 235 * and path. 
236 * 237 * Return 0 and new device in @device_out, otherwise return < 0 238 */ 239 static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 240 const char *device_path, 241 struct btrfs_device *srcdev, 242 struct btrfs_device **device_out) 243 { 244 struct btrfs_device *device; 245 struct block_device *bdev; 246 struct rcu_string *name; 247 u64 devid = BTRFS_DEV_REPLACE_DEVID; 248 int ret = 0; 249 250 *device_out = NULL; 251 if (srcdev->fs_devices->seeding) { 252 btrfs_err(fs_info, "the filesystem is a seed filesystem!"); 253 return -EINVAL; 254 } 255 256 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 257 fs_info->bdev_holder); 258 if (IS_ERR(bdev)) { 259 btrfs_err(fs_info, "target device %s is invalid!", device_path); 260 return PTR_ERR(bdev); 261 } 262 263 sync_blockdev(bdev); 264 265 list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { 266 if (device->bdev == bdev) { 267 btrfs_err(fs_info, 268 "target device is in the filesystem!"); 269 ret = -EEXIST; 270 goto error; 271 } 272 } 273 274 275 if (i_size_read(bdev->bd_inode) < 276 btrfs_device_get_total_bytes(srcdev)) { 277 btrfs_err(fs_info, 278 "target device is smaller than source device!"); 279 ret = -EINVAL; 280 goto error; 281 } 282 283 284 device = btrfs_alloc_device(NULL, &devid, NULL); 285 if (IS_ERR(device)) { 286 ret = PTR_ERR(device); 287 goto error; 288 } 289 290 name = rcu_string_strdup(device_path, GFP_KERNEL); 291 if (!name) { 292 btrfs_free_device(device); 293 ret = -ENOMEM; 294 goto error; 295 } 296 rcu_assign_pointer(device->name, name); 297 298 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 299 device->generation = 0; 300 device->io_width = fs_info->sectorsize; 301 device->io_align = fs_info->sectorsize; 302 device->sector_size = fs_info->sectorsize; 303 device->total_bytes = btrfs_device_get_total_bytes(srcdev); 304 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); 305 device->bytes_used = 
btrfs_device_get_bytes_used(srcdev); 306 device->commit_total_bytes = srcdev->commit_total_bytes; 307 device->commit_bytes_used = device->bytes_used; 308 device->fs_info = fs_info; 309 device->bdev = bdev; 310 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 311 set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 312 device->mode = FMODE_EXCL; 313 device->dev_stats_valid = 1; 314 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 315 device->fs_devices = fs_info->fs_devices; 316 317 ret = btrfs_get_dev_zone_info(device); 318 if (ret) 319 goto error; 320 321 mutex_lock(&fs_info->fs_devices->device_list_mutex); 322 list_add(&device->dev_list, &fs_info->fs_devices->devices); 323 fs_info->fs_devices->num_devices++; 324 fs_info->fs_devices->open_devices++; 325 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 326 327 *device_out = device; 328 return 0; 329 330 error: 331 blkdev_put(bdev, FMODE_EXCL); 332 return ret; 333 } 334 335 /* 336 * called from commit_transaction. Writes changed device replace state to 337 * disk. 
338 */ 339 int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) 340 { 341 struct btrfs_fs_info *fs_info = trans->fs_info; 342 int ret; 343 struct btrfs_root *dev_root = fs_info->dev_root; 344 struct btrfs_path *path; 345 struct btrfs_key key; 346 struct extent_buffer *eb; 347 struct btrfs_dev_replace_item *ptr; 348 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 349 350 down_read(&dev_replace->rwsem); 351 if (!dev_replace->is_valid || 352 !dev_replace->item_needs_writeback) { 353 up_read(&dev_replace->rwsem); 354 return 0; 355 } 356 up_read(&dev_replace->rwsem); 357 358 key.objectid = 0; 359 key.type = BTRFS_DEV_REPLACE_KEY; 360 key.offset = 0; 361 362 path = btrfs_alloc_path(); 363 if (!path) { 364 ret = -ENOMEM; 365 goto out; 366 } 367 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 368 if (ret < 0) { 369 btrfs_warn(fs_info, 370 "error %d while searching for dev_replace item!", 371 ret); 372 goto out; 373 } 374 375 if (ret == 0 && 376 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 377 /* 378 * need to delete old one and insert a new one. 379 * Since no attempt is made to recover any old state, if the 380 * dev_replace state is 'running', the data on the target 381 * drive is lost. 382 * It would be possible to recover the state: just make sure 383 * that the beginning of the item is never changed and always 384 * contains all the essential information. Then read this 385 * minimal set of information and use it as a base for the 386 * new state. 
387 */ 388 ret = btrfs_del_item(trans, dev_root, path); 389 if (ret != 0) { 390 btrfs_warn(fs_info, 391 "delete too small dev_replace item failed %d!", 392 ret); 393 goto out; 394 } 395 ret = 1; 396 } 397 398 if (ret == 1) { 399 /* need to insert a new item */ 400 btrfs_release_path(path); 401 ret = btrfs_insert_empty_item(trans, dev_root, path, 402 &key, sizeof(*ptr)); 403 if (ret < 0) { 404 btrfs_warn(fs_info, 405 "insert dev_replace item failed %d!", ret); 406 goto out; 407 } 408 } 409 410 eb = path->nodes[0]; 411 ptr = btrfs_item_ptr(eb, path->slots[0], 412 struct btrfs_dev_replace_item); 413 414 down_write(&dev_replace->rwsem); 415 if (dev_replace->srcdev) 416 btrfs_set_dev_replace_src_devid(eb, ptr, 417 dev_replace->srcdev->devid); 418 else 419 btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1); 420 btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr, 421 dev_replace->cont_reading_from_srcdev_mode); 422 btrfs_set_dev_replace_replace_state(eb, ptr, 423 dev_replace->replace_state); 424 btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started); 425 btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped); 426 btrfs_set_dev_replace_num_write_errors(eb, ptr, 427 atomic64_read(&dev_replace->num_write_errors)); 428 btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr, 429 atomic64_read(&dev_replace->num_uncorrectable_read_errors)); 430 dev_replace->cursor_left_last_write_of_item = 431 dev_replace->cursor_left; 432 btrfs_set_dev_replace_cursor_left(eb, ptr, 433 dev_replace->cursor_left_last_write_of_item); 434 btrfs_set_dev_replace_cursor_right(eb, ptr, 435 dev_replace->cursor_right); 436 dev_replace->item_needs_writeback = 0; 437 up_write(&dev_replace->rwsem); 438 439 btrfs_mark_buffer_dirty(eb); 440 441 out: 442 btrfs_free_path(path); 443 444 return ret; 445 } 446 447 static char* btrfs_dev_name(struct btrfs_device *device) 448 { 449 if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 450 return "<missing 
disk>"; 451 else 452 return rcu_str_deref(device->name); 453 } 454 455 static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, 456 const char *tgtdev_name, u64 srcdevid, const char *srcdev_name, 457 int read_src) 458 { 459 struct btrfs_root *root = fs_info->dev_root; 460 struct btrfs_trans_handle *trans; 461 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 462 int ret; 463 struct btrfs_device *tgt_device = NULL; 464 struct btrfs_device *src_device = NULL; 465 466 src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, 467 srcdev_name); 468 if (IS_ERR(src_device)) 469 return PTR_ERR(src_device); 470 471 if (btrfs_pinned_by_swapfile(fs_info, src_device)) { 472 btrfs_warn_in_rcu(fs_info, 473 "cannot replace device %s (devid %llu) due to active swapfile", 474 btrfs_dev_name(src_device), src_device->devid); 475 return -ETXTBSY; 476 } 477 478 /* 479 * Here we commit the transaction to make sure commit_total_bytes 480 * of all the devices are updated. 481 */ 482 trans = btrfs_attach_transaction(root); 483 if (!IS_ERR(trans)) { 484 ret = btrfs_commit_transaction(trans); 485 if (ret) 486 return ret; 487 } else if (PTR_ERR(trans) != -ENOENT) { 488 return PTR_ERR(trans); 489 } 490 491 ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, 492 src_device, &tgt_device); 493 if (ret) 494 return ret; 495 496 down_write(&dev_replace->rwsem); 497 switch (dev_replace->replace_state) { 498 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 499 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 500 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 501 break; 502 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 503 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 504 ASSERT(0); 505 ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; 506 up_write(&dev_replace->rwsem); 507 goto leave; 508 } 509 510 dev_replace->cont_reading_from_srcdev_mode = read_src; 511 dev_replace->srcdev = src_device; 512 dev_replace->tgtdev = tgt_device; 513 514 btrfs_info_in_rcu(fs_info, 515 
"dev_replace from %s (devid %llu) to %s started", 516 btrfs_dev_name(src_device), 517 src_device->devid, 518 rcu_str_deref(tgt_device->name)); 519 520 /* 521 * from now on, the writes to the srcdev are all duplicated to 522 * go to the tgtdev as well (refer to btrfs_map_block()). 523 */ 524 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; 525 dev_replace->time_started = ktime_get_real_seconds(); 526 dev_replace->cursor_left = 0; 527 dev_replace->committed_cursor_left = 0; 528 dev_replace->cursor_left_last_write_of_item = 0; 529 dev_replace->cursor_right = 0; 530 dev_replace->is_valid = 1; 531 dev_replace->item_needs_writeback = 1; 532 atomic64_set(&dev_replace->num_write_errors, 0); 533 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); 534 up_write(&dev_replace->rwsem); 535 536 ret = btrfs_sysfs_add_device(tgt_device); 537 if (ret) 538 btrfs_err(fs_info, "kobj add dev failed %d", ret); 539 540 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); 541 542 /* Commit dev_replace state and reserve 1 item for it. 
*/ 543 trans = btrfs_start_transaction(root, 1); 544 if (IS_ERR(trans)) { 545 ret = PTR_ERR(trans); 546 down_write(&dev_replace->rwsem); 547 dev_replace->replace_state = 548 BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; 549 dev_replace->srcdev = NULL; 550 dev_replace->tgtdev = NULL; 551 up_write(&dev_replace->rwsem); 552 goto leave; 553 } 554 555 ret = btrfs_commit_transaction(trans); 556 WARN_ON(ret); 557 558 /* the disk copy procedure reuses the scrub code */ 559 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, 560 btrfs_device_get_total_bytes(src_device), 561 &dev_replace->scrub_progress, 0, 1); 562 563 ret = btrfs_dev_replace_finishing(fs_info, ret); 564 if (ret == -EINPROGRESS) 565 ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; 566 567 return ret; 568 569 leave: 570 btrfs_destroy_dev_replace_tgtdev(tgt_device); 571 return ret; 572 } 573 574 int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, 575 struct btrfs_ioctl_dev_replace_args *args) 576 { 577 int ret; 578 579 switch (args->start.cont_reading_from_srcdev_mode) { 580 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: 581 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: 582 break; 583 default: 584 return -EINVAL; 585 } 586 587 if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || 588 args->start.tgtdev_name[0] == '\0') 589 return -EINVAL; 590 591 ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name, 592 args->start.srcdevid, 593 args->start.srcdev_name, 594 args->start.cont_reading_from_srcdev_mode); 595 args->result = ret; 596 /* don't warn if EINPROGRESS, someone else might be running scrub */ 597 if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS || 598 ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR) 599 return 0; 600 601 return ret; 602 } 603 604 /* 605 * blocked until all in-flight bios operations are finished. 
*/
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
	/* Flag the replacing state first, then wait for the bio counter to sum to 0 */
	set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
	wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
		   &fs_info->dev_replace.bio_counter));
}

/*
 * we have removed target device, it is safe to allow new bios request.
 */
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
	/* Clear the replacing state and wake up waiters on replace_wait */
	clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
	wake_up(&fs_info->dev_replace.replace_wait);
}

/*
 * When finishing the device replace, before swapping the source device with the
 * target device we must update the chunk allocation state in the target device,
 * as it is empty because replace works by directly copying the chunks and not
 * through the normal chunk allocation path.
 *
 * Must be called with the chunk_mutex of the source device's fs_info held.
 *
 * Return 0 on success, otherwise the error returned by set_extent_bits().
 */
static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
					struct btrfs_device *tgtdev)
{
	struct extent_state *cached_state = NULL;
	u64 start = 0;
	u64 found_start;
	u64 found_end;
	int ret = 0;

	lockdep_assert_held(&srcdev->fs_info->chunk_mutex);

	/* Copy every CHUNK_ALLOCATED range of the source over to the target */
	while (!find_first_extent_bit(&srcdev->alloc_state, start,
				      &found_start, &found_end,
				      CHUNK_ALLOCATED, &cached_state)) {
		ret = set_extent_bits(&tgtdev->alloc_state, found_start,
				      found_end, CHUNK_ALLOCATED);
		if (ret)
			break;
		/* Continue the search right behind the range just copied */
		start = found_end + 1;
	}

	free_extent_state(cached_state);
	return ret;
}

/*
 * Walk all extent maps of the mapping tree under its write lock and make
 * every stripe that points to @srcdev point to @tgtdev instead.
 */
static void btrfs_dev_replace_update_device_in_mapping_tree(
						struct btrfs_fs_info *fs_info,
						struct btrfs_device *srcdev,
						struct btrfs_device *tgtdev)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	u64 start = 0;
	int i;

	write_lock(&em_tree->lock);
	do {
		em = lookup_extent_mapping(em_tree, start, (u64)-1);
		if (!em)
			break;
		map = em->map_lookup;
		for (i = 0; i < map->num_stripes; i++)
			if (srcdev == map->stripes[i].dev)
				map->stripes[i].dev = tgtdev;
		/* Next lookup starts at the end of this extent map */
		start = em->start + em->len;
		free_extent_map(em);
	} while (start);	/* a start of 0 here ends the walk */
	write_unlock(&em_tree->lock);
}

/*
 * Finish a device replace after the copy step returned @scrub_ret.
 *
 * If @scrub_ret is 0 the target device takes over devid, uuid and size
 * accounting of the source device, replaces it in the mapping tree and the
 * source device is removed.  Otherwise the replace is marked as canceled and
 * the target device is destroyed.
 *
 * Return 0 if the replace was finished or was not in the started state, a
 * negative errno if flushing delalloc or starting a transaction failed, and
 * otherwise the (non-zero) scrub or allocation state error.
 */
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
				       int scrub_ret)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_device *tgt_device;
	struct btrfs_device *src_device;
	struct btrfs_root *root = fs_info->tree_root;
	u8 uuid_tmp[BTRFS_UUID_SIZE];
	struct btrfs_trans_handle *trans;
	int ret = 0;

	/* don't allow cancel or unmount to disturb the finishing procedure */
	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);

	down_read(&dev_replace->rwsem);
	/* was the operation canceled, or is it finished? */
	if (dev_replace->replace_state !=
	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
		up_read(&dev_replace->rwsem);
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
		return 0;
	}

	tgt_device = dev_replace->tgtdev;
	src_device = dev_replace->srcdev;
	up_read(&dev_replace->rwsem);

	/*
	 * flush all outstanding I/O and inode extent mappings before the
	 * copy operation is declared as being finished
	 */
	ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
	if (ret) {
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
		return ret;
	}
	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);

	/* Only done for a successful copy; the failure paths below undo it */
	if (!scrub_ret)
		btrfs_reada_remove_dev(src_device);

	/*
	 * We have to use this loop approach because at this point src_device
	 * has to be available for transaction commit to complete, yet new
	 * chunks shouldn't be allocated on the device.
	 */
	while (1) {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			btrfs_reada_undo_remove_dev(src_device);
			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
			return PTR_ERR(trans);
		}
		ret = btrfs_commit_transaction(trans);
		WARN_ON(ret);

		/* Prevent write_all_supers() during the finishing procedure */
		mutex_lock(&fs_info->fs_devices->device_list_mutex);
		/* Prevent new chunks being allocated on the source device */
		mutex_lock(&fs_info->chunk_mutex);

		/*
		 * The source device is still on a post commit list: drop both
		 * mutexes and commit again.  Otherwise leave the loop with
		 * both mutexes held.
		 */
		if (!list_empty(&src_device->post_commit_list)) {
			mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			mutex_unlock(&fs_info->chunk_mutex);
		} else {
			break;
		}
	}

	/* Record the final state; it is written to disk by a later commit */
	down_write(&dev_replace->rwsem);
	dev_replace->replace_state =
		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
	dev_replace->tgtdev = NULL;
	dev_replace->srcdev = NULL;
	dev_replace->time_stopped = ktime_get_real_seconds();
	dev_replace->item_needs_writeback = 1;

	/*
	 * Update allocation state in the new device and replace the old device
	 * with the new one in the mapping tree.
	 */
	if (!scrub_ret) {
		scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
		if (scrub_ret)
			goto error;
		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
								src_device,
								tgt_device);
	} else {
		if (scrub_ret != -ECANCELED)
			btrfs_err_in_rcu(fs_info,
				 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
				 btrfs_dev_name(src_device),
				 src_device->devid,
				 rcu_str_deref(tgt_device->name), scrub_ret);
error:
		/*
		 * Shared failure path, also entered by the goto above: release
		 * all locks taken so far, destroy the target device while bios
		 * are blocked and return the error.
		 */
		up_write(&dev_replace->rwsem);
		mutex_unlock(&fs_info->chunk_mutex);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		btrfs_reada_undo_remove_dev(src_device);
		btrfs_rm_dev_replace_blocked(fs_info);
		if (tgt_device)
			btrfs_destroy_dev_replace_tgtdev(tgt_device);
		btrfs_rm_dev_replace_unblocked(fs_info);
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);

		return scrub_ret;
	}

	btrfs_info_in_rcu(fs_info,
			  "dev_replace from %s (devid %llu) to %s finished",
			  btrfs_dev_name(src_device),
			  src_device->devid,
			  rcu_str_deref(tgt_device->name));
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
	/* Swap devid and uuid: the target takes over the source's identity */
	tgt_device->devid = src_device->devid;
	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
	memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
	memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
	btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
	btrfs_device_set_disk_total_bytes(tgt_device,
					  src_device->disk_total_bytes);
	btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
	tgt_device->commit_bytes_used = src_device->bytes_used;

	btrfs_assign_next_active_device(src_device, tgt_device);

	/* The target becomes a regular read-write device for allocation */
	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
	fs_info->fs_devices->rw_devices++;

	up_write(&dev_replace->rwsem);
	btrfs_rm_dev_replace_blocked(fs_info);

	btrfs_rm_dev_replace_remove_srcdev(src_device);

	btrfs_rm_dev_replace_unblocked(fs_info);

	/*
	 * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
	 * update on-disk dev stats value during commit transaction
	 */
	atomic_inc(&tgt_device->dev_stats_ccnt);

	/*
	 * this is again a consistent state where no dev_replace procedure
	 * is running, the target device is part of the filesystem, the
	 * source device is not part of the filesystem anymore and its 1st
	 * superblock is scratched out so that it is no longer marked to
	 * belong to this filesystem.
	 */
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/* replace the sysfs entry */
	btrfs_sysfs_remove_device(src_device);
	btrfs_sysfs_update_devid(tgt_device);
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
		btrfs_scratch_superblocks(fs_info, src_device->bdev,
					  src_device->name->str);

	/* write back the superblocks */
	trans = btrfs_start_transaction(root, 0);
	if (!IS_ERR(trans))
		btrfs_commit_transaction(trans);

	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);

	btrfs_rm_dev_replace_free_srcdev(src_device);

	return 0;
}

/*
 * Read progress of device replace status according to the state and last
 * stored position.
The value format is the same as for 857 * btrfs_dev_replace::progress_1000 858 */ 859 static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info) 860 { 861 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 862 u64 ret = 0; 863 864 switch (dev_replace->replace_state) { 865 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 866 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 867 ret = 0; 868 break; 869 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 870 ret = 1000; 871 break; 872 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 873 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 874 ret = div64_u64(dev_replace->cursor_left, 875 div_u64(btrfs_device_get_total_bytes( 876 dev_replace->srcdev), 1000)); 877 break; 878 } 879 880 return ret; 881 } 882 883 void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, 884 struct btrfs_ioctl_dev_replace_args *args) 885 { 886 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 887 888 down_read(&dev_replace->rwsem); 889 /* even if !dev_replace_is_valid, the values are good enough for 890 * the replace_status ioctl */ 891 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 892 args->status.replace_state = dev_replace->replace_state; 893 args->status.time_started = dev_replace->time_started; 894 args->status.time_stopped = dev_replace->time_stopped; 895 args->status.num_write_errors = 896 atomic64_read(&dev_replace->num_write_errors); 897 args->status.num_uncorrectable_read_errors = 898 atomic64_read(&dev_replace->num_uncorrectable_read_errors); 899 args->status.progress_1000 = btrfs_dev_replace_progress(fs_info); 900 up_read(&dev_replace->rwsem); 901 } 902 903 int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) 904 { 905 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 906 struct btrfs_device *tgt_device = NULL; 907 struct btrfs_device *src_device = NULL; 908 struct btrfs_trans_handle *trans; 909 struct btrfs_root *root = fs_info->tree_root; 910 int result; 911 int ret; 912 913 if 
(sb_rdonly(fs_info->sb)) 914 return -EROFS; 915 916 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 917 down_write(&dev_replace->rwsem); 918 switch (dev_replace->replace_state) { 919 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 920 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 921 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 922 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; 923 up_write(&dev_replace->rwsem); 924 break; 925 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 926 tgt_device = dev_replace->tgtdev; 927 src_device = dev_replace->srcdev; 928 up_write(&dev_replace->rwsem); 929 ret = btrfs_scrub_cancel(fs_info); 930 if (ret < 0) { 931 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED; 932 } else { 933 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 934 /* 935 * btrfs_dev_replace_finishing() will handle the 936 * cleanup part 937 */ 938 btrfs_info_in_rcu(fs_info, 939 "dev_replace from %s (devid %llu) to %s canceled", 940 btrfs_dev_name(src_device), src_device->devid, 941 btrfs_dev_name(tgt_device)); 942 } 943 break; 944 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 945 /* 946 * Scrub doing the replace isn't running so we need to do the 947 * cleanup step of btrfs_dev_replace_finishing() here 948 */ 949 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; 950 tgt_device = dev_replace->tgtdev; 951 src_device = dev_replace->srcdev; 952 dev_replace->tgtdev = NULL; 953 dev_replace->srcdev = NULL; 954 dev_replace->replace_state = 955 BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED; 956 dev_replace->time_stopped = ktime_get_real_seconds(); 957 dev_replace->item_needs_writeback = 1; 958 959 up_write(&dev_replace->rwsem); 960 961 /* Scrub for replace must not be running in suspended state */ 962 ret = btrfs_scrub_cancel(fs_info); 963 ASSERT(ret != -ENOTCONN); 964 965 trans = btrfs_start_transaction(root, 0); 966 if (IS_ERR(trans)) { 967 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 968 return PTR_ERR(trans); 969 } 970 ret = 
btrfs_commit_transaction(trans); 971 WARN_ON(ret); 972 973 btrfs_info_in_rcu(fs_info, 974 "suspended dev_replace from %s (devid %llu) to %s canceled", 975 btrfs_dev_name(src_device), src_device->devid, 976 btrfs_dev_name(tgt_device)); 977 978 if (tgt_device) 979 btrfs_destroy_dev_replace_tgtdev(tgt_device); 980 break; 981 default: 982 up_write(&dev_replace->rwsem); 983 result = -EINVAL; 984 } 985 986 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 987 return result; 988 } 989 990 void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info) 991 { 992 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 993 994 mutex_lock(&dev_replace->lock_finishing_cancel_unmount); 995 down_write(&dev_replace->rwsem); 996 997 switch (dev_replace->replace_state) { 998 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 999 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 1000 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 1001 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 1002 break; 1003 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 1004 dev_replace->replace_state = 1005 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; 1006 dev_replace->time_stopped = ktime_get_real_seconds(); 1007 dev_replace->item_needs_writeback = 1; 1008 btrfs_info(fs_info, "suspending dev_replace for unmount"); 1009 break; 1010 } 1011 1012 up_write(&dev_replace->rwsem); 1013 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 1014 } 1015 1016 /* resume dev_replace procedure that was interrupted by unmount */ 1017 int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) 1018 { 1019 struct task_struct *task; 1020 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1021 1022 down_write(&dev_replace->rwsem); 1023 1024 switch (dev_replace->replace_state) { 1025 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 1026 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 1027 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 1028 up_write(&dev_replace->rwsem); 1029 return 0; 1030 case 
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 1031 break; 1032 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 1033 dev_replace->replace_state = 1034 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED; 1035 break; 1036 } 1037 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) { 1038 btrfs_info(fs_info, 1039 "cannot continue dev_replace, tgtdev is missing"); 1040 btrfs_info(fs_info, 1041 "you may cancel the operation after 'mount -o degraded'"); 1042 dev_replace->replace_state = 1043 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; 1044 up_write(&dev_replace->rwsem); 1045 return 0; 1046 } 1047 up_write(&dev_replace->rwsem); 1048 1049 /* 1050 * This could collide with a paused balance, but the exclusive op logic 1051 * should never allow both to start and pause. We don't want to allow 1052 * dev-replace to start anyway. 1053 */ 1054 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { 1055 down_write(&dev_replace->rwsem); 1056 dev_replace->replace_state = 1057 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; 1058 up_write(&dev_replace->rwsem); 1059 btrfs_info(fs_info, 1060 "cannot resume dev-replace, other exclusive operation running"); 1061 return 0; 1062 } 1063 1064 task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); 1065 return PTR_ERR_OR_ZERO(task); 1066 } 1067 1068 static int btrfs_dev_replace_kthread(void *data) 1069 { 1070 struct btrfs_fs_info *fs_info = data; 1071 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1072 u64 progress; 1073 int ret; 1074 1075 progress = btrfs_dev_replace_progress(fs_info); 1076 progress = div_u64(progress, 10); 1077 btrfs_info_in_rcu(fs_info, 1078 "continuing dev_replace from %s (devid %llu) to target %s @%u%%", 1079 btrfs_dev_name(dev_replace->srcdev), 1080 dev_replace->srcdev->devid, 1081 btrfs_dev_name(dev_replace->tgtdev), 1082 (unsigned int)progress); 1083 1084 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, 1085 dev_replace->committed_cursor_left, 1086 btrfs_device_get_total_bytes(dev_replace->srcdev), 
1087 &dev_replace->scrub_progress, 0, 1); 1088 ret = btrfs_dev_replace_finishing(fs_info, ret); 1089 WARN_ON(ret && ret != -ECANCELED); 1090 1091 btrfs_exclop_finish(fs_info); 1092 return 0; 1093 } 1094 1095 int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) 1096 { 1097 if (!dev_replace->is_valid) 1098 return 0; 1099 1100 switch (dev_replace->replace_state) { 1101 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: 1102 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: 1103 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: 1104 return 0; 1105 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 1106 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 1107 /* 1108 * return true even if tgtdev is missing (this is 1109 * something that can happen if the dev_replace 1110 * procedure is suspended by an umount and then 1111 * the tgtdev is missing (or "btrfs dev scan") was 1112 * not called and the filesystem is remounted 1113 * in degraded state. This does not stop the 1114 * dev_replace procedure. It needs to be canceled 1115 * manually if the cancellation is wanted. 1116 */ 1117 break; 1118 } 1119 return 1; 1120 } 1121 1122 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) 1123 { 1124 percpu_counter_inc(&fs_info->dev_replace.bio_counter); 1125 } 1126 1127 void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) 1128 { 1129 percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount); 1130 cond_wake_up_nomb(&fs_info->dev_replace.replace_wait); 1131 } 1132 1133 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) 1134 { 1135 while (1) { 1136 percpu_counter_inc(&fs_info->dev_replace.bio_counter); 1137 if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING, 1138 &fs_info->fs_state))) 1139 break; 1140 1141 btrfs_bio_counter_dec(fs_info); 1142 wait_event(fs_info->dev_replace.replace_wait, 1143 !test_bit(BTRFS_FS_STATE_DEV_REPLACING, 1144 &fs_info->fs_state)); 1145 } 1146 } 1147