1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/sched/mm.h> 8 #include <linux/slab.h> 9 #include <linux/ratelimit.h> 10 #include <linux/kthread.h> 11 #include <linux/semaphore.h> 12 #include <linux/uuid.h> 13 #include <linux/list_sort.h> 14 #include <linux/namei.h> 15 #include "misc.h" 16 #include "ctree.h" 17 #include "extent_map.h" 18 #include "disk-io.h" 19 #include "transaction.h" 20 #include "print-tree.h" 21 #include "volumes.h" 22 #include "raid56.h" 23 #include "rcu-string.h" 24 #include "dev-replace.h" 25 #include "sysfs.h" 26 #include "tree-checker.h" 27 #include "space-info.h" 28 #include "block-group.h" 29 #include "discard.h" 30 #include "zoned.h" 31 #include "fs.h" 32 #include "accessors.h" 33 #include "uuid-tree.h" 34 #include "ioctl.h" 35 #include "relocation.h" 36 #include "scrub.h" 37 #include "super.h" 38 39 #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ 40 BTRFS_BLOCK_GROUP_RAID10 | \ 41 BTRFS_BLOCK_GROUP_RAID56_MASK) 42 43 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 44 [BTRFS_RAID_RAID10] = { 45 .sub_stripes = 2, 46 .dev_stripes = 1, 47 .devs_max = 0, /* 0 == as many as possible */ 48 .devs_min = 2, 49 .tolerated_failures = 1, 50 .devs_increment = 2, 51 .ncopies = 2, 52 .nparity = 0, 53 .raid_name = "raid10", 54 .bg_flag = BTRFS_BLOCK_GROUP_RAID10, 55 .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, 56 }, 57 [BTRFS_RAID_RAID1] = { 58 .sub_stripes = 1, 59 .dev_stripes = 1, 60 .devs_max = 2, 61 .devs_min = 2, 62 .tolerated_failures = 1, 63 .devs_increment = 2, 64 .ncopies = 2, 65 .nparity = 0, 66 .raid_name = "raid1", 67 .bg_flag = BTRFS_BLOCK_GROUP_RAID1, 68 .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, 69 }, 70 [BTRFS_RAID_RAID1C3] = { 71 .sub_stripes = 1, 72 .dev_stripes = 1, 73 .devs_max = 3, 74 .devs_min = 3, 75 .tolerated_failures = 2, 76 .devs_increment = 3, 77 .ncopies = 3, 78 .nparity = 0, 79 .raid_name = "raid1c3", 80 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, 81 .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, 82 }, 83 [BTRFS_RAID_RAID1C4] = { 84 .sub_stripes = 1, 85 .dev_stripes = 1, 86 .devs_max = 4, 87 .devs_min = 4, 88 .tolerated_failures = 3, 89 .devs_increment = 4, 90 .ncopies = 4, 91 .nparity = 0, 92 .raid_name = "raid1c4", 93 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, 94 .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, 95 }, 96 [BTRFS_RAID_DUP] = { 97 .sub_stripes = 1, 98 .dev_stripes = 2, 99 .devs_max = 1, 100 .devs_min = 1, 101 .tolerated_failures = 0, 102 .devs_increment = 1, 103 .ncopies = 2, 104 .nparity = 0, 105 .raid_name = "dup", 106 .bg_flag = BTRFS_BLOCK_GROUP_DUP, 107 .mindev_error = 0, 108 }, 109 [BTRFS_RAID_RAID0] = { 110 .sub_stripes = 1, 111 .dev_stripes = 1, 112 .devs_max = 0, 113 .devs_min = 1, 114 .tolerated_failures = 0, 115 .devs_increment = 1, 116 .ncopies = 1, 117 .nparity = 0, 118 .raid_name = "raid0", 119 .bg_flag = BTRFS_BLOCK_GROUP_RAID0, 120 .mindev_error = 0, 121 }, 122 [BTRFS_RAID_SINGLE] = { 123 .sub_stripes = 1, 124 .dev_stripes = 1, 125 .devs_max = 1, 126 .devs_min = 1, 127 .tolerated_failures = 0, 128 .devs_increment = 1, 129 .ncopies = 1, 130 .nparity = 0, 131 .raid_name = "single", 132 .bg_flag = 0, 133 .mindev_error = 0, 134 }, 135 [BTRFS_RAID_RAID5] = { 136 .sub_stripes = 1, 137 .dev_stripes = 1, 138 .devs_max = 0, 139 .devs_min = 2, 140 .tolerated_failures = 1, 141 .devs_increment = 1, 142 .ncopies = 1, 143 .nparity = 1, 144 .raid_name = "raid5", 145 .bg_flag = 
BTRFS_BLOCK_GROUP_RAID5, 146 .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, 147 }, 148 [BTRFS_RAID_RAID6] = { 149 .sub_stripes = 1, 150 .dev_stripes = 1, 151 .devs_max = 0, 152 .devs_min = 3, 153 .tolerated_failures = 2, 154 .devs_increment = 1, 155 .ncopies = 1, 156 .nparity = 2, 157 .raid_name = "raid6", 158 .bg_flag = BTRFS_BLOCK_GROUP_RAID6, 159 .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, 160 }, 161 }; 162 163 /* 164 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which 165 * can be used as index to access btrfs_raid_array[]. 166 */ 167 enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) 168 { 169 const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); 170 171 if (!profile) 172 return BTRFS_RAID_SINGLE; 173 174 return BTRFS_BG_FLAG_TO_INDEX(profile); 175 } 176 177 const char *btrfs_bg_type_to_raid_name(u64 flags) 178 { 179 const int index = btrfs_bg_flags_to_raid_index(flags); 180 181 if (index >= BTRFS_NR_RAID_TYPES) 182 return NULL; 183 184 return btrfs_raid_array[index].raid_name; 185 } 186 187 int btrfs_nr_parity_stripes(u64 type) 188 { 189 enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); 190 191 return btrfs_raid_array[index].nparity; 192 } 193 194 /* 195 * Fill @buf with textual description of @bg_flags, no more than @size_buf 196 * bytes including terminating null byte. 197 */ 198 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) 199 { 200 int i; 201 int ret; 202 char *bp = buf; 203 u64 flags = bg_flags; 204 u32 size_bp = size_buf; 205 206 if (!flags) { 207 strcpy(bp, "NONE"); 208 return; 209 } 210 211 #define DESCRIBE_FLAG(flag, desc) \ 212 do { \ 213 if (flags & (flag)) { \ 214 ret = snprintf(bp, size_bp, "%s|", (desc)); \ 215 if (ret < 0 || ret >= size_bp) \ 216 goto out_overflow; \ 217 size_bp -= ret; \ 218 bp += ret; \ 219 flags &= ~(flag); \ 220 } \ 221 } while (0) 222 223 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); 224 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); 225 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); 226 227 DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); 228 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 229 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, 230 btrfs_raid_array[i].raid_name); 231 #undef DESCRIBE_FLAG 232 233 if (flags) { 234 ret = snprintf(bp, size_bp, "0x%llx|", flags); 235 size_bp -= ret; 236 } 237 238 if (size_bp < size_buf) 239 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ 240 241 /* 242 * The text is trimmed, it's up to the caller to provide sufficiently 243 * large buffer 244 */ 245 out_overflow:; 246 } 247 248 static int init_first_rw_device(struct btrfs_trans_handle *trans); 249 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); 250 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 251 252 /* 253 * Device locking 254 * ============== 255 * 256 * There are several mutexes that protect manipulation of devices and low-level 257 * structures like chunks but not block groups, extents or files 258 * 259 * uuid_mutex (global lock) 260 * ------------------------ 261 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from 262 * the SCAN_DEV ioctl registration or from mount either implicitly (the first 263 * device) or requested by the device= mount option 264 * 265 * the mutex can be very coarse and can cover long-running operations 266 * 267 * protects: updates to fs_devices counters like missing devices, rw devices, 268 * seeding, structure cloning, 
opening/closing devices at mount/umount time 269 * 270 * global::fs_devs - add, remove, updates to the global list 271 * 272 * does not protect: manipulation of the fs_devices::devices list in general 273 * but in mount context it could be used to exclude list modifications by eg. 274 * scan ioctl 275 * 276 * btrfs_device::name - renames (write side), read is RCU 277 * 278 * fs_devices::device_list_mutex (per-fs, with RCU) 279 * ------------------------------------------------ 280 * protects updates to fs_devices::devices, ie. adding and deleting 281 * 282 * simple list traversal with read-only actions can be done with RCU protection 283 * 284 * may be used to exclude some operations from running concurrently without any 285 * modifications to the list (see write_all_supers) 286 * 287 * Is not required at mount and close times, because our device list is 288 * protected by the uuid_mutex at that point. 289 * 290 * balance_mutex 291 * ------------- 292 * protects balance structures (status, state) and context accessed from 293 * several places (internally, ioctl) 294 * 295 * chunk_mutex 296 * ----------- 297 * protects chunks, adding or removing during allocation, trim or when a new 298 * device is added/removed. Additionally it also protects post_commit_list of 299 * individual devices, since they can be added to the transaction's 300 * post_commit_list only with chunk_mutex held. 301 * 302 * cleaner_mutex 303 * ------------- 304 * a big lock that is held by the cleaner thread and prevents running subvolume 305 * cleaning together with relocation or delayed iputs 306 * 307 * 308 * Lock nesting 309 * ============ 310 * 311 * uuid_mutex 312 * device_list_mutex 313 * chunk_mutex 314 * balance_mutex 315 * 316 * 317 * Exclusive operations 318 * ==================== 319 * 320 * Maintains the exclusivity of the following operations that apply to the 321 * whole filesystem and cannot run in parallel. 322 * 323 * - Balance (*) 324 * - Device add 325 * - Device remove 326 * - Device replace (*) 327 * - Resize 328 * 329 * The device operations (as above) can be in one of the following states: 330 * 331 * - Running state 332 * - Paused state 333 * - Completed state 334 * 335 * Only device operations marked with (*) can go into the Paused state for the 336 * following reasons: 337 * 338 * - ioctl (only Balance can be Paused through ioctl) 339 * - filesystem remounted as read-only 340 * - filesystem unmounted and mounted as read-only 341 * - system power-cycle and filesystem mounted as read-only 342 * - filesystem or device errors leading to forced read-only 343 * 344 * The status of exclusive operation is set and cleared atomically. 345 * During the course of Paused state, fs_info::exclusive_operation remains set. 346 * A device operation in Paused or Running state can be canceled or resumed 347 * either by ioctl (Balance only) or when remounted as read-write. 348 * The exclusive status is cleared when the device operation is canceled or 349 * completed. 350 */ 351 352 DEFINE_MUTEX(uuid_mutex); 353 static LIST_HEAD(fs_uuids); 354 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) 355 { 356 return &fs_uuids; 357 } 358 359 /* 360 * alloc_fs_devices - allocate struct btrfs_fs_devices 361 * @fsid: if not NULL, copy the UUID to fs_devices::fsid 362 * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid 363 * 364 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). 
365 * The returned struct is not linked onto any lists and can be destroyed with 366 * kfree() right away. 367 */ 368 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, 369 const u8 *metadata_fsid) 370 { 371 struct btrfs_fs_devices *fs_devs; 372 373 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); 374 if (!fs_devs) 375 return ERR_PTR(-ENOMEM); 376 377 mutex_init(&fs_devs->device_list_mutex); 378 379 INIT_LIST_HEAD(&fs_devs->devices); 380 INIT_LIST_HEAD(&fs_devs->alloc_list); 381 INIT_LIST_HEAD(&fs_devs->fs_list); 382 INIT_LIST_HEAD(&fs_devs->seed_list); 383 if (fsid) 384 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); 385 386 if (metadata_fsid) 387 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); 388 else if (fsid) 389 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); 390 391 return fs_devs; 392 } 393 394 void btrfs_free_device(struct btrfs_device *device) 395 { 396 WARN_ON(!list_empty(&device->post_commit_list)); 397 rcu_string_free(device->name); 398 extent_io_tree_release(&device->alloc_state); 399 btrfs_destroy_dev_zone_info(device); 400 kfree(device); 401 } 402 403 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 404 { 405 struct btrfs_device *device; 406 407 WARN_ON(fs_devices->opened); 408 while (!list_empty(&fs_devices->devices)) { 409 device = list_entry(fs_devices->devices.next, 410 struct btrfs_device, dev_list); 411 list_del(&device->dev_list); 412 btrfs_free_device(device); 413 } 414 kfree(fs_devices); 415 } 416 417 void __exit btrfs_cleanup_fs_uuids(void) 418 { 419 struct btrfs_fs_devices *fs_devices; 420 421 while (!list_empty(&fs_uuids)) { 422 fs_devices = list_entry(fs_uuids.next, 423 struct btrfs_fs_devices, fs_list); 424 list_del(&fs_devices->fs_list); 425 free_fs_devices(fs_devices); 426 } 427 } 428 429 static noinline struct btrfs_fs_devices *find_fsid( 430 const u8 *fsid, const u8 *metadata_fsid) 431 { 432 struct btrfs_fs_devices *fs_devices; 433 434 ASSERT(fsid); 435 436 /* Handle non-split brain cases */ 437 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 438 if (metadata_fsid) { 439 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 440 && memcmp(metadata_fsid, fs_devices->metadata_uuid, 441 BTRFS_FSID_SIZE) == 0) 442 return fs_devices; 443 } else { 444 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 445 return fs_devices; 446 } 447 } 448 return NULL; 449 } 450 451 static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( 452 struct btrfs_super_block *disk_super) 453 { 454 455 struct btrfs_fs_devices *fs_devices; 456 457 /* 458 * Handle scanned device having completed its fsid change but 459 * belonging to a fs_devices that was created by first scanning 460 * a device which didn't have its fsid/metadata_uuid changed 461 * at all and the CHANGING_FSID_V2 flag set. 462 */ 463 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 464 if (fs_devices->fsid_change && 465 memcmp(disk_super->metadata_uuid, fs_devices->fsid, 466 BTRFS_FSID_SIZE) == 0 && 467 memcmp(fs_devices->fsid, fs_devices->metadata_uuid, 468 BTRFS_FSID_SIZE) == 0) { 469 return fs_devices; 470 } 471 } 472 /* 473 * Handle scanned device having completed its fsid change but 474 * belonging to a fs_devices that was created by a device that 475 * has an outdated pair of fsid/metadata_uuid and 476 * CHANGING_FSID_V2 flag set. 
477 */ 478 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 479 if (fs_devices->fsid_change && 480 memcmp(fs_devices->metadata_uuid, 481 fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && 482 memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid, 483 BTRFS_FSID_SIZE) == 0) { 484 return fs_devices; 485 } 486 } 487 488 return find_fsid(disk_super->fsid, disk_super->metadata_uuid); 489 } 490 491 492 static int 493 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, 494 int flush, struct block_device **bdev, 495 struct btrfs_super_block **disk_super) 496 { 497 int ret; 498 499 *bdev = blkdev_get_by_path(device_path, flags, holder); 500 501 if (IS_ERR(*bdev)) { 502 ret = PTR_ERR(*bdev); 503 goto error; 504 } 505 506 if (flush) 507 sync_blockdev(*bdev); 508 ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); 509 if (ret) { 510 blkdev_put(*bdev, flags); 511 goto error; 512 } 513 invalidate_bdev(*bdev); 514 *disk_super = btrfs_read_dev_super(*bdev); 515 if (IS_ERR(*disk_super)) { 516 ret = PTR_ERR(*disk_super); 517 blkdev_put(*bdev, flags); 518 goto error; 519 } 520 521 return 0; 522 523 error: 524 *bdev = NULL; 525 return ret; 526 } 527 528 /* 529 * Search and remove all stale devices (which are not mounted). When both 530 * inputs are NULL, it will search and release all stale devices. 531 * 532 * @devt: Optional. When provided will it release all unmounted devices 533 * matching this devt only. 534 * @skip_device: Optional. Will skip this device when searching for the stale 535 * devices. 536 * 537 * Return: 0 for success or if @devt is 0. 538 * -EBUSY if @devt is a mounted device. 539 * -ENOENT if @devt does not match any device in the list. 540 */ 541 static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device) 542 { 543 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; 544 struct btrfs_device *device, *tmp_device; 545 int ret = 0; 546 547 lockdep_assert_held(&uuid_mutex); 548 549 if (devt) 550 ret = -ENOENT; 551 552 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { 553 554 mutex_lock(&fs_devices->device_list_mutex); 555 list_for_each_entry_safe(device, tmp_device, 556 &fs_devices->devices, dev_list) { 557 if (skip_device && skip_device == device) 558 continue; 559 if (devt && devt != device->devt) 560 continue; 561 if (fs_devices->opened) { 562 /* for an already deleted device return 0 */ 563 if (devt && ret != 0) 564 ret = -EBUSY; 565 break; 566 } 567 568 /* delete the stale device */ 569 fs_devices->num_devices--; 570 list_del(&device->dev_list); 571 btrfs_free_device(device); 572 573 ret = 0; 574 } 575 mutex_unlock(&fs_devices->device_list_mutex); 576 577 if (fs_devices->num_devices == 0) { 578 btrfs_sysfs_remove_fsid(fs_devices); 579 list_del(&fs_devices->fs_list); 580 free_fs_devices(fs_devices); 581 } 582 } 583 584 return ret; 585 } 586 587 /* 588 * This is only used on mount, and we are protected from competing things 589 * messing with our fs_devices by the uuid_mutex, thus we do not need the 590 * fs_devices->device_list_mutex here. 
591 */ 592 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, 593 struct btrfs_device *device, fmode_t flags, 594 void *holder) 595 { 596 struct block_device *bdev; 597 struct btrfs_super_block *disk_super; 598 u64 devid; 599 int ret; 600 601 if (device->bdev) 602 return -EINVAL; 603 if (!device->name) 604 return -EINVAL; 605 606 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 607 &bdev, &disk_super); 608 if (ret) 609 return ret; 610 611 devid = btrfs_stack_device_id(&disk_super->dev_item); 612 if (devid != device->devid) 613 goto error_free_page; 614 615 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) 616 goto error_free_page; 617 618 device->generation = btrfs_super_generation(disk_super); 619 620 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 621 if (btrfs_super_incompat_flags(disk_super) & 622 BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { 623 pr_err( 624 "BTRFS: Invalid seeding and uuid-changed device detected\n"); 625 goto error_free_page; 626 } 627 628 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 629 fs_devices->seeding = true; 630 } else { 631 if (bdev_read_only(bdev)) 632 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 633 else 634 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 635 } 636 637 if (!bdev_nonrot(bdev)) 638 fs_devices->rotating = true; 639 640 if (bdev_max_discard_sectors(bdev)) 641 fs_devices->discardable = true; 642 643 device->bdev = bdev; 644 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 645 device->mode = flags; 646 647 fs_devices->open_devices++; 648 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 649 device->devid != BTRFS_DEV_REPLACE_DEVID) { 650 fs_devices->rw_devices++; 651 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); 652 } 653 btrfs_release_disk_super(disk_super); 654 655 return 0; 656 657 error_free_page: 658 btrfs_release_disk_super(disk_super); 659 blkdev_put(bdev, flags); 660 661 return -EINVAL; 662 } 663 664 /* 665 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices 666 * being created with a disk that has already completed its fsid change. Such 667 * disk can belong to an fs which has its FSID changed or to one which doesn't. 668 * Handle both cases here. 669 */ 670 static struct btrfs_fs_devices *find_fsid_inprogress( 671 struct btrfs_super_block *disk_super) 672 { 673 struct btrfs_fs_devices *fs_devices; 674 675 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 676 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, 677 BTRFS_FSID_SIZE) != 0 && 678 memcmp(fs_devices->metadata_uuid, disk_super->fsid, 679 BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { 680 return fs_devices; 681 } 682 } 683 684 return find_fsid(disk_super->fsid, NULL); 685 } 686 687 688 static struct btrfs_fs_devices *find_fsid_changed( 689 struct btrfs_super_block *disk_super) 690 { 691 struct btrfs_fs_devices *fs_devices; 692 693 /* 694 * Handles the case where scanned device is part of an fs that had 695 * multiple successful changes of FSID but currently device didn't 696 * observe it. Meaning our fsid will be different than theirs. We need 697 * to handle two subcases : 698 * 1 - The fs still continues to have different METADATA/FSID uuids. 699 * 2 - The fs is switched back to its original FSID (METADATA/FSID 700 * are equal). 
701 */ 702 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 703 /* Changed UUIDs */ 704 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, 705 BTRFS_FSID_SIZE) != 0 && 706 memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, 707 BTRFS_FSID_SIZE) == 0 && 708 memcmp(fs_devices->fsid, disk_super->fsid, 709 BTRFS_FSID_SIZE) != 0) 710 return fs_devices; 711 712 /* Unchanged UUIDs */ 713 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, 714 BTRFS_FSID_SIZE) == 0 && 715 memcmp(fs_devices->fsid, disk_super->metadata_uuid, 716 BTRFS_FSID_SIZE) == 0) 717 return fs_devices; 718 } 719 720 return NULL; 721 } 722 723 static struct btrfs_fs_devices *find_fsid_reverted_metadata( 724 struct btrfs_super_block *disk_super) 725 { 726 struct btrfs_fs_devices *fs_devices; 727 728 /* 729 * Handle the case where the scanned device is part of an fs whose last 730 * metadata UUID change reverted it to the original FSID. At the same 731 * time fs_devices was first created by another constituent device 732 * which didn't fully observe the operation. This results in an 733 * btrfs_fs_devices created with metadata/fsid different AND 734 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the 735 * fs_devices equal to the FSID of the disk. 736 */ 737 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 738 if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, 739 BTRFS_FSID_SIZE) != 0 && 740 memcmp(fs_devices->metadata_uuid, disk_super->fsid, 741 BTRFS_FSID_SIZE) == 0 && 742 fs_devices->fsid_change) 743 return fs_devices; 744 } 745 746 return NULL; 747 } 748 /* 749 * Add new device to list of registered devices 750 * 751 * Returns: 752 * device pointer which was just added or updated when successful 753 * error pointer when failed 754 */ 755 static noinline struct btrfs_device *device_list_add(const char *path, 756 struct btrfs_super_block *disk_super, 757 bool *new_device_added) 758 { 759 struct btrfs_device *device; 760 struct btrfs_fs_devices *fs_devices = NULL; 761 struct rcu_string *name; 762 u64 found_transid = btrfs_super_generation(disk_super); 763 u64 devid = btrfs_stack_device_id(&disk_super->dev_item); 764 dev_t path_devt; 765 int error; 766 bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & 767 BTRFS_FEATURE_INCOMPAT_METADATA_UUID); 768 bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & 769 BTRFS_SUPER_FLAG_CHANGING_FSID_V2); 770 771 error = lookup_bdev(path, &path_devt); 772 if (error) { 773 btrfs_err(NULL, "failed to lookup block device for path %s: %d", 774 path, error); 775 return ERR_PTR(error); 776 } 777 778 if (fsid_change_in_progress) { 779 if (!has_metadata_uuid) 780 fs_devices = find_fsid_inprogress(disk_super); 781 else 782 fs_devices = find_fsid_changed(disk_super); 783 } else if (has_metadata_uuid) { 784 fs_devices = find_fsid_with_metadata_uuid(disk_super); 785 } else { 786 fs_devices = find_fsid_reverted_metadata(disk_super); 787 if (!fs_devices) 788 fs_devices = find_fsid(disk_super->fsid, NULL); 789 } 790 791 792 if (!fs_devices) { 793 if (has_metadata_uuid) 794 fs_devices = alloc_fs_devices(disk_super->fsid, 795 disk_super->metadata_uuid); 796 else 797 fs_devices = alloc_fs_devices(disk_super->fsid, NULL); 798 799 if (IS_ERR(fs_devices)) 800 return ERR_CAST(fs_devices); 801 802 fs_devices->fsid_change = fsid_change_in_progress; 803 804 mutex_lock(&fs_devices->device_list_mutex); 805 list_add(&fs_devices->fs_list, &fs_uuids); 806 807 device = NULL; 808 } else { 809 struct btrfs_dev_lookup_args args = { 810 .devid = devid, 811 
.uuid = disk_super->dev_item.uuid, 812 }; 813 814 mutex_lock(&fs_devices->device_list_mutex); 815 device = btrfs_find_device(fs_devices, &args); 816 817 /* 818 * If this disk has been pulled into an fs devices created by 819 * a device which had the CHANGING_FSID_V2 flag then replace the 820 * metadata_uuid/fsid values of the fs_devices. 821 */ 822 if (fs_devices->fsid_change && 823 found_transid > fs_devices->latest_generation) { 824 memcpy(fs_devices->fsid, disk_super->fsid, 825 BTRFS_FSID_SIZE); 826 827 if (has_metadata_uuid) 828 memcpy(fs_devices->metadata_uuid, 829 disk_super->metadata_uuid, 830 BTRFS_FSID_SIZE); 831 else 832 memcpy(fs_devices->metadata_uuid, 833 disk_super->fsid, BTRFS_FSID_SIZE); 834 835 fs_devices->fsid_change = false; 836 } 837 } 838 839 if (!device) { 840 unsigned int nofs_flag; 841 842 if (fs_devices->opened) { 843 btrfs_err(NULL, 844 "device %s belongs to fsid %pU, and the fs is already mounted", 845 path, fs_devices->fsid); 846 mutex_unlock(&fs_devices->device_list_mutex); 847 return ERR_PTR(-EBUSY); 848 } 849 850 nofs_flag = memalloc_nofs_save(); 851 device = btrfs_alloc_device(NULL, &devid, 852 disk_super->dev_item.uuid, path); 853 memalloc_nofs_restore(nofs_flag); 854 if (IS_ERR(device)) { 855 mutex_unlock(&fs_devices->device_list_mutex); 856 /* we can safely leave the fs_devices entry around */ 857 return device; 858 } 859 860 device->devt = path_devt; 861 862 list_add_rcu(&device->dev_list, &fs_devices->devices); 863 fs_devices->num_devices++; 864 865 device->fs_devices = fs_devices; 866 *new_device_added = true; 867 868 if (disk_super->label[0]) 869 pr_info( 870 "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", 871 disk_super->label, devid, found_transid, path, 872 current->comm, task_pid_nr(current)); 873 else 874 pr_info( 875 "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", 876 disk_super->fsid, devid, found_transid, path, 877 current->comm, task_pid_nr(current)); 878 879 } else if (!device->name || strcmp(device->name->str, path)) { 880 /* 881 * When FS is already mounted. 882 * 1. If you are here and if the device->name is NULL that 883 * means this device was missing at time of FS mount. 884 * 2. If you are here and if the device->name is different 885 * from 'path' that means either 886 * a. The same device disappeared and reappeared with 887 * different name. or 888 * b. The missing-disk-which-was-replaced, has 889 * reappeared now. 890 * 891 * We must allow 1 and 2a above. But 2b would be a spurious 892 * and unintentional. 893 * 894 * Further in case of 1 and 2a above, the disk at 'path' 895 * would have missed some transaction when it was away and 896 * in case of 2a the stale bdev has to be updated as well. 897 * 2b must not be allowed at all time. 898 */ 899 900 /* 901 * For now, we do allow update to btrfs_fs_device through the 902 * btrfs dev scan cli after FS has been mounted. We're still 903 * tracking a problem where systems fail mount by subvolume id 904 * when we reject replacement on a mounted FS. 905 */ 906 if (!fs_devices->opened && found_transid < device->generation) { 907 /* 908 * That is if the FS is _not_ mounted and if you 909 * are here, that means there is more than one 910 * disk with same uuid and devid.We keep the one 911 * with larger generation number or the last-in if 912 * generation are equal. 
913 */ 914 mutex_unlock(&fs_devices->device_list_mutex); 915 btrfs_err(NULL, 916 "device %s already registered with a higher generation, found %llu expect %llu", 917 path, found_transid, device->generation); 918 return ERR_PTR(-EEXIST); 919 } 920 921 /* 922 * We are going to replace the device path for a given devid, 923 * make sure it's the same device if the device is mounted 924 * 925 * NOTE: the device->fs_info may not be reliable here so pass 926 * in a NULL to message helpers instead. This avoids a possible 927 * use-after-free when the fs_info and fs_info->sb are already 928 * torn down. 929 */ 930 if (device->bdev) { 931 if (device->devt != path_devt) { 932 mutex_unlock(&fs_devices->device_list_mutex); 933 btrfs_warn_in_rcu(NULL, 934 "duplicate device %s devid %llu generation %llu scanned by %s (%d)", 935 path, devid, found_transid, 936 current->comm, 937 task_pid_nr(current)); 938 return ERR_PTR(-EEXIST); 939 } 940 btrfs_info_in_rcu(NULL, 941 "devid %llu device path %s changed to %s scanned by %s (%d)", 942 devid, btrfs_dev_name(device), 943 path, current->comm, 944 task_pid_nr(current)); 945 } 946 947 name = rcu_string_strdup(path, GFP_NOFS); 948 if (!name) { 949 mutex_unlock(&fs_devices->device_list_mutex); 950 return ERR_PTR(-ENOMEM); 951 } 952 rcu_string_free(device->name); 953 rcu_assign_pointer(device->name, name); 954 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 955 fs_devices->missing_devices--; 956 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 957 } 958 device->devt = path_devt; 959 } 960 961 /* 962 * Unmount does not free the btrfs_device struct but would zero 963 * generation along with most of the other members. So just update 964 * it back. We need it to pick the disk with largest generation 965 * (as above). 966 */ 967 if (!fs_devices->opened) { 968 device->generation = found_transid; 969 fs_devices->latest_generation = max_t(u64, found_transid, 970 fs_devices->latest_generation); 971 } 972 973 fs_devices->total_devices = btrfs_super_num_devices(disk_super); 974 975 mutex_unlock(&fs_devices->device_list_mutex); 976 return device; 977 } 978 979 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 980 { 981 struct btrfs_fs_devices *fs_devices; 982 struct btrfs_device *device; 983 struct btrfs_device *orig_dev; 984 int ret = 0; 985 986 lockdep_assert_held(&uuid_mutex); 987 988 fs_devices = alloc_fs_devices(orig->fsid, NULL); 989 if (IS_ERR(fs_devices)) 990 return fs_devices; 991 992 fs_devices->total_devices = orig->total_devices; 993 994 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 995 const char *dev_path = NULL; 996 997 /* 998 * This is ok to do without RCU read locked because we hold the 999 * uuid mutex so nothing we touch in here is going to disappear. 
1000 */ 1001 if (orig_dev->name) 1002 dev_path = orig_dev->name->str; 1003 1004 device = btrfs_alloc_device(NULL, &orig_dev->devid, 1005 orig_dev->uuid, dev_path); 1006 if (IS_ERR(device)) { 1007 ret = PTR_ERR(device); 1008 goto error; 1009 } 1010 1011 if (orig_dev->zone_info) { 1012 struct btrfs_zoned_device_info *zone_info; 1013 1014 zone_info = btrfs_clone_dev_zone_info(orig_dev); 1015 if (!zone_info) { 1016 btrfs_free_device(device); 1017 ret = -ENOMEM; 1018 goto error; 1019 } 1020 device->zone_info = zone_info; 1021 } 1022 1023 list_add(&device->dev_list, &fs_devices->devices); 1024 device->fs_devices = fs_devices; 1025 fs_devices->num_devices++; 1026 } 1027 return fs_devices; 1028 error: 1029 free_fs_devices(fs_devices); 1030 return ERR_PTR(ret); 1031 } 1032 1033 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, 1034 struct btrfs_device **latest_dev) 1035 { 1036 struct btrfs_device *device, *next; 1037 1038 /* This is the initialized path, it is safe to release the devices. */ 1039 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 1040 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { 1041 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 1042 &device->dev_state) && 1043 !test_bit(BTRFS_DEV_STATE_MISSING, 1044 &device->dev_state) && 1045 (!*latest_dev || 1046 device->generation > (*latest_dev)->generation)) { 1047 *latest_dev = device; 1048 } 1049 continue; 1050 } 1051 1052 /* 1053 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, 1054 * in btrfs_init_dev_replace() so just continue. 1055 */ 1056 if (device->devid == BTRFS_DEV_REPLACE_DEVID) 1057 continue; 1058 1059 if (device->bdev) { 1060 blkdev_put(device->bdev, device->mode); 1061 device->bdev = NULL; 1062 fs_devices->open_devices--; 1063 } 1064 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1065 list_del_init(&device->dev_alloc_list); 1066 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 1067 fs_devices->rw_devices--; 1068 } 1069 list_del_init(&device->dev_list); 1070 fs_devices->num_devices--; 1071 btrfs_free_device(device); 1072 } 1073 1074 } 1075 1076 /* 1077 * After we have read the system tree and know devids belonging to this 1078 * filesystem, remove the device which does not belong there. 
1079 */ 1080 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) 1081 { 1082 struct btrfs_device *latest_dev = NULL; 1083 struct btrfs_fs_devices *seed_dev; 1084 1085 mutex_lock(&uuid_mutex); 1086 __btrfs_free_extra_devids(fs_devices, &latest_dev); 1087 1088 list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) 1089 __btrfs_free_extra_devids(seed_dev, &latest_dev); 1090 1091 fs_devices->latest_dev = latest_dev; 1092 1093 mutex_unlock(&uuid_mutex); 1094 } 1095 1096 static void btrfs_close_bdev(struct btrfs_device *device) 1097 { 1098 if (!device->bdev) 1099 return; 1100 1101 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1102 sync_blockdev(device->bdev); 1103 invalidate_bdev(device->bdev); 1104 } 1105 1106 blkdev_put(device->bdev, device->mode); 1107 } 1108 1109 static void btrfs_close_one_device(struct btrfs_device *device) 1110 { 1111 struct btrfs_fs_devices *fs_devices = device->fs_devices; 1112 1113 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 1114 device->devid != BTRFS_DEV_REPLACE_DEVID) { 1115 list_del_init(&device->dev_alloc_list); 1116 fs_devices->rw_devices--; 1117 } 1118 1119 if (device->devid == BTRFS_DEV_REPLACE_DEVID) 1120 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 1121 1122 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 1123 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 1124 fs_devices->missing_devices--; 1125 } 1126 1127 btrfs_close_bdev(device); 1128 if (device->bdev) { 1129 fs_devices->open_devices--; 1130 device->bdev = NULL; 1131 } 1132 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 1133 btrfs_destroy_dev_zone_info(device); 1134 1135 device->fs_info = NULL; 1136 atomic_set(&device->dev_stats_ccnt, 0); 1137 extent_io_tree_release(&device->alloc_state); 1138 1139 /* 1140 * Reset the flush error record. We might have a transient flush error 1141 * in this mount, and if so we aborted the current transaction and set 1142 * the fs to an error state, guaranteeing no super blocks can be further 1143 * committed. However that error might be transient and if we unmount the 1144 * filesystem and mount it again, we should allow the mount to succeed 1145 * (btrfs_check_rw_degradable() should not fail) - if after mounting the 1146 * filesystem again we still get flush errors, then we will again abort 1147 * any transaction and set the error state, guaranteeing no commits of 1148 * unsafe super blocks. 
1149 */ 1150 device->last_flush_error = 0; 1151 1152 /* Verify the device is back in a pristine state */ 1153 ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); 1154 ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 1155 ASSERT(list_empty(&device->dev_alloc_list)); 1156 ASSERT(list_empty(&device->post_commit_list)); 1157 } 1158 1159 static void close_fs_devices(struct btrfs_fs_devices *fs_devices) 1160 { 1161 struct btrfs_device *device, *tmp; 1162 1163 lockdep_assert_held(&uuid_mutex); 1164 1165 if (--fs_devices->opened > 0) 1166 return; 1167 1168 list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) 1169 btrfs_close_one_device(device); 1170 1171 WARN_ON(fs_devices->open_devices); 1172 WARN_ON(fs_devices->rw_devices); 1173 fs_devices->opened = 0; 1174 fs_devices->seeding = false; 1175 fs_devices->fs_info = NULL; 1176 } 1177 1178 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 1179 { 1180 LIST_HEAD(list); 1181 struct btrfs_fs_devices *tmp; 1182 1183 mutex_lock(&uuid_mutex); 1184 close_fs_devices(fs_devices); 1185 if (!fs_devices->opened) { 1186 list_splice_init(&fs_devices->seed_list, &list); 1187 1188 /* 1189 * If the struct btrfs_fs_devices is not assembled with any 1190 * other device, it can be re-initialized during the next mount 1191 * without the needing device-scan step. Therefore, it can be 1192 * fully freed. 1193 */ 1194 if (fs_devices->num_devices == 1) { 1195 list_del(&fs_devices->fs_list); 1196 free_fs_devices(fs_devices); 1197 } 1198 } 1199 1200 1201 list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { 1202 close_fs_devices(fs_devices); 1203 list_del(&fs_devices->seed_list); 1204 free_fs_devices(fs_devices); 1205 } 1206 mutex_unlock(&uuid_mutex); 1207 } 1208 1209 static int open_fs_devices(struct btrfs_fs_devices *fs_devices, 1210 fmode_t flags, void *holder) 1211 { 1212 struct btrfs_device *device; 1213 struct btrfs_device *latest_dev = NULL; 1214 struct btrfs_device *tmp_device; 1215 1216 flags |= FMODE_EXCL; 1217 1218 list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, 1219 dev_list) { 1220 int ret; 1221 1222 ret = btrfs_open_one_device(fs_devices, device, flags, holder); 1223 if (ret == 0 && 1224 (!latest_dev || device->generation > latest_dev->generation)) { 1225 latest_dev = device; 1226 } else if (ret == -ENODATA) { 1227 fs_devices->num_devices--; 1228 list_del(&device->dev_list); 1229 btrfs_free_device(device); 1230 } 1231 } 1232 if (fs_devices->open_devices == 0) 1233 return -EINVAL; 1234 1235 fs_devices->opened = 1; 1236 fs_devices->latest_dev = latest_dev; 1237 fs_devices->total_rw_bytes = 0; 1238 fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; 1239 fs_devices->read_policy = BTRFS_READ_POLICY_PID; 1240 1241 return 0; 1242 } 1243 1244 static int devid_cmp(void *priv, const struct list_head *a, 1245 const struct list_head *b) 1246 { 1247 const struct btrfs_device *dev1, *dev2; 1248 1249 dev1 = list_entry(a, struct btrfs_device, dev_list); 1250 dev2 = list_entry(b, struct btrfs_device, dev_list); 1251 1252 if (dev1->devid < dev2->devid) 1253 return -1; 1254 else if (dev1->devid > dev2->devid) 1255 return 1; 1256 return 0; 1257 } 1258 1259 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 1260 fmode_t flags, void *holder) 1261 { 1262 int ret; 1263 1264 lockdep_assert_held(&uuid_mutex); 1265 /* 1266 * The device_list_mutex cannot be taken here in case opening the 1267 * underlying device takes further locks like open_mutex. 
1268 * 1269 * We also don't need the lock here as this is called during mount and 1270 * exclusion is provided by uuid_mutex 1271 */ 1272 1273 if (fs_devices->opened) { 1274 fs_devices->opened++; 1275 ret = 0; 1276 } else { 1277 list_sort(NULL, &fs_devices->devices, devid_cmp); 1278 ret = open_fs_devices(fs_devices, flags, holder); 1279 } 1280 1281 return ret; 1282 } 1283 1284 void btrfs_release_disk_super(struct btrfs_super_block *super) 1285 { 1286 struct page *page = virt_to_page(super); 1287 1288 put_page(page); 1289 } 1290 1291 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, 1292 u64 bytenr, u64 bytenr_orig) 1293 { 1294 struct btrfs_super_block *disk_super; 1295 struct page *page; 1296 void *p; 1297 pgoff_t index; 1298 1299 /* make sure our super fits in the device */ 1300 if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) 1301 return ERR_PTR(-EINVAL); 1302 1303 /* make sure our super fits in the page */ 1304 if (sizeof(*disk_super) > PAGE_SIZE) 1305 return ERR_PTR(-EINVAL); 1306 1307 /* make sure our super doesn't straddle pages on disk */ 1308 index = bytenr >> PAGE_SHIFT; 1309 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) 1310 return ERR_PTR(-EINVAL); 1311 1312 /* pull in the page with our super */ 1313 page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); 1314 1315 if (IS_ERR(page)) 1316 return ERR_CAST(page); 1317 1318 p = page_address(page); 1319 1320 /* align our pointer to the offset of the super block */ 1321 disk_super = p + offset_in_page(bytenr); 1322 1323 if (btrfs_super_bytenr(disk_super) != bytenr_orig || 1324 btrfs_super_magic(disk_super) != BTRFS_MAGIC) { 1325 btrfs_release_disk_super(p); 1326 return ERR_PTR(-EINVAL); 1327 } 1328 1329 if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) 1330 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; 1331 1332 return disk_super; 1333 } 1334 1335 int btrfs_forget_devices(dev_t devt) 1336 { 1337 int ret; 1338 1339 mutex_lock(&uuid_mutex); 1340 ret = btrfs_free_stale_devices(devt, NULL); 1341 mutex_unlock(&uuid_mutex); 1342 1343 return ret; 1344 } 1345 1346 /* 1347 * Look for a btrfs signature on a device. This may be called out of the mount path 1348 * and we are not allowed to call set_blocksize during the scan. The superblock 1349 * is read via pagecache 1350 */ 1351 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, 1352 void *holder) 1353 { 1354 struct btrfs_super_block *disk_super; 1355 bool new_device_added = false; 1356 struct btrfs_device *device = NULL; 1357 struct block_device *bdev; 1358 u64 bytenr, bytenr_orig; 1359 int ret; 1360 1361 lockdep_assert_held(&uuid_mutex); 1362 1363 /* 1364 * we would like to check all the supers, but that would make 1365 * a btrfs mount succeed after a mkfs from a different FS. 
1366 * So, we need to add a special mount option to scan for 1367 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 1368 */ 1369 flags |= FMODE_EXCL; 1370 1371 bdev = blkdev_get_by_path(path, flags, holder); 1372 if (IS_ERR(bdev)) 1373 return ERR_CAST(bdev); 1374 1375 bytenr_orig = btrfs_sb_offset(0); 1376 ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); 1377 if (ret) { 1378 device = ERR_PTR(ret); 1379 goto error_bdev_put; 1380 } 1381 1382 disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); 1383 if (IS_ERR(disk_super)) { 1384 device = ERR_CAST(disk_super); 1385 goto error_bdev_put; 1386 } 1387 1388 device = device_list_add(path, disk_super, &new_device_added); 1389 if (!IS_ERR(device) && new_device_added) 1390 btrfs_free_stale_devices(device->devt, device); 1391 1392 btrfs_release_disk_super(disk_super); 1393 1394 error_bdev_put: 1395 blkdev_put(bdev, flags); 1396 1397 return device; 1398 } 1399 1400 /* 1401 * Try to find a chunk that intersects [start, start + len] range and when one 1402 * such is found, record the end of it in *start 1403 */ 1404 static bool contains_pending_extent(struct btrfs_device *device, u64 *start, 1405 u64 len) 1406 { 1407 u64 physical_start, physical_end; 1408 1409 lockdep_assert_held(&device->fs_info->chunk_mutex); 1410 1411 if (!find_first_extent_bit(&device->alloc_state, *start, 1412 &physical_start, &physical_end, 1413 CHUNK_ALLOCATED, NULL)) { 1414 1415 if (in_range(physical_start, *start, len) || 1416 in_range(*start, physical_start, 1417 physical_end - physical_start)) { 1418 *start = physical_end + 1; 1419 return true; 1420 } 1421 } 1422 return false; 1423 } 1424 1425 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) 1426 { 1427 switch (device->fs_devices->chunk_alloc_policy) { 1428 case BTRFS_CHUNK_ALLOC_REGULAR: 1429 return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); 1430 case BTRFS_CHUNK_ALLOC_ZONED: 1431 /* 1432 * We don't care about the starting region like regular 1433 * allocator, because we anyway use/reserve the first two zones 1434 * for superblock logging. 1435 */ 1436 return ALIGN(start, device->zone_info->zone_size); 1437 default: 1438 BUG(); 1439 } 1440 } 1441 1442 static bool dev_extent_hole_check_zoned(struct btrfs_device *device, 1443 u64 *hole_start, u64 *hole_size, 1444 u64 num_bytes) 1445 { 1446 u64 zone_size = device->zone_info->zone_size; 1447 u64 pos; 1448 int ret; 1449 bool changed = false; 1450 1451 ASSERT(IS_ALIGNED(*hole_start, zone_size)); 1452 1453 while (*hole_size > 0) { 1454 pos = btrfs_find_allocatable_zones(device, *hole_start, 1455 *hole_start + *hole_size, 1456 num_bytes); 1457 if (pos != *hole_start) { 1458 *hole_size = *hole_start + *hole_size - pos; 1459 *hole_start = pos; 1460 changed = true; 1461 if (*hole_size < num_bytes) 1462 break; 1463 } 1464 1465 ret = btrfs_ensure_empty_zones(device, pos, num_bytes); 1466 1467 /* Range is ensured to be empty */ 1468 if (!ret) 1469 return changed; 1470 1471 /* Given hole range was invalid (outside of device) */ 1472 if (ret == -ERANGE) { 1473 *hole_start += *hole_size; 1474 *hole_size = 0; 1475 return true; 1476 } 1477 1478 *hole_start += zone_size; 1479 *hole_size -= zone_size; 1480 changed = true; 1481 } 1482 1483 return changed; 1484 } 1485 1486 /* 1487 * Check if specified hole is suitable for allocation. 
1488 * 1489 * @device: the device which we have the hole 1490 * @hole_start: starting position of the hole 1491 * @hole_size: the size of the hole 1492 * @num_bytes: the size of the free space that we need 1493 * 1494 * This function may modify @hole_start and @hole_size to reflect the suitable 1495 * position for allocation. Returns 1 if hole position is updated, 0 otherwise. 1496 */ 1497 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, 1498 u64 *hole_size, u64 num_bytes) 1499 { 1500 bool changed = false; 1501 u64 hole_end = *hole_start + *hole_size; 1502 1503 for (;;) { 1504 /* 1505 * Check before we set max_hole_start, otherwise we could end up 1506 * sending back this offset anyway. 1507 */ 1508 if (contains_pending_extent(device, hole_start, *hole_size)) { 1509 if (hole_end >= *hole_start) 1510 *hole_size = hole_end - *hole_start; 1511 else 1512 *hole_size = 0; 1513 changed = true; 1514 } 1515 1516 switch (device->fs_devices->chunk_alloc_policy) { 1517 case BTRFS_CHUNK_ALLOC_REGULAR: 1518 /* No extra check */ 1519 break; 1520 case BTRFS_CHUNK_ALLOC_ZONED: 1521 if (dev_extent_hole_check_zoned(device, hole_start, 1522 hole_size, num_bytes)) { 1523 changed = true; 1524 /* 1525 * The changed hole can contain pending extent. 1526 * Loop again to check that. 1527 */ 1528 continue; 1529 } 1530 break; 1531 default: 1532 BUG(); 1533 } 1534 1535 break; 1536 } 1537 1538 return changed; 1539 } 1540 1541 /* 1542 * Find free space in the specified device. 1543 * 1544 * @device: the device which we search the free space in 1545 * @num_bytes: the size of the free space that we need 1546 * @search_start: the position from which to begin the search 1547 * @start: store the start of the free space. 1548 * @len: the size of the free space. that we find, or the size 1549 * of the max free space if we don't find suitable free space 1550 * 1551 * This does a pretty simple search, the expectation is that it is called very 1552 * infrequently and that a given device has a small number of extents. 1553 * 1554 * @start is used to store the start of the free space if we find. But if we 1555 * don't find suitable free space, it will be used to store the start position 1556 * of the max free space. 1557 * 1558 * @len is used to store the size of the free space that we find. 1559 * But if we don't find suitable free space, it is used to store the size of 1560 * the max free space. 1561 * 1562 * NOTE: This function will search *commit* root of device tree, and does extra 1563 * check to ensure dev extents are not double allocated. 1564 * This makes the function safe to allocate dev extents but may not report 1565 * correct usable device space, as device extent freed in current transaction 1566 * is not reported as available. 
1567 */ 1568 static int find_free_dev_extent_start(struct btrfs_device *device, 1569 u64 num_bytes, u64 search_start, u64 *start, 1570 u64 *len) 1571 { 1572 struct btrfs_fs_info *fs_info = device->fs_info; 1573 struct btrfs_root *root = fs_info->dev_root; 1574 struct btrfs_key key; 1575 struct btrfs_dev_extent *dev_extent; 1576 struct btrfs_path *path; 1577 u64 hole_size; 1578 u64 max_hole_start; 1579 u64 max_hole_size; 1580 u64 extent_end; 1581 u64 search_end = device->total_bytes; 1582 int ret; 1583 int slot; 1584 struct extent_buffer *l; 1585 1586 search_start = dev_extent_search_start(device, search_start); 1587 1588 WARN_ON(device->zone_info && 1589 !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); 1590 1591 path = btrfs_alloc_path(); 1592 if (!path) 1593 return -ENOMEM; 1594 1595 max_hole_start = search_start; 1596 max_hole_size = 0; 1597 1598 again: 1599 if (search_start >= search_end || 1600 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1601 ret = -ENOSPC; 1602 goto out; 1603 } 1604 1605 path->reada = READA_FORWARD; 1606 path->search_commit_root = 1; 1607 path->skip_locking = 1; 1608 1609 key.objectid = device->devid; 1610 key.offset = search_start; 1611 key.type = BTRFS_DEV_EXTENT_KEY; 1612 1613 ret = btrfs_search_backwards(root, &key, path); 1614 if (ret < 0) 1615 goto out; 1616 1617 while (search_start < search_end) { 1618 l = path->nodes[0]; 1619 slot = path->slots[0]; 1620 if (slot >= btrfs_header_nritems(l)) { 1621 ret = btrfs_next_leaf(root, path); 1622 if (ret == 0) 1623 continue; 1624 if (ret < 0) 1625 goto out; 1626 1627 break; 1628 } 1629 btrfs_item_key_to_cpu(l, &key, slot); 1630 1631 if (key.objectid < device->devid) 1632 goto next; 1633 1634 if (key.objectid > device->devid) 1635 break; 1636 1637 if (key.type != BTRFS_DEV_EXTENT_KEY) 1638 goto next; 1639 1640 if (key.offset > search_end) 1641 break; 1642 1643 if (key.offset > search_start) { 1644 hole_size = key.offset - search_start; 1645 dev_extent_hole_check(device, &search_start, &hole_size, 1646 num_bytes); 1647 1648 if (hole_size > max_hole_size) { 1649 max_hole_start = search_start; 1650 max_hole_size = hole_size; 1651 } 1652 1653 /* 1654 * If this free space is greater than which we need, 1655 * it must be the max free space that we have found 1656 * until now, so max_hole_start must point to the start 1657 * of this free space and the length of this free space 1658 * is stored in max_hole_size. Thus, we return 1659 * max_hole_start and max_hole_size and go back to the 1660 * caller. 1661 */ 1662 if (hole_size >= num_bytes) { 1663 ret = 0; 1664 goto out; 1665 } 1666 } 1667 1668 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1669 extent_end = key.offset + btrfs_dev_extent_length(l, 1670 dev_extent); 1671 if (extent_end > search_start) 1672 search_start = extent_end; 1673 next: 1674 path->slots[0]++; 1675 cond_resched(); 1676 } 1677 1678 /* 1679 * At this point, search_start should be the end of 1680 * allocated dev extents, and when shrinking the device, 1681 * search_end may be smaller than search_start. 1682 */ 1683 if (search_end > search_start) { 1684 hole_size = search_end - search_start; 1685 if (dev_extent_hole_check(device, &search_start, &hole_size, 1686 num_bytes)) { 1687 btrfs_release_path(path); 1688 goto again; 1689 } 1690 1691 if (hole_size > max_hole_size) { 1692 max_hole_start = search_start; 1693 max_hole_size = hole_size; 1694 } 1695 } 1696 1697 /* See above. 
*/ 1698 if (max_hole_size < num_bytes) 1699 ret = -ENOSPC; 1700 else 1701 ret = 0; 1702 1703 ASSERT(max_hole_start + max_hole_size <= search_end); 1704 out: 1705 btrfs_free_path(path); 1706 *start = max_hole_start; 1707 if (len) 1708 *len = max_hole_size; 1709 return ret; 1710 } 1711 1712 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1713 u64 *start, u64 *len) 1714 { 1715 /* FIXME use last free of some kind */ 1716 return find_free_dev_extent_start(device, num_bytes, 0, start, len); 1717 } 1718 1719 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1720 struct btrfs_device *device, 1721 u64 start, u64 *dev_extent_len) 1722 { 1723 struct btrfs_fs_info *fs_info = device->fs_info; 1724 struct btrfs_root *root = fs_info->dev_root; 1725 int ret; 1726 struct btrfs_path *path; 1727 struct btrfs_key key; 1728 struct btrfs_key found_key; 1729 struct extent_buffer *leaf = NULL; 1730 struct btrfs_dev_extent *extent = NULL; 1731 1732 path = btrfs_alloc_path(); 1733 if (!path) 1734 return -ENOMEM; 1735 1736 key.objectid = device->devid; 1737 key.offset = start; 1738 key.type = BTRFS_DEV_EXTENT_KEY; 1739 again: 1740 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1741 if (ret > 0) { 1742 ret = btrfs_previous_item(root, path, key.objectid, 1743 BTRFS_DEV_EXTENT_KEY); 1744 if (ret) 1745 goto out; 1746 leaf = path->nodes[0]; 1747 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1748 extent = btrfs_item_ptr(leaf, path->slots[0], 1749 struct btrfs_dev_extent); 1750 BUG_ON(found_key.offset > start || found_key.offset + 1751 btrfs_dev_extent_length(leaf, extent) < start); 1752 key = found_key; 1753 btrfs_release_path(path); 1754 goto again; 1755 } else if (ret == 0) { 1756 leaf = path->nodes[0]; 1757 extent = btrfs_item_ptr(leaf, path->slots[0], 1758 struct btrfs_dev_extent); 1759 } else { 1760 goto out; 1761 } 1762 1763 *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 1764 1765 ret = btrfs_del_item(trans, root, path); 1766 if (ret == 0) 1767 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 1768 out: 1769 btrfs_free_path(path); 1770 return ret; 1771 } 1772 1773 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1774 { 1775 struct extent_map_tree *em_tree; 1776 struct extent_map *em; 1777 struct rb_node *n; 1778 u64 ret = 0; 1779 1780 em_tree = &fs_info->mapping_tree; 1781 read_lock(&em_tree->lock); 1782 n = rb_last(&em_tree->map.rb_root); 1783 if (n) { 1784 em = rb_entry(n, struct extent_map, rb_node); 1785 ret = em->start + em->len; 1786 } 1787 read_unlock(&em_tree->lock); 1788 1789 return ret; 1790 } 1791 1792 static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 1793 u64 *devid_ret) 1794 { 1795 int ret; 1796 struct btrfs_key key; 1797 struct btrfs_key found_key; 1798 struct btrfs_path *path; 1799 1800 path = btrfs_alloc_path(); 1801 if (!path) 1802 return -ENOMEM; 1803 1804 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1805 key.type = BTRFS_DEV_ITEM_KEY; 1806 key.offset = (u64)-1; 1807 1808 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1809 if (ret < 0) 1810 goto error; 1811 1812 if (ret == 0) { 1813 /* Corruption */ 1814 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); 1815 ret = -EUCLEAN; 1816 goto error; 1817 } 1818 1819 ret = btrfs_previous_item(fs_info->chunk_root, path, 1820 BTRFS_DEV_ITEMS_OBJECTID, 1821 BTRFS_DEV_ITEM_KEY); 1822 if (ret) { 1823 *devid_ret = 1; 1824 } else { 1825 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1826 path->slots[0]); 1827 *devid_ret = 
found_key.offset + 1; 1828 } 1829 ret = 0; 1830 error: 1831 btrfs_free_path(path); 1832 return ret; 1833 } 1834 1835 /* 1836 * the device information is stored in the chunk root 1837 * the btrfs_device struct should be fully filled in 1838 */ 1839 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 1840 struct btrfs_device *device) 1841 { 1842 int ret; 1843 struct btrfs_path *path; 1844 struct btrfs_dev_item *dev_item; 1845 struct extent_buffer *leaf; 1846 struct btrfs_key key; 1847 unsigned long ptr; 1848 1849 path = btrfs_alloc_path(); 1850 if (!path) 1851 return -ENOMEM; 1852 1853 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1854 key.type = BTRFS_DEV_ITEM_KEY; 1855 key.offset = device->devid; 1856 1857 btrfs_reserve_chunk_metadata(trans, true); 1858 ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, 1859 &key, sizeof(*dev_item)); 1860 btrfs_trans_release_chunk_metadata(trans); 1861 if (ret) 1862 goto out; 1863 1864 leaf = path->nodes[0]; 1865 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1866 1867 btrfs_set_device_id(leaf, dev_item, device->devid); 1868 btrfs_set_device_generation(leaf, dev_item, 0); 1869 btrfs_set_device_type(leaf, dev_item, device->type); 1870 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1871 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1872 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1873 btrfs_set_device_total_bytes(leaf, dev_item, 1874 btrfs_device_get_disk_total_bytes(device)); 1875 btrfs_set_device_bytes_used(leaf, dev_item, 1876 btrfs_device_get_bytes_used(device)); 1877 btrfs_set_device_group(leaf, dev_item, 0); 1878 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1879 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1880 btrfs_set_device_start_offset(leaf, dev_item, 0); 1881 1882 ptr = btrfs_device_uuid(dev_item); 1883 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1884 ptr = btrfs_device_fsid(dev_item); 1885 write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, 1886 ptr, BTRFS_FSID_SIZE); 1887 btrfs_mark_buffer_dirty(leaf); 1888 1889 ret = 0; 1890 out: 1891 btrfs_free_path(path); 1892 return ret; 1893 } 1894 1895 /* 1896 * Function to update ctime/mtime for a given device path. 1897 * Mainly used for ctime/mtime based probe like libblkid. 1898 * 1899 * We don't care about errors here, this is just to be kind to userspace. 
1900 */ 1901 static void update_dev_time(const char *device_path) 1902 { 1903 struct path path; 1904 struct timespec64 now; 1905 int ret; 1906 1907 ret = kern_path(device_path, LOOKUP_FOLLOW, &path); 1908 if (ret) 1909 return; 1910 1911 now = current_time(d_inode(path.dentry)); 1912 inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); 1913 path_put(&path); 1914 } 1915 1916 static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, 1917 struct btrfs_device *device) 1918 { 1919 struct btrfs_root *root = device->fs_info->chunk_root; 1920 int ret; 1921 struct btrfs_path *path; 1922 struct btrfs_key key; 1923 1924 path = btrfs_alloc_path(); 1925 if (!path) 1926 return -ENOMEM; 1927 1928 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1929 key.type = BTRFS_DEV_ITEM_KEY; 1930 key.offset = device->devid; 1931 1932 btrfs_reserve_chunk_metadata(trans, false); 1933 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1934 btrfs_trans_release_chunk_metadata(trans); 1935 if (ret) { 1936 if (ret > 0) 1937 ret = -ENOENT; 1938 goto out; 1939 } 1940 1941 ret = btrfs_del_item(trans, root, path); 1942 out: 1943 btrfs_free_path(path); 1944 return ret; 1945 } 1946 1947 /* 1948 * Verify that @num_devices satisfies the RAID profile constraints in the whole 1949 * filesystem. It's up to the caller to adjust that number regarding eg. device 1950 * replace. 1951 */ 1952 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 1953 u64 num_devices) 1954 { 1955 u64 all_avail; 1956 unsigned seq; 1957 int i; 1958 1959 do { 1960 seq = read_seqbegin(&fs_info->profiles_lock); 1961 1962 all_avail = fs_info->avail_data_alloc_bits | 1963 fs_info->avail_system_alloc_bits | 1964 fs_info->avail_metadata_alloc_bits; 1965 } while (read_seqretry(&fs_info->profiles_lock, seq)); 1966 1967 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 1968 if (!(all_avail & btrfs_raid_array[i].bg_flag)) 1969 continue; 1970 1971 if (num_devices < btrfs_raid_array[i].devs_min) 1972 return btrfs_raid_array[i].mindev_error; 1973 } 1974 1975 return 0; 1976 } 1977 1978 static struct btrfs_device * btrfs_find_next_active_device( 1979 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 1980 { 1981 struct btrfs_device *next_device; 1982 1983 list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 1984 if (next_device != device && 1985 !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) 1986 && next_device->bdev) 1987 return next_device; 1988 } 1989 1990 return NULL; 1991 } 1992 1993 /* 1994 * Helper function to check if the given device is part of s_bdev / latest_dev 1995 * and replace it with the provided or the next active device, in the context 1996 * where this function called, there should be always be another device (or 1997 * this_dev) which is active. 1998 */ 1999 void __cold btrfs_assign_next_active_device(struct btrfs_device *device, 2000 struct btrfs_device *next_device) 2001 { 2002 struct btrfs_fs_info *fs_info = device->fs_info; 2003 2004 if (!next_device) 2005 next_device = btrfs_find_next_active_device(fs_info->fs_devices, 2006 device); 2007 ASSERT(next_device); 2008 2009 if (fs_info->sb->s_bdev && 2010 (fs_info->sb->s_bdev == device->bdev)) 2011 fs_info->sb->s_bdev = next_device->bdev; 2012 2013 if (fs_info->fs_devices->latest_dev->bdev == device->bdev) 2014 fs_info->fs_devices->latest_dev = next_device; 2015 } 2016 2017 /* 2018 * Return btrfs_fs_devices::num_devices excluding the device that's being 2019 * currently replaced. 
2020 */ 2021 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 2022 { 2023 u64 num_devices = fs_info->fs_devices->num_devices; 2024 2025 down_read(&fs_info->dev_replace.rwsem); 2026 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 2027 ASSERT(num_devices > 1); 2028 num_devices--; 2029 } 2030 up_read(&fs_info->dev_replace.rwsem); 2031 2032 return num_devices; 2033 } 2034 2035 static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, 2036 struct block_device *bdev, int copy_num) 2037 { 2038 struct btrfs_super_block *disk_super; 2039 const size_t len = sizeof(disk_super->magic); 2040 const u64 bytenr = btrfs_sb_offset(copy_num); 2041 int ret; 2042 2043 disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr); 2044 if (IS_ERR(disk_super)) 2045 return; 2046 2047 memset(&disk_super->magic, 0, len); 2048 folio_mark_dirty(virt_to_folio(disk_super)); 2049 btrfs_release_disk_super(disk_super); 2050 2051 ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1); 2052 if (ret) 2053 btrfs_warn(fs_info, "error clearing superblock number %d (%d)", 2054 copy_num, ret); 2055 } 2056 2057 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, 2058 struct block_device *bdev, 2059 const char *device_path) 2060 { 2061 int copy_num; 2062 2063 if (!bdev) 2064 return; 2065 2066 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { 2067 if (bdev_is_zoned(bdev)) 2068 btrfs_reset_sb_log_zones(bdev, copy_num); 2069 else 2070 btrfs_scratch_superblock(fs_info, bdev, copy_num); 2071 } 2072 2073 /* Notify udev that device has changed */ 2074 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 2075 2076 /* Update ctime/mtime for device path for libblkid */ 2077 update_dev_time(device_path); 2078 } 2079 2080 int btrfs_rm_device(struct btrfs_fs_info *fs_info, 2081 struct btrfs_dev_lookup_args *args, 2082 struct block_device **bdev, fmode_t *mode) 2083 { 2084 struct btrfs_trans_handle *trans; 2085 struct btrfs_device *device; 2086 struct btrfs_fs_devices *cur_devices; 2087 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2088 u64 num_devices; 2089 int ret = 0; 2090 2091 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 2092 btrfs_err(fs_info, "device remove not supported on extent tree v2 yet"); 2093 return -EINVAL; 2094 } 2095 2096 /* 2097 * The device list in fs_devices is accessed without locks (neither 2098 * uuid_mutex nor device_list_mutex) as it won't change on a mounted 2099 * filesystem and another device rm cannot run. 
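* (Concurrent device add/remove/replace is excluded by the exclusive
* operation started by the ioctl before calling in here; see also the
* btrfs_exclop_start() note in btrfs_setup_sprout().)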
2100 */ 2101 num_devices = btrfs_num_devices(fs_info); 2102 2103 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2104 if (ret) 2105 return ret; 2106 2107 device = btrfs_find_device(fs_info->fs_devices, args); 2108 if (!device) { 2109 if (args->missing) 2110 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2111 else 2112 ret = -ENOENT; 2113 return ret; 2114 } 2115 2116 if (btrfs_pinned_by_swapfile(fs_info, device)) { 2117 btrfs_warn_in_rcu(fs_info, 2118 "cannot remove device %s (devid %llu) due to active swapfile", 2119 btrfs_dev_name(device), device->devid); 2120 return -ETXTBSY; 2121 } 2122 2123 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 2124 return BTRFS_ERROR_DEV_TGT_REPLACE; 2125 2126 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2127 fs_info->fs_devices->rw_devices == 1) 2128 return BTRFS_ERROR_DEV_ONLY_WRITABLE; 2129 2130 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2131 mutex_lock(&fs_info->chunk_mutex); 2132 list_del_init(&device->dev_alloc_list); 2133 device->fs_devices->rw_devices--; 2134 mutex_unlock(&fs_info->chunk_mutex); 2135 } 2136 2137 ret = btrfs_shrink_device(device, 0); 2138 if (ret) 2139 goto error_undo; 2140 2141 trans = btrfs_start_transaction(fs_info->chunk_root, 0); 2142 if (IS_ERR(trans)) { 2143 ret = PTR_ERR(trans); 2144 goto error_undo; 2145 } 2146 2147 ret = btrfs_rm_dev_item(trans, device); 2148 if (ret) { 2149 /* Any error in dev item removal is critical */ 2150 btrfs_crit(fs_info, 2151 "failed to remove device item for devid %llu: %d", 2152 device->devid, ret); 2153 btrfs_abort_transaction(trans, ret); 2154 btrfs_end_transaction(trans); 2155 return ret; 2156 } 2157 2158 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2159 btrfs_scrub_cancel_dev(device); 2160 2161 /* 2162 * the device list mutex makes sure that we don't change 2163 * the device list while someone else is writing out all 2164 * the device supers. Whoever is writing all supers, should 2165 * lock the device list mutex before getting the number of 2166 * devices in the super block (super_copy). Conversely, 2167 * whoever updates the number of devices in the super block 2168 * (super_copy) should hold the device list mutex. 2169 */ 2170 2171 /* 2172 * In normal cases the cur_devices == fs_devices. But in case 2173 * of deleting a seed device, the cur_devices should point to 2174 * its own fs_devices listed under the fs_devices->seed_list. 2175 */ 2176 cur_devices = device->fs_devices; 2177 mutex_lock(&fs_devices->device_list_mutex); 2178 list_del_rcu(&device->dev_list); 2179 2180 cur_devices->num_devices--; 2181 cur_devices->total_devices--; 2182 /* Update total_devices of the parent fs_devices if it's seed */ 2183 if (cur_devices != fs_devices) 2184 fs_devices->total_devices--; 2185 2186 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 2187 cur_devices->missing_devices--; 2188 2189 btrfs_assign_next_active_device(device, NULL); 2190 2191 if (device->bdev) { 2192 cur_devices->open_devices--; 2193 /* remove sysfs entry */ 2194 btrfs_sysfs_remove_device(device); 2195 } 2196 2197 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 2198 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2199 mutex_unlock(&fs_devices->device_list_mutex); 2200 2201 /* 2202 * At this point, the device is zero sized and detached from the 2203 * devices list. All that's left is to zero out the old supers and 2204 * free the device. 
2205 * 2206 * We cannot call btrfs_close_bdev() here because we're holding the sb 2207 * write lock, and blkdev_put() will pull in the ->open_mutex on the 2208 * block device and it's dependencies. Instead just flush the device 2209 * and let the caller do the final blkdev_put. 2210 */ 2211 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2212 btrfs_scratch_superblocks(fs_info, device->bdev, 2213 device->name->str); 2214 if (device->bdev) { 2215 sync_blockdev(device->bdev); 2216 invalidate_bdev(device->bdev); 2217 } 2218 } 2219 2220 *bdev = device->bdev; 2221 *mode = device->mode; 2222 synchronize_rcu(); 2223 btrfs_free_device(device); 2224 2225 /* 2226 * This can happen if cur_devices is the private seed devices list. We 2227 * cannot call close_fs_devices() here because it expects the uuid_mutex 2228 * to be held, but in fact we don't need that for the private 2229 * seed_devices, we can simply decrement cur_devices->opened and then 2230 * remove it from our list and free the fs_devices. 2231 */ 2232 if (cur_devices->num_devices == 0) { 2233 list_del_init(&cur_devices->seed_list); 2234 ASSERT(cur_devices->opened == 1); 2235 cur_devices->opened--; 2236 free_fs_devices(cur_devices); 2237 } 2238 2239 ret = btrfs_commit_transaction(trans); 2240 2241 return ret; 2242 2243 error_undo: 2244 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2245 mutex_lock(&fs_info->chunk_mutex); 2246 list_add(&device->dev_alloc_list, 2247 &fs_devices->alloc_list); 2248 device->fs_devices->rw_devices++; 2249 mutex_unlock(&fs_info->chunk_mutex); 2250 } 2251 return ret; 2252 } 2253 2254 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 2255 { 2256 struct btrfs_fs_devices *fs_devices; 2257 2258 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 2259 2260 /* 2261 * in case of fs with no seed, srcdev->fs_devices will point 2262 * to fs_devices of fs_info. However when the dev being replaced is 2263 * a seed dev it will point to the seed's local fs_devices. In short 2264 * srcdev will have its correct fs_devices in both the cases. 2265 */ 2266 fs_devices = srcdev->fs_devices; 2267 2268 list_del_rcu(&srcdev->dev_list); 2269 list_del(&srcdev->dev_alloc_list); 2270 fs_devices->num_devices--; 2271 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2272 fs_devices->missing_devices--; 2273 2274 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 2275 fs_devices->rw_devices--; 2276 2277 if (srcdev->bdev) 2278 fs_devices->open_devices--; 2279 } 2280 2281 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) 2282 { 2283 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 2284 2285 mutex_lock(&uuid_mutex); 2286 2287 btrfs_close_bdev(srcdev); 2288 synchronize_rcu(); 2289 btrfs_free_device(srcdev); 2290 2291 /* if this is no devs we rather delete the fs_devices */ 2292 if (!fs_devices->num_devices) { 2293 /* 2294 * On a mounted FS, num_devices can't be zero unless it's a 2295 * seed. In case of a seed device being replaced, the replace 2296 * target added to the sprout FS, so there will be no more 2297 * device left under the seed FS. 
2298 */ 2299 ASSERT(fs_devices->seeding); 2300 2301 list_del_init(&fs_devices->seed_list); 2302 close_fs_devices(fs_devices); 2303 free_fs_devices(fs_devices); 2304 } 2305 mutex_unlock(&uuid_mutex); 2306 } 2307 2308 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2309 { 2310 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2311 2312 mutex_lock(&fs_devices->device_list_mutex); 2313 2314 btrfs_sysfs_remove_device(tgtdev); 2315 2316 if (tgtdev->bdev) 2317 fs_devices->open_devices--; 2318 2319 fs_devices->num_devices--; 2320 2321 btrfs_assign_next_active_device(tgtdev, NULL); 2322 2323 list_del_rcu(&tgtdev->dev_list); 2324 2325 mutex_unlock(&fs_devices->device_list_mutex); 2326 2327 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, 2328 tgtdev->name->str); 2329 2330 btrfs_close_bdev(tgtdev); 2331 synchronize_rcu(); 2332 btrfs_free_device(tgtdev); 2333 } 2334 2335 /* 2336 * Populate args from device at path. 2337 * 2338 * @fs_info: the filesystem 2339 * @args: the args to populate 2340 * @path: the path to the device 2341 * 2342 * This will read the super block of the device at @path and populate @args with 2343 * the devid, fsid, and uuid. This is meant to be used for ioctls that need to 2344 * lookup a device to operate on, but need to do it before we take any locks. 2345 * This properly handles the special case of "missing" that a user may pass in, 2346 * and does some basic sanity checks. The caller must make sure that @path is 2347 * properly NUL terminated before calling in, and must call 2348 * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and 2349 * uuid buffers. 2350 * 2351 * Return: 0 for success, -errno for failure 2352 */ 2353 int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, 2354 struct btrfs_dev_lookup_args *args, 2355 const char *path) 2356 { 2357 struct btrfs_super_block *disk_super; 2358 struct block_device *bdev; 2359 int ret; 2360 2361 if (!path || !path[0]) 2362 return -EINVAL; 2363 if (!strcmp(path, "missing")) { 2364 args->missing = true; 2365 return 0; 2366 } 2367 2368 args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); 2369 args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); 2370 if (!args->uuid || !args->fsid) { 2371 btrfs_put_dev_args_from_path(args); 2372 return -ENOMEM; 2373 } 2374 2375 ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, 2376 &bdev, &disk_super); 2377 if (ret) { 2378 btrfs_put_dev_args_from_path(args); 2379 return ret; 2380 } 2381 2382 args->devid = btrfs_stack_device_id(&disk_super->dev_item); 2383 memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); 2384 if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2385 memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); 2386 else 2387 memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 2388 btrfs_release_disk_super(disk_super); 2389 blkdev_put(bdev, FMODE_READ); 2390 return 0; 2391 } 2392 2393 /* 2394 * Only use this jointly with btrfs_get_dev_args_from_path() because we will 2395 * allocate our ->uuid and ->fsid pointers, everybody else uses local variables 2396 * that don't need to be freed. 
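*
* A minimal usage sketch of the pair (the same pattern as
* btrfs_find_device_by_devspec() below, error handling trimmed):
*
*	BTRFS_DEV_LOOKUP_ARGS(args);
*
*	ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
*	if (ret)
*		return ERR_PTR(ret);
*	device = btrfs_find_device(fs_info->fs_devices, &args);
*	btrfs_put_dev_args_from_path(&args);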
2397 */ 2398 void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) 2399 { 2400 kfree(args->uuid); 2401 kfree(args->fsid); 2402 args->uuid = NULL; 2403 args->fsid = NULL; 2404 } 2405 2406 struct btrfs_device *btrfs_find_device_by_devspec( 2407 struct btrfs_fs_info *fs_info, u64 devid, 2408 const char *device_path) 2409 { 2410 BTRFS_DEV_LOOKUP_ARGS(args); 2411 struct btrfs_device *device; 2412 int ret; 2413 2414 if (devid) { 2415 args.devid = devid; 2416 device = btrfs_find_device(fs_info->fs_devices, &args); 2417 if (!device) 2418 return ERR_PTR(-ENOENT); 2419 return device; 2420 } 2421 2422 ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); 2423 if (ret) 2424 return ERR_PTR(ret); 2425 device = btrfs_find_device(fs_info->fs_devices, &args); 2426 btrfs_put_dev_args_from_path(&args); 2427 if (!device) 2428 return ERR_PTR(-ENOENT); 2429 return device; 2430 } 2431 2432 static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) 2433 { 2434 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2435 struct btrfs_fs_devices *old_devices; 2436 struct btrfs_fs_devices *seed_devices; 2437 2438 lockdep_assert_held(&uuid_mutex); 2439 if (!fs_devices->seeding) 2440 return ERR_PTR(-EINVAL); 2441 2442 /* 2443 * Private copy of the seed devices, anchored at 2444 * fs_info->fs_devices->seed_list 2445 */ 2446 seed_devices = alloc_fs_devices(NULL, NULL); 2447 if (IS_ERR(seed_devices)) 2448 return seed_devices; 2449 2450 /* 2451 * It's necessary to retain a copy of the original seed fs_devices in 2452 * fs_uuids so that filesystems which have been seeded can successfully 2453 * reference the seed device from open_seed_devices. This also supports 2454 * multiple fs seed. 2455 */ 2456 old_devices = clone_fs_devices(fs_devices); 2457 if (IS_ERR(old_devices)) { 2458 kfree(seed_devices); 2459 return old_devices; 2460 } 2461 2462 list_add(&old_devices->fs_list, &fs_uuids); 2463 2464 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2465 seed_devices->opened = 1; 2466 INIT_LIST_HEAD(&seed_devices->devices); 2467 INIT_LIST_HEAD(&seed_devices->alloc_list); 2468 mutex_init(&seed_devices->device_list_mutex); 2469 2470 return seed_devices; 2471 } 2472 2473 /* 2474 * Splice seed devices into the sprout fs_devices. 2475 * Generate a new fsid for the sprouted read-write filesystem. 2476 */ 2477 static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info, 2478 struct btrfs_fs_devices *seed_devices) 2479 { 2480 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2481 struct btrfs_super_block *disk_super = fs_info->super_copy; 2482 struct btrfs_device *device; 2483 u64 super_flags; 2484 2485 /* 2486 * We are updating the fsid, the thread leading to device_list_add() 2487 * could race, so uuid_mutex is needed. 2488 */ 2489 lockdep_assert_held(&uuid_mutex); 2490 2491 /* 2492 * The threads listed below may traverse dev_list but can do that without 2493 * device_list_mutex: 2494 * - All device ops and balance - as we are in btrfs_exclop_start. 2495 * - Various dev_list readers - are using RCU. 2496 * - btrfs_ioctl_fitrim() - is using RCU. 
2497 * 2498 * For-read threads as below are using device_list_mutex: 2499 * - Readonly scrub btrfs_scrub_dev() 2500 * - Readonly scrub btrfs_scrub_progress() 2501 * - btrfs_get_dev_stats() 2502 */ 2503 lockdep_assert_held(&fs_devices->device_list_mutex); 2504 2505 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2506 synchronize_rcu); 2507 list_for_each_entry(device, &seed_devices->devices, dev_list) 2508 device->fs_devices = seed_devices; 2509 2510 fs_devices->seeding = false; 2511 fs_devices->num_devices = 0; 2512 fs_devices->open_devices = 0; 2513 fs_devices->missing_devices = 0; 2514 fs_devices->rotating = false; 2515 list_add(&seed_devices->seed_list, &fs_devices->seed_list); 2516 2517 generate_random_uuid(fs_devices->fsid); 2518 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 2519 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2520 2521 super_flags = btrfs_super_flags(disk_super) & 2522 ~BTRFS_SUPER_FLAG_SEEDING; 2523 btrfs_set_super_flags(disk_super, super_flags); 2524 } 2525 2526 /* 2527 * Store the expected generation for seed devices in device items. 2528 */ 2529 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 2530 { 2531 BTRFS_DEV_LOOKUP_ARGS(args); 2532 struct btrfs_fs_info *fs_info = trans->fs_info; 2533 struct btrfs_root *root = fs_info->chunk_root; 2534 struct btrfs_path *path; 2535 struct extent_buffer *leaf; 2536 struct btrfs_dev_item *dev_item; 2537 struct btrfs_device *device; 2538 struct btrfs_key key; 2539 u8 fs_uuid[BTRFS_FSID_SIZE]; 2540 u8 dev_uuid[BTRFS_UUID_SIZE]; 2541 int ret; 2542 2543 path = btrfs_alloc_path(); 2544 if (!path) 2545 return -ENOMEM; 2546 2547 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2548 key.offset = 0; 2549 key.type = BTRFS_DEV_ITEM_KEY; 2550 2551 while (1) { 2552 btrfs_reserve_chunk_metadata(trans, false); 2553 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2554 btrfs_trans_release_chunk_metadata(trans); 2555 if (ret < 0) 2556 goto error; 2557 2558 leaf = path->nodes[0]; 2559 next_slot: 2560 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2561 ret = btrfs_next_leaf(root, path); 2562 if (ret > 0) 2563 break; 2564 if (ret < 0) 2565 goto error; 2566 leaf = path->nodes[0]; 2567 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2568 btrfs_release_path(path); 2569 continue; 2570 } 2571 2572 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2573 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2574 key.type != BTRFS_DEV_ITEM_KEY) 2575 break; 2576 2577 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2578 struct btrfs_dev_item); 2579 args.devid = btrfs_device_id(leaf, dev_item); 2580 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2581 BTRFS_UUID_SIZE); 2582 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2583 BTRFS_FSID_SIZE); 2584 args.uuid = dev_uuid; 2585 args.fsid = fs_uuid; 2586 device = btrfs_find_device(fs_info->fs_devices, &args); 2587 BUG_ON(!device); /* Logic error */ 2588 2589 if (device->fs_devices->seeding) { 2590 btrfs_set_device_generation(leaf, dev_item, 2591 device->generation); 2592 btrfs_mark_buffer_dirty(leaf); 2593 } 2594 2595 path->slots[0]++; 2596 goto next_slot; 2597 } 2598 ret = 0; 2599 error: 2600 btrfs_free_path(path); 2601 return ret; 2602 } 2603 2604 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2605 { 2606 struct btrfs_root *root = fs_info->dev_root; 2607 struct btrfs_trans_handle *trans; 2608 struct btrfs_device *device; 2609 struct block_device *bdev; 2610 struct 
super_block *sb = fs_info->sb; 2611 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2612 struct btrfs_fs_devices *seed_devices; 2613 u64 orig_super_total_bytes; 2614 u64 orig_super_num_devices; 2615 int ret = 0; 2616 bool seeding_dev = false; 2617 bool locked = false; 2618 2619 if (sb_rdonly(sb) && !fs_devices->seeding) 2620 return -EROFS; 2621 2622 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2623 fs_info->bdev_holder); 2624 if (IS_ERR(bdev)) 2625 return PTR_ERR(bdev); 2626 2627 if (!btrfs_check_device_zone_type(fs_info, bdev)) { 2628 ret = -EINVAL; 2629 goto error; 2630 } 2631 2632 if (fs_devices->seeding) { 2633 seeding_dev = true; 2634 down_write(&sb->s_umount); 2635 mutex_lock(&uuid_mutex); 2636 locked = true; 2637 } 2638 2639 sync_blockdev(bdev); 2640 2641 rcu_read_lock(); 2642 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 2643 if (device->bdev == bdev) { 2644 ret = -EEXIST; 2645 rcu_read_unlock(); 2646 goto error; 2647 } 2648 } 2649 rcu_read_unlock(); 2650 2651 device = btrfs_alloc_device(fs_info, NULL, NULL, device_path); 2652 if (IS_ERR(device)) { 2653 /* we can safely leave the fs_devices entry around */ 2654 ret = PTR_ERR(device); 2655 goto error; 2656 } 2657 2658 device->fs_info = fs_info; 2659 device->bdev = bdev; 2660 ret = lookup_bdev(device_path, &device->devt); 2661 if (ret) 2662 goto error_free_device; 2663 2664 ret = btrfs_get_dev_zone_info(device, false); 2665 if (ret) 2666 goto error_free_device; 2667 2668 trans = btrfs_start_transaction(root, 0); 2669 if (IS_ERR(trans)) { 2670 ret = PTR_ERR(trans); 2671 goto error_free_zone; 2672 } 2673 2674 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2675 device->generation = trans->transid; 2676 device->io_width = fs_info->sectorsize; 2677 device->io_align = fs_info->sectorsize; 2678 device->sector_size = fs_info->sectorsize; 2679 device->total_bytes = 2680 round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); 2681 device->disk_total_bytes = device->total_bytes; 2682 device->commit_total_bytes = device->total_bytes; 2683 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2684 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2685 device->mode = FMODE_EXCL; 2686 device->dev_stats_valid = 1; 2687 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2688 2689 if (seeding_dev) { 2690 btrfs_clear_sb_rdonly(sb); 2691 2692 /* GFP_KERNEL allocation must not be under device_list_mutex */ 2693 seed_devices = btrfs_init_sprout(fs_info); 2694 if (IS_ERR(seed_devices)) { 2695 ret = PTR_ERR(seed_devices); 2696 btrfs_abort_transaction(trans, ret); 2697 goto error_trans; 2698 } 2699 } 2700 2701 mutex_lock(&fs_devices->device_list_mutex); 2702 if (seeding_dev) { 2703 btrfs_setup_sprout(fs_info, seed_devices); 2704 btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, 2705 device); 2706 } 2707 2708 device->fs_devices = fs_devices; 2709 2710 mutex_lock(&fs_info->chunk_mutex); 2711 list_add_rcu(&device->dev_list, &fs_devices->devices); 2712 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2713 fs_devices->num_devices++; 2714 fs_devices->open_devices++; 2715 fs_devices->rw_devices++; 2716 fs_devices->total_devices++; 2717 fs_devices->total_rw_bytes += device->total_bytes; 2718 2719 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2720 2721 if (!bdev_nonrot(bdev)) 2722 fs_devices->rotating = true; 2723 2724 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 2725 btrfs_set_super_total_bytes(fs_info->super_copy, 2726 
round_down(orig_super_total_bytes + device->total_bytes, 2727 fs_info->sectorsize)); 2728 2729 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 2730 btrfs_set_super_num_devices(fs_info->super_copy, 2731 orig_super_num_devices + 1); 2732 2733 /* 2734 * we've got more storage, clear any full flags on the space 2735 * infos 2736 */ 2737 btrfs_clear_space_info_full(fs_info); 2738 2739 mutex_unlock(&fs_info->chunk_mutex); 2740 2741 /* Add sysfs device entry */ 2742 btrfs_sysfs_add_device(device); 2743 2744 mutex_unlock(&fs_devices->device_list_mutex); 2745 2746 if (seeding_dev) { 2747 mutex_lock(&fs_info->chunk_mutex); 2748 ret = init_first_rw_device(trans); 2749 mutex_unlock(&fs_info->chunk_mutex); 2750 if (ret) { 2751 btrfs_abort_transaction(trans, ret); 2752 goto error_sysfs; 2753 } 2754 } 2755 2756 ret = btrfs_add_dev_item(trans, device); 2757 if (ret) { 2758 btrfs_abort_transaction(trans, ret); 2759 goto error_sysfs; 2760 } 2761 2762 if (seeding_dev) { 2763 ret = btrfs_finish_sprout(trans); 2764 if (ret) { 2765 btrfs_abort_transaction(trans, ret); 2766 goto error_sysfs; 2767 } 2768 2769 /* 2770 * fs_devices now represents the newly sprouted filesystem and 2771 * its fsid has been changed by btrfs_sprout_splice(). 2772 */ 2773 btrfs_sysfs_update_sprout_fsid(fs_devices); 2774 } 2775 2776 ret = btrfs_commit_transaction(trans); 2777 2778 if (seeding_dev) { 2779 mutex_unlock(&uuid_mutex); 2780 up_write(&sb->s_umount); 2781 locked = false; 2782 2783 if (ret) /* transaction commit */ 2784 return ret; 2785 2786 ret = btrfs_relocate_sys_chunks(fs_info); 2787 if (ret < 0) 2788 btrfs_handle_fs_error(fs_info, ret, 2789 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2790 trans = btrfs_attach_transaction(root); 2791 if (IS_ERR(trans)) { 2792 if (PTR_ERR(trans) == -ENOENT) 2793 return 0; 2794 ret = PTR_ERR(trans); 2795 trans = NULL; 2796 goto error_sysfs; 2797 } 2798 ret = btrfs_commit_transaction(trans); 2799 } 2800 2801 /* 2802 * Now that we have written a new super block to this device, check all 2803 * other fs_devices list if device_path alienates any other scanned 2804 * device. 2805 * We can ignore the return value as it typically returns -EINVAL and 2806 * only succeeds if the device was an alien. 
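* ("Alien" here means a stale record of this block device left over in some
* other scanned fs_devices; the lookup below is keyed by the device's dev_t.)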
2807 */ 2808 btrfs_forget_devices(device->devt); 2809 2810 /* Update ctime/mtime for blkid or udev */ 2811 update_dev_time(device_path); 2812 2813 return ret; 2814 2815 error_sysfs: 2816 btrfs_sysfs_remove_device(device); 2817 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2818 mutex_lock(&fs_info->chunk_mutex); 2819 list_del_rcu(&device->dev_list); 2820 list_del(&device->dev_alloc_list); 2821 fs_info->fs_devices->num_devices--; 2822 fs_info->fs_devices->open_devices--; 2823 fs_info->fs_devices->rw_devices--; 2824 fs_info->fs_devices->total_devices--; 2825 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2826 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2827 btrfs_set_super_total_bytes(fs_info->super_copy, 2828 orig_super_total_bytes); 2829 btrfs_set_super_num_devices(fs_info->super_copy, 2830 orig_super_num_devices); 2831 mutex_unlock(&fs_info->chunk_mutex); 2832 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2833 error_trans: 2834 if (seeding_dev) 2835 btrfs_set_sb_rdonly(sb); 2836 if (trans) 2837 btrfs_end_transaction(trans); 2838 error_free_zone: 2839 btrfs_destroy_dev_zone_info(device); 2840 error_free_device: 2841 btrfs_free_device(device); 2842 error: 2843 blkdev_put(bdev, FMODE_EXCL); 2844 if (locked) { 2845 mutex_unlock(&uuid_mutex); 2846 up_write(&sb->s_umount); 2847 } 2848 return ret; 2849 } 2850 2851 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2852 struct btrfs_device *device) 2853 { 2854 int ret; 2855 struct btrfs_path *path; 2856 struct btrfs_root *root = device->fs_info->chunk_root; 2857 struct btrfs_dev_item *dev_item; 2858 struct extent_buffer *leaf; 2859 struct btrfs_key key; 2860 2861 path = btrfs_alloc_path(); 2862 if (!path) 2863 return -ENOMEM; 2864 2865 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2866 key.type = BTRFS_DEV_ITEM_KEY; 2867 key.offset = device->devid; 2868 2869 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2870 if (ret < 0) 2871 goto out; 2872 2873 if (ret > 0) { 2874 ret = -ENOENT; 2875 goto out; 2876 } 2877 2878 leaf = path->nodes[0]; 2879 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2880 2881 btrfs_set_device_id(leaf, dev_item, device->devid); 2882 btrfs_set_device_type(leaf, dev_item, device->type); 2883 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2884 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2885 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2886 btrfs_set_device_total_bytes(leaf, dev_item, 2887 btrfs_device_get_disk_total_bytes(device)); 2888 btrfs_set_device_bytes_used(leaf, dev_item, 2889 btrfs_device_get_bytes_used(device)); 2890 btrfs_mark_buffer_dirty(leaf); 2891 2892 out: 2893 btrfs_free_path(path); 2894 return ret; 2895 } 2896 2897 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2898 struct btrfs_device *device, u64 new_size) 2899 { 2900 struct btrfs_fs_info *fs_info = device->fs_info; 2901 struct btrfs_super_block *super_copy = fs_info->super_copy; 2902 u64 old_total; 2903 u64 diff; 2904 int ret; 2905 2906 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2907 return -EACCES; 2908 2909 new_size = round_down(new_size, fs_info->sectorsize); 2910 2911 mutex_lock(&fs_info->chunk_mutex); 2912 old_total = btrfs_super_total_bytes(super_copy); 2913 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2914 2915 if (new_size <= device->total_bytes || 2916 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2917 
mutex_unlock(&fs_info->chunk_mutex); 2918 return -EINVAL; 2919 } 2920 2921 btrfs_set_super_total_bytes(super_copy, 2922 round_down(old_total + diff, fs_info->sectorsize)); 2923 device->fs_devices->total_rw_bytes += diff; 2924 2925 btrfs_device_set_total_bytes(device, new_size); 2926 btrfs_device_set_disk_total_bytes(device, new_size); 2927 btrfs_clear_space_info_full(device->fs_info); 2928 if (list_empty(&device->post_commit_list)) 2929 list_add_tail(&device->post_commit_list, 2930 &trans->transaction->dev_update_list); 2931 mutex_unlock(&fs_info->chunk_mutex); 2932 2933 btrfs_reserve_chunk_metadata(trans, false); 2934 ret = btrfs_update_device(trans, device); 2935 btrfs_trans_release_chunk_metadata(trans); 2936 2937 return ret; 2938 } 2939 2940 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2941 { 2942 struct btrfs_fs_info *fs_info = trans->fs_info; 2943 struct btrfs_root *root = fs_info->chunk_root; 2944 int ret; 2945 struct btrfs_path *path; 2946 struct btrfs_key key; 2947 2948 path = btrfs_alloc_path(); 2949 if (!path) 2950 return -ENOMEM; 2951 2952 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2953 key.offset = chunk_offset; 2954 key.type = BTRFS_CHUNK_ITEM_KEY; 2955 2956 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2957 if (ret < 0) 2958 goto out; 2959 else if (ret > 0) { /* Logic error or corruption */ 2960 btrfs_handle_fs_error(fs_info, -ENOENT, 2961 "Failed lookup while freeing chunk."); 2962 ret = -ENOENT; 2963 goto out; 2964 } 2965 2966 ret = btrfs_del_item(trans, root, path); 2967 if (ret < 0) 2968 btrfs_handle_fs_error(fs_info, ret, 2969 "Failed to delete chunk item."); 2970 out: 2971 btrfs_free_path(path); 2972 return ret; 2973 } 2974 2975 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2976 { 2977 struct btrfs_super_block *super_copy = fs_info->super_copy; 2978 struct btrfs_disk_key *disk_key; 2979 struct btrfs_chunk *chunk; 2980 u8 *ptr; 2981 int ret = 0; 2982 u32 num_stripes; 2983 u32 array_size; 2984 u32 len = 0; 2985 u32 cur; 2986 struct btrfs_key key; 2987 2988 lockdep_assert_held(&fs_info->chunk_mutex); 2989 array_size = btrfs_super_sys_array_size(super_copy); 2990 2991 ptr = super_copy->sys_chunk_array; 2992 cur = 0; 2993 2994 while (cur < array_size) { 2995 disk_key = (struct btrfs_disk_key *)ptr; 2996 btrfs_disk_key_to_cpu(&key, disk_key); 2997 2998 len = sizeof(*disk_key); 2999 3000 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 3001 chunk = (struct btrfs_chunk *)(ptr + len); 3002 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 3003 len += btrfs_chunk_item_size(num_stripes); 3004 } else { 3005 ret = -EIO; 3006 break; 3007 } 3008 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 3009 key.offset == chunk_offset) { 3010 memmove(ptr, ptr + len, array_size - (cur + len)); 3011 array_size -= len; 3012 btrfs_set_super_sys_array_size(super_copy, array_size); 3013 } else { 3014 ptr += len; 3015 cur += len; 3016 } 3017 } 3018 return ret; 3019 } 3020 3021 /* 3022 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. 3023 * @logical: Logical block offset in bytes. 3024 * @length: Length of extent in bytes. 3025 * 3026 * Return: Chunk mapping or ERR_PTR. 
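*
* The returned mapping holds a reference; callers must drop it with
* free_extent_map() once they are done, e.g. (the same pattern as
* btrfs_remove_chunk() below):
*
*	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
*	if (IS_ERR(em))
*		return PTR_ERR(em);
*	map = em->map_lookup;
*	...
*	free_extent_map(em);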
3027 */ 3028 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 3029 u64 logical, u64 length) 3030 { 3031 struct extent_map_tree *em_tree; 3032 struct extent_map *em; 3033 3034 em_tree = &fs_info->mapping_tree; 3035 read_lock(&em_tree->lock); 3036 em = lookup_extent_mapping(em_tree, logical, length); 3037 read_unlock(&em_tree->lock); 3038 3039 if (!em) { 3040 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 3041 logical, length); 3042 return ERR_PTR(-EINVAL); 3043 } 3044 3045 if (em->start > logical || em->start + em->len < logical) { 3046 btrfs_crit(fs_info, 3047 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 3048 logical, length, em->start, em->start + em->len); 3049 free_extent_map(em); 3050 return ERR_PTR(-EINVAL); 3051 } 3052 3053 /* callers are responsible for dropping em's ref. */ 3054 return em; 3055 } 3056 3057 static int remove_chunk_item(struct btrfs_trans_handle *trans, 3058 struct map_lookup *map, u64 chunk_offset) 3059 { 3060 int i; 3061 3062 /* 3063 * Removing chunk items and updating the device items in the chunks btree 3064 * requires holding the chunk_mutex. 3065 * See the comment at btrfs_chunk_alloc() for the details. 3066 */ 3067 lockdep_assert_held(&trans->fs_info->chunk_mutex); 3068 3069 for (i = 0; i < map->num_stripes; i++) { 3070 int ret; 3071 3072 ret = btrfs_update_device(trans, map->stripes[i].dev); 3073 if (ret) 3074 return ret; 3075 } 3076 3077 return btrfs_free_chunk(trans, chunk_offset); 3078 } 3079 3080 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 3081 { 3082 struct btrfs_fs_info *fs_info = trans->fs_info; 3083 struct extent_map *em; 3084 struct map_lookup *map; 3085 u64 dev_extent_len = 0; 3086 int i, ret = 0; 3087 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 3088 3089 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 3090 if (IS_ERR(em)) { 3091 /* 3092 * This is a logic error, but we don't want to just rely on the 3093 * user having built with ASSERT enabled, so if ASSERT doesn't 3094 * do anything we still error out. 3095 */ 3096 ASSERT(0); 3097 return PTR_ERR(em); 3098 } 3099 map = em->map_lookup; 3100 3101 /* 3102 * First delete the device extent items from the devices btree. 3103 * We take the device_list_mutex to avoid racing with the finishing phase 3104 * of a device replace operation. See the comment below before acquiring 3105 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex 3106 * because that can result in a deadlock when deleting the device extent 3107 * items from the devices btree - COWing an extent buffer from the btree 3108 * may result in allocating a new metadata chunk, which would attempt to 3109 * lock again fs_info->chunk_mutex. 
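* (The chunk_mutex is taken further below, only after all the device extent
* items have been removed; see the next comment for the two reasons.)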
3110 */ 3111 mutex_lock(&fs_devices->device_list_mutex); 3112 for (i = 0; i < map->num_stripes; i++) { 3113 struct btrfs_device *device = map->stripes[i].dev; 3114 ret = btrfs_free_dev_extent(trans, device, 3115 map->stripes[i].physical, 3116 &dev_extent_len); 3117 if (ret) { 3118 mutex_unlock(&fs_devices->device_list_mutex); 3119 btrfs_abort_transaction(trans, ret); 3120 goto out; 3121 } 3122 3123 if (device->bytes_used > 0) { 3124 mutex_lock(&fs_info->chunk_mutex); 3125 btrfs_device_set_bytes_used(device, 3126 device->bytes_used - dev_extent_len); 3127 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 3128 btrfs_clear_space_info_full(fs_info); 3129 mutex_unlock(&fs_info->chunk_mutex); 3130 } 3131 } 3132 mutex_unlock(&fs_devices->device_list_mutex); 3133 3134 /* 3135 * We acquire fs_info->chunk_mutex for 2 reasons: 3136 * 3137 * 1) Just like with the first phase of the chunk allocation, we must 3138 * reserve system space, do all chunk btree updates and deletions, and 3139 * update the system chunk array in the superblock while holding this 3140 * mutex. This is for similar reasons as explained on the comment at 3141 * the top of btrfs_chunk_alloc(); 3142 * 3143 * 2) Prevent races with the final phase of a device replace operation 3144 * that replaces the device object associated with the map's stripes, 3145 * because the device object's id can change at any time during that 3146 * final phase of the device replace operation 3147 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 3148 * replaced device and then see it with an ID of 3149 * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating 3150 * the device item, which does not exists on the chunk btree. 3151 * The finishing phase of device replace acquires both the 3152 * device_list_mutex and the chunk_mutex, in that order, so we are 3153 * safe by just acquiring the chunk_mutex. 3154 */ 3155 trans->removing_chunk = true; 3156 mutex_lock(&fs_info->chunk_mutex); 3157 3158 check_system_chunk(trans, map->type); 3159 3160 ret = remove_chunk_item(trans, map, chunk_offset); 3161 /* 3162 * Normally we should not get -ENOSPC since we reserved space before 3163 * through the call to check_system_chunk(). 3164 * 3165 * Despite our system space_info having enough free space, we may not 3166 * be able to allocate extents from its block groups, because all have 3167 * an incompatible profile, which will force us to allocate a new system 3168 * block group with the right profile, or right after we called 3169 * check_system_space() above, a scrub turned the only system block group 3170 * with enough free space into RO mode. 3171 * This is explained with more detail at do_chunk_alloc(). 3172 * 3173 * So if we get -ENOSPC, allocate a new system chunk and retry once. 
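* (The reservation referred to above is the one made via check_system_chunk();
* the fallback below creates a fresh SYSTEM block group with
* btrfs_create_chunk(), inserts its chunk item and then retries the removal.)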
3174 */ 3175 if (ret == -ENOSPC) { 3176 const u64 sys_flags = btrfs_system_alloc_profile(fs_info); 3177 struct btrfs_block_group *sys_bg; 3178 3179 sys_bg = btrfs_create_chunk(trans, sys_flags); 3180 if (IS_ERR(sys_bg)) { 3181 ret = PTR_ERR(sys_bg); 3182 btrfs_abort_transaction(trans, ret); 3183 goto out; 3184 } 3185 3186 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 3187 if (ret) { 3188 btrfs_abort_transaction(trans, ret); 3189 goto out; 3190 } 3191 3192 ret = remove_chunk_item(trans, map, chunk_offset); 3193 if (ret) { 3194 btrfs_abort_transaction(trans, ret); 3195 goto out; 3196 } 3197 } else if (ret) { 3198 btrfs_abort_transaction(trans, ret); 3199 goto out; 3200 } 3201 3202 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 3203 3204 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3205 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 3206 if (ret) { 3207 btrfs_abort_transaction(trans, ret); 3208 goto out; 3209 } 3210 } 3211 3212 mutex_unlock(&fs_info->chunk_mutex); 3213 trans->removing_chunk = false; 3214 3215 /* 3216 * We are done with chunk btree updates and deletions, so release the 3217 * system space we previously reserved (with check_system_chunk()). 3218 */ 3219 btrfs_trans_release_chunk_metadata(trans); 3220 3221 ret = btrfs_remove_block_group(trans, chunk_offset, em); 3222 if (ret) { 3223 btrfs_abort_transaction(trans, ret); 3224 goto out; 3225 } 3226 3227 out: 3228 if (trans->removing_chunk) { 3229 mutex_unlock(&fs_info->chunk_mutex); 3230 trans->removing_chunk = false; 3231 } 3232 /* once for us */ 3233 free_extent_map(em); 3234 return ret; 3235 } 3236 3237 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3238 { 3239 struct btrfs_root *root = fs_info->chunk_root; 3240 struct btrfs_trans_handle *trans; 3241 struct btrfs_block_group *block_group; 3242 u64 length; 3243 int ret; 3244 3245 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 3246 btrfs_err(fs_info, 3247 "relocate: not supported on extent tree v2 yet"); 3248 return -EINVAL; 3249 } 3250 3251 /* 3252 * Prevent races with automatic removal of unused block groups. 3253 * After we relocate and before we remove the chunk with offset 3254 * chunk_offset, automatic removal of the block group can kick in, 3255 * resulting in a failure when calling btrfs_remove_chunk() below. 3256 * 3257 * Make sure to acquire this mutex before doing a tree search (dev 3258 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 3259 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 3260 * we release the path used to search the chunk/dev tree and before 3261 * the current task acquires this mutex and calls us. 3262 */ 3263 lockdep_assert_held(&fs_info->reclaim_bgs_lock); 3264 3265 /* step one, relocate all the extents inside this chunk */ 3266 btrfs_scrub_pause(fs_info); 3267 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 3268 btrfs_scrub_continue(fs_info); 3269 if (ret) 3270 return ret; 3271 3272 block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 3273 if (!block_group) 3274 return -ENOENT; 3275 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 3276 length = block_group->length; 3277 btrfs_put_block_group(block_group); 3278 3279 /* 3280 * On a zoned file system, discard the whole block group, this will 3281 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If 3282 * resetting the zone fails, don't treat it as a fatal problem from the 3283 * filesystem's point of view. 
3284 */ 3285 if (btrfs_is_zoned(fs_info)) { 3286 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); 3287 if (ret) 3288 btrfs_info(fs_info, 3289 "failed to reset zone %llu after relocation", 3290 chunk_offset); 3291 } 3292 3293 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3294 chunk_offset); 3295 if (IS_ERR(trans)) { 3296 ret = PTR_ERR(trans); 3297 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3298 return ret; 3299 } 3300 3301 /* 3302 * step two, delete the device extents and the 3303 * chunk tree entries 3304 */ 3305 ret = btrfs_remove_chunk(trans, chunk_offset); 3306 btrfs_end_transaction(trans); 3307 return ret; 3308 } 3309 3310 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3311 { 3312 struct btrfs_root *chunk_root = fs_info->chunk_root; 3313 struct btrfs_path *path; 3314 struct extent_buffer *leaf; 3315 struct btrfs_chunk *chunk; 3316 struct btrfs_key key; 3317 struct btrfs_key found_key; 3318 u64 chunk_type; 3319 bool retried = false; 3320 int failed = 0; 3321 int ret; 3322 3323 path = btrfs_alloc_path(); 3324 if (!path) 3325 return -ENOMEM; 3326 3327 again: 3328 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3329 key.offset = (u64)-1; 3330 key.type = BTRFS_CHUNK_ITEM_KEY; 3331 3332 while (1) { 3333 mutex_lock(&fs_info->reclaim_bgs_lock); 3334 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3335 if (ret < 0) { 3336 mutex_unlock(&fs_info->reclaim_bgs_lock); 3337 goto error; 3338 } 3339 BUG_ON(ret == 0); /* Corruption */ 3340 3341 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3342 key.type); 3343 if (ret) 3344 mutex_unlock(&fs_info->reclaim_bgs_lock); 3345 if (ret < 0) 3346 goto error; 3347 if (ret > 0) 3348 break; 3349 3350 leaf = path->nodes[0]; 3351 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3352 3353 chunk = btrfs_item_ptr(leaf, path->slots[0], 3354 struct btrfs_chunk); 3355 chunk_type = btrfs_chunk_type(leaf, chunk); 3356 btrfs_release_path(path); 3357 3358 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3359 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3360 if (ret == -ENOSPC) 3361 failed++; 3362 else 3363 BUG_ON(ret); 3364 } 3365 mutex_unlock(&fs_info->reclaim_bgs_lock); 3366 3367 if (found_key.offset == 0) 3368 break; 3369 key.offset = found_key.offset - 1; 3370 } 3371 ret = 0; 3372 if (failed && !retried) { 3373 failed = 0; 3374 retried = true; 3375 goto again; 3376 } else if (WARN_ON(failed && retried)) { 3377 ret = -ENOSPC; 3378 } 3379 error: 3380 btrfs_free_path(path); 3381 return ret; 3382 } 3383 3384 /* 3385 * return 1 : allocate a data chunk successfully, 3386 * return <0: errors during allocating a data chunk, 3387 * return 0 : no need to allocate a data chunk. 
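*
* Called before relocating a chunk: if the chunk at @chunk_offset is a data
* chunk and no data bytes are currently in use, an empty data chunk is
* allocated up front so that relocating the last data chunk does not lose
* the data RAID profile (see the comment in __btrfs_balance() further below).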
3388 */ 3389 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3390 u64 chunk_offset) 3391 { 3392 struct btrfs_block_group *cache; 3393 u64 bytes_used; 3394 u64 chunk_type; 3395 3396 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3397 ASSERT(cache); 3398 chunk_type = cache->flags; 3399 btrfs_put_block_group(cache); 3400 3401 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 3402 return 0; 3403 3404 spin_lock(&fs_info->data_sinfo->lock); 3405 bytes_used = fs_info->data_sinfo->bytes_used; 3406 spin_unlock(&fs_info->data_sinfo->lock); 3407 3408 if (!bytes_used) { 3409 struct btrfs_trans_handle *trans; 3410 int ret; 3411 3412 trans = btrfs_join_transaction(fs_info->tree_root); 3413 if (IS_ERR(trans)) 3414 return PTR_ERR(trans); 3415 3416 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 3417 btrfs_end_transaction(trans); 3418 if (ret < 0) 3419 return ret; 3420 return 1; 3421 } 3422 3423 return 0; 3424 } 3425 3426 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3427 struct btrfs_balance_control *bctl) 3428 { 3429 struct btrfs_root *root = fs_info->tree_root; 3430 struct btrfs_trans_handle *trans; 3431 struct btrfs_balance_item *item; 3432 struct btrfs_disk_balance_args disk_bargs; 3433 struct btrfs_path *path; 3434 struct extent_buffer *leaf; 3435 struct btrfs_key key; 3436 int ret, err; 3437 3438 path = btrfs_alloc_path(); 3439 if (!path) 3440 return -ENOMEM; 3441 3442 trans = btrfs_start_transaction(root, 0); 3443 if (IS_ERR(trans)) { 3444 btrfs_free_path(path); 3445 return PTR_ERR(trans); 3446 } 3447 3448 key.objectid = BTRFS_BALANCE_OBJECTID; 3449 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3450 key.offset = 0; 3451 3452 ret = btrfs_insert_empty_item(trans, root, path, &key, 3453 sizeof(*item)); 3454 if (ret) 3455 goto out; 3456 3457 leaf = path->nodes[0]; 3458 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3459 3460 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3461 3462 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3463 btrfs_set_balance_data(leaf, item, &disk_bargs); 3464 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3465 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3466 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3467 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3468 3469 btrfs_set_balance_flags(leaf, item, bctl->flags); 3470 3471 btrfs_mark_buffer_dirty(leaf); 3472 out: 3473 btrfs_free_path(path); 3474 err = btrfs_commit_transaction(trans); 3475 if (err && !ret) 3476 ret = err; 3477 return ret; 3478 } 3479 3480 static int del_balance_item(struct btrfs_fs_info *fs_info) 3481 { 3482 struct btrfs_root *root = fs_info->tree_root; 3483 struct btrfs_trans_handle *trans; 3484 struct btrfs_path *path; 3485 struct btrfs_key key; 3486 int ret, err; 3487 3488 path = btrfs_alloc_path(); 3489 if (!path) 3490 return -ENOMEM; 3491 3492 trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 3493 if (IS_ERR(trans)) { 3494 btrfs_free_path(path); 3495 return PTR_ERR(trans); 3496 } 3497 3498 key.objectid = BTRFS_BALANCE_OBJECTID; 3499 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3500 key.offset = 0; 3501 3502 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3503 if (ret < 0) 3504 goto out; 3505 if (ret > 0) { 3506 ret = -ENOENT; 3507 goto out; 3508 } 3509 3510 ret = btrfs_del_item(trans, root, path); 3511 out: 3512 btrfs_free_path(path); 3513 err = btrfs_commit_transaction(trans); 3514 if (err && !ret) 3515 ret = err; 3516 return ret; 3517 } 3518 3519 /* 3520 * This is a 
heuristic used to reduce the number of chunks balanced on 3521 * resume after balance was interrupted. 3522 */ 3523 static void update_balance_args(struct btrfs_balance_control *bctl) 3524 { 3525 /* 3526 * Turn on soft mode for chunk types that were being converted. 3527 */ 3528 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3529 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3530 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3531 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3532 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3533 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3534 3535 /* 3536 * Turn on usage filter if is not already used. The idea is 3537 * that chunks that we have already balanced should be 3538 * reasonably full. Don't do it for chunks that are being 3539 * converted - that will keep us from relocating unconverted 3540 * (albeit full) chunks. 3541 */ 3542 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3543 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3544 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3545 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3546 bctl->data.usage = 90; 3547 } 3548 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3549 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3550 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3551 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3552 bctl->sys.usage = 90; 3553 } 3554 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3555 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3556 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3557 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3558 bctl->meta.usage = 90; 3559 } 3560 } 3561 3562 /* 3563 * Clear the balance status in fs_info and delete the balance item from disk. 3564 */ 3565 static void reset_balance_state(struct btrfs_fs_info *fs_info) 3566 { 3567 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3568 int ret; 3569 3570 BUG_ON(!fs_info->balance_ctl); 3571 3572 spin_lock(&fs_info->balance_lock); 3573 fs_info->balance_ctl = NULL; 3574 spin_unlock(&fs_info->balance_lock); 3575 3576 kfree(bctl); 3577 ret = del_balance_item(fs_info); 3578 if (ret) 3579 btrfs_handle_fs_error(fs_info, ret, NULL); 3580 } 3581 3582 /* 3583 * Balance filters. Return 1 if chunk should be filtered out 3584 * (should not be balanced). 
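*
* Each filter inspects one property of a chunk against the per-type balance
* args (bctl->data / bctl->meta / bctl->sys); should_balance_chunk() below
* combines them, consulting a filter only when its BTRFS_BALANCE_ARGS_* flag
* is set. For example, for the profiles filter:
*
*	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
*	    chunk_profiles_filter(chunk_type, bargs))
*		return 0;	/* filtered out, do not balance this chunk */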
3585 */ 3586 static int chunk_profiles_filter(u64 chunk_type, 3587 struct btrfs_balance_args *bargs) 3588 { 3589 chunk_type = chunk_to_extended(chunk_type) & 3590 BTRFS_EXTENDED_PROFILE_MASK; 3591 3592 if (bargs->profiles & chunk_type) 3593 return 0; 3594 3595 return 1; 3596 } 3597 3598 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3599 struct btrfs_balance_args *bargs) 3600 { 3601 struct btrfs_block_group *cache; 3602 u64 chunk_used; 3603 u64 user_thresh_min; 3604 u64 user_thresh_max; 3605 int ret = 1; 3606 3607 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3608 chunk_used = cache->used; 3609 3610 if (bargs->usage_min == 0) 3611 user_thresh_min = 0; 3612 else 3613 user_thresh_min = mult_perc(cache->length, bargs->usage_min); 3614 3615 if (bargs->usage_max == 0) 3616 user_thresh_max = 1; 3617 else if (bargs->usage_max > 100) 3618 user_thresh_max = cache->length; 3619 else 3620 user_thresh_max = mult_perc(cache->length, bargs->usage_max); 3621 3622 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3623 ret = 0; 3624 3625 btrfs_put_block_group(cache); 3626 return ret; 3627 } 3628 3629 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3630 u64 chunk_offset, struct btrfs_balance_args *bargs) 3631 { 3632 struct btrfs_block_group *cache; 3633 u64 chunk_used, user_thresh; 3634 int ret = 1; 3635 3636 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3637 chunk_used = cache->used; 3638 3639 if (bargs->usage_min == 0) 3640 user_thresh = 1; 3641 else if (bargs->usage > 100) 3642 user_thresh = cache->length; 3643 else 3644 user_thresh = mult_perc(cache->length, bargs->usage); 3645 3646 if (chunk_used < user_thresh) 3647 ret = 0; 3648 3649 btrfs_put_block_group(cache); 3650 return ret; 3651 } 3652 3653 static int chunk_devid_filter(struct extent_buffer *leaf, 3654 struct btrfs_chunk *chunk, 3655 struct btrfs_balance_args *bargs) 3656 { 3657 struct btrfs_stripe *stripe; 3658 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3659 int i; 3660 3661 for (i = 0; i < num_stripes; i++) { 3662 stripe = btrfs_stripe_nr(chunk, i); 3663 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3664 return 0; 3665 } 3666 3667 return 1; 3668 } 3669 3670 static u64 calc_data_stripes(u64 type, int num_stripes) 3671 { 3672 const int index = btrfs_bg_flags_to_raid_index(type); 3673 const int ncopies = btrfs_raid_array[index].ncopies; 3674 const int nparity = btrfs_raid_array[index].nparity; 3675 3676 return (num_stripes - nparity) / ncopies; 3677 } 3678 3679 /* [pstart, pend) */ 3680 static int chunk_drange_filter(struct extent_buffer *leaf, 3681 struct btrfs_chunk *chunk, 3682 struct btrfs_balance_args *bargs) 3683 { 3684 struct btrfs_stripe *stripe; 3685 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3686 u64 stripe_offset; 3687 u64 stripe_length; 3688 u64 type; 3689 int factor; 3690 int i; 3691 3692 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3693 return 0; 3694 3695 type = btrfs_chunk_type(leaf, chunk); 3696 factor = calc_data_stripes(type, num_stripes); 3697 3698 for (i = 0; i < num_stripes; i++) { 3699 stripe = btrfs_stripe_nr(chunk, i); 3700 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3701 continue; 3702 3703 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3704 stripe_length = btrfs_chunk_length(leaf, chunk); 3705 stripe_length = div_u64(stripe_length, factor); 3706 3707 if (stripe_offset < bargs->pend && 3708 stripe_offset + stripe_length > bargs->pstart) 3709 return 0; 3710 } 3711 3712 return 1; 3713 
} 3714 3715 /* [vstart, vend) */ 3716 static int chunk_vrange_filter(struct extent_buffer *leaf, 3717 struct btrfs_chunk *chunk, 3718 u64 chunk_offset, 3719 struct btrfs_balance_args *bargs) 3720 { 3721 if (chunk_offset < bargs->vend && 3722 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3723 /* at least part of the chunk is inside this vrange */ 3724 return 0; 3725 3726 return 1; 3727 } 3728 3729 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3730 struct btrfs_chunk *chunk, 3731 struct btrfs_balance_args *bargs) 3732 { 3733 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3734 3735 if (bargs->stripes_min <= num_stripes 3736 && num_stripes <= bargs->stripes_max) 3737 return 0; 3738 3739 return 1; 3740 } 3741 3742 static int chunk_soft_convert_filter(u64 chunk_type, 3743 struct btrfs_balance_args *bargs) 3744 { 3745 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3746 return 0; 3747 3748 chunk_type = chunk_to_extended(chunk_type) & 3749 BTRFS_EXTENDED_PROFILE_MASK; 3750 3751 if (bargs->target == chunk_type) 3752 return 1; 3753 3754 return 0; 3755 } 3756 3757 static int should_balance_chunk(struct extent_buffer *leaf, 3758 struct btrfs_chunk *chunk, u64 chunk_offset) 3759 { 3760 struct btrfs_fs_info *fs_info = leaf->fs_info; 3761 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3762 struct btrfs_balance_args *bargs = NULL; 3763 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3764 3765 /* type filter */ 3766 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3767 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3768 return 0; 3769 } 3770 3771 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3772 bargs = &bctl->data; 3773 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3774 bargs = &bctl->sys; 3775 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3776 bargs = &bctl->meta; 3777 3778 /* profiles filter */ 3779 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3780 chunk_profiles_filter(chunk_type, bargs)) { 3781 return 0; 3782 } 3783 3784 /* usage filter */ 3785 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3786 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3787 return 0; 3788 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3789 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3790 return 0; 3791 } 3792 3793 /* devid filter */ 3794 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3795 chunk_devid_filter(leaf, chunk, bargs)) { 3796 return 0; 3797 } 3798 3799 /* drange filter, makes sense only with devid filter */ 3800 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3801 chunk_drange_filter(leaf, chunk, bargs)) { 3802 return 0; 3803 } 3804 3805 /* vrange filter */ 3806 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3807 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3808 return 0; 3809 } 3810 3811 /* stripes filter */ 3812 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3813 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3814 return 0; 3815 } 3816 3817 /* soft profile changing mode */ 3818 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3819 chunk_soft_convert_filter(chunk_type, bargs)) { 3820 return 0; 3821 } 3822 3823 /* 3824 * limited by count, must be the last filter 3825 */ 3826 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3827 if (bargs->limit == 0) 3828 return 0; 3829 else 3830 bargs->limit--; 3831 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3832 /* 3833 * Same logic as the 'limit' filter; the minimum cannot be 3834 * determined here because we do not have the global 
information 3835 * about the count of all chunks that satisfy the filters. 3836 */ 3837 if (bargs->limit_max == 0) 3838 return 0; 3839 else 3840 bargs->limit_max--; 3841 } 3842 3843 return 1; 3844 } 3845 3846 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3847 { 3848 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3849 struct btrfs_root *chunk_root = fs_info->chunk_root; 3850 u64 chunk_type; 3851 struct btrfs_chunk *chunk; 3852 struct btrfs_path *path = NULL; 3853 struct btrfs_key key; 3854 struct btrfs_key found_key; 3855 struct extent_buffer *leaf; 3856 int slot; 3857 int ret; 3858 int enospc_errors = 0; 3859 bool counting = true; 3860 /* The single value limit and min/max limits use the same bytes in the */ 3861 u64 limit_data = bctl->data.limit; 3862 u64 limit_meta = bctl->meta.limit; 3863 u64 limit_sys = bctl->sys.limit; 3864 u32 count_data = 0; 3865 u32 count_meta = 0; 3866 u32 count_sys = 0; 3867 int chunk_reserved = 0; 3868 3869 path = btrfs_alloc_path(); 3870 if (!path) { 3871 ret = -ENOMEM; 3872 goto error; 3873 } 3874 3875 /* zero out stat counters */ 3876 spin_lock(&fs_info->balance_lock); 3877 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3878 spin_unlock(&fs_info->balance_lock); 3879 again: 3880 if (!counting) { 3881 /* 3882 * The single value limit and min/max limits use the same bytes 3883 * in the 3884 */ 3885 bctl->data.limit = limit_data; 3886 bctl->meta.limit = limit_meta; 3887 bctl->sys.limit = limit_sys; 3888 } 3889 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3890 key.offset = (u64)-1; 3891 key.type = BTRFS_CHUNK_ITEM_KEY; 3892 3893 while (1) { 3894 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3895 atomic_read(&fs_info->balance_cancel_req)) { 3896 ret = -ECANCELED; 3897 goto error; 3898 } 3899 3900 mutex_lock(&fs_info->reclaim_bgs_lock); 3901 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3902 if (ret < 0) { 3903 mutex_unlock(&fs_info->reclaim_bgs_lock); 3904 goto error; 3905 } 3906 3907 /* 3908 * this shouldn't happen, it means the last relocate 3909 * failed 3910 */ 3911 if (ret == 0) 3912 BUG(); /* FIXME break ? 
*/ 3913 3914 ret = btrfs_previous_item(chunk_root, path, 0, 3915 BTRFS_CHUNK_ITEM_KEY); 3916 if (ret) { 3917 mutex_unlock(&fs_info->reclaim_bgs_lock); 3918 ret = 0; 3919 break; 3920 } 3921 3922 leaf = path->nodes[0]; 3923 slot = path->slots[0]; 3924 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3925 3926 if (found_key.objectid != key.objectid) { 3927 mutex_unlock(&fs_info->reclaim_bgs_lock); 3928 break; 3929 } 3930 3931 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3932 chunk_type = btrfs_chunk_type(leaf, chunk); 3933 3934 if (!counting) { 3935 spin_lock(&fs_info->balance_lock); 3936 bctl->stat.considered++; 3937 spin_unlock(&fs_info->balance_lock); 3938 } 3939 3940 ret = should_balance_chunk(leaf, chunk, found_key.offset); 3941 3942 btrfs_release_path(path); 3943 if (!ret) { 3944 mutex_unlock(&fs_info->reclaim_bgs_lock); 3945 goto loop; 3946 } 3947 3948 if (counting) { 3949 mutex_unlock(&fs_info->reclaim_bgs_lock); 3950 spin_lock(&fs_info->balance_lock); 3951 bctl->stat.expected++; 3952 spin_unlock(&fs_info->balance_lock); 3953 3954 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3955 count_data++; 3956 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3957 count_sys++; 3958 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3959 count_meta++; 3960 3961 goto loop; 3962 } 3963 3964 /* 3965 * Apply limit_min filter, no need to check if the LIMITS 3966 * filter is used, limit_min is 0 by default 3967 */ 3968 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3969 count_data < bctl->data.limit_min) 3970 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3971 count_meta < bctl->meta.limit_min) 3972 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3973 count_sys < bctl->sys.limit_min)) { 3974 mutex_unlock(&fs_info->reclaim_bgs_lock); 3975 goto loop; 3976 } 3977 3978 if (!chunk_reserved) { 3979 /* 3980 * We may be relocating the only data chunk we have, 3981 * which could potentially end up with losing data's 3982 * raid profile, so lets allocate an empty one in 3983 * advance. 3984 */ 3985 ret = btrfs_may_alloc_data_chunk(fs_info, 3986 found_key.offset); 3987 if (ret < 0) { 3988 mutex_unlock(&fs_info->reclaim_bgs_lock); 3989 goto error; 3990 } else if (ret == 1) { 3991 chunk_reserved = 1; 3992 } 3993 } 3994 3995 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3996 mutex_unlock(&fs_info->reclaim_bgs_lock); 3997 if (ret == -ENOSPC) { 3998 enospc_errors++; 3999 } else if (ret == -ETXTBSY) { 4000 btrfs_info(fs_info, 4001 "skipping relocation of block group %llu due to active swapfile", 4002 found_key.offset); 4003 ret = 0; 4004 } else if (ret) { 4005 goto error; 4006 } else { 4007 spin_lock(&fs_info->balance_lock); 4008 bctl->stat.completed++; 4009 spin_unlock(&fs_info->balance_lock); 4010 } 4011 loop: 4012 if (found_key.offset == 0) 4013 break; 4014 key.offset = found_key.offset - 1; 4015 } 4016 4017 if (counting) { 4018 btrfs_release_path(path); 4019 counting = false; 4020 goto again; 4021 } 4022 error: 4023 btrfs_free_path(path); 4024 if (enospc_errors) { 4025 btrfs_info(fs_info, "%d enospc errors during balance", 4026 enospc_errors); 4027 if (!ret) 4028 ret = -ENOSPC; 4029 } 4030 4031 return ret; 4032 } 4033 4034 /* 4035 * See if a given profile is valid and reduced. 4036 * 4037 * @flags: profile to validate 4038 * @extended: if true @flags is treated as an extended profile 4039 */ 4040 static int alloc_profile_is_valid(u64 flags, int extended) 4041 { 4042 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 4043 BTRFS_BLOCK_GROUP_PROFILE_MASK); 4044 4045 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 4046 4047 /* 1) check that all other bits are zeroed */ 4048 if (flags & ~mask) 4049 return 0; 4050 4051 /* 2) see if profile is reduced */ 4052 if (flags == 0) 4053 return !extended; /* "0" is valid for usual profiles */ 4054 4055 return has_single_bit_set(flags); 4056 } 4057 4058 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 4059 { 4060 /* cancel requested || normal exit path */ 4061 return atomic_read(&fs_info->balance_cancel_req) || 4062 (atomic_read(&fs_info->balance_pause_req) == 0 && 4063 atomic_read(&fs_info->balance_cancel_req) == 0); 4064 } 4065 4066 /* 4067 * Validate target profile against allowed profiles and return true if it's OK. 4068 * Otherwise print the error message and return false. 4069 */ 4070 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 4071 const struct btrfs_balance_args *bargs, 4072 u64 allowed, const char *type) 4073 { 4074 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 4075 return true; 4076 4077 /* Profile is valid and does not have bits outside of the allowed set */ 4078 if (alloc_profile_is_valid(bargs->target, 1) && 4079 (bargs->target & ~allowed) == 0) 4080 return true; 4081 4082 btrfs_err(fs_info, "balance: invalid convert %s profile %s", 4083 type, btrfs_bg_type_to_raid_name(bargs->target)); 4084 return false; 4085 } 4086 4087 /* 4088 * Fill @buf with textual description of balance filter flags @bargs, up to 4089 * @size_buf including the terminating null. The output may be trimmed if it 4090 * does not fit into the provided buffer. 4091 */ 4092 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 4093 u32 size_buf) 4094 { 4095 int ret; 4096 u32 size_bp = size_buf; 4097 char *bp = buf; 4098 u64 flags = bargs->flags; 4099 char tmp_buf[128] = {'\0'}; 4100 4101 if (!flags) 4102 return; 4103 4104 #define CHECK_APPEND_NOARG(a) \ 4105 do { \ 4106 ret = snprintf(bp, size_bp, (a)); \ 4107 if (ret < 0 || ret >= size_bp) \ 4108 goto out_overflow; \ 4109 size_bp -= ret; \ 4110 bp += ret; \ 4111 } while (0) 4112 4113 #define CHECK_APPEND_1ARG(a, v1) \ 4114 do { \ 4115 ret = snprintf(bp, size_bp, (a), (v1)); \ 4116 if (ret < 0 || ret >= size_bp) \ 4117 goto out_overflow; \ 4118 size_bp -= ret; \ 4119 bp += ret; \ 4120 } while (0) 4121 4122 #define CHECK_APPEND_2ARG(a, v1, v2) \ 4123 do { \ 4124 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 4125 if (ret < 0 || ret >= size_bp) \ 4126 goto out_overflow; \ 4127 size_bp -= ret; \ 4128 bp += ret; \ 4129 } while (0) 4130 4131 if (flags & BTRFS_BALANCE_ARGS_CONVERT) 4132 CHECK_APPEND_1ARG("convert=%s,", 4133 btrfs_bg_type_to_raid_name(bargs->target)); 4134 4135 if (flags & BTRFS_BALANCE_ARGS_SOFT) 4136 CHECK_APPEND_NOARG("soft,"); 4137 4138 if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 4139 btrfs_describe_block_groups(bargs->profiles, tmp_buf, 4140 sizeof(tmp_buf)); 4141 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 4142 } 4143 4144 if (flags & BTRFS_BALANCE_ARGS_USAGE) 4145 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 4146 4147 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 4148 CHECK_APPEND_2ARG("usage=%u..%u,", 4149 bargs->usage_min, bargs->usage_max); 4150 4151 if (flags & BTRFS_BALANCE_ARGS_DEVID) 4152 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 4153 4154 if (flags & BTRFS_BALANCE_ARGS_DRANGE) 4155 CHECK_APPEND_2ARG("drange=%llu..%llu,", 4156 bargs->pstart, bargs->pend); 4157 4158 if (flags & BTRFS_BALANCE_ARGS_VRANGE) 4159 
CHECK_APPEND_2ARG("vrange=%llu..%llu,", 4160 bargs->vstart, bargs->vend); 4161 4162 if (flags & BTRFS_BALANCE_ARGS_LIMIT) 4163 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 4164 4165 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 4166 CHECK_APPEND_2ARG("limit=%u..%u,", 4167 bargs->limit_min, bargs->limit_max); 4168 4169 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 4170 CHECK_APPEND_2ARG("stripes=%u..%u,", 4171 bargs->stripes_min, bargs->stripes_max); 4172 4173 #undef CHECK_APPEND_2ARG 4174 #undef CHECK_APPEND_1ARG 4175 #undef CHECK_APPEND_NOARG 4176 4177 out_overflow: 4178 4179 if (size_bp < size_buf) 4180 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 4181 else 4182 buf[0] = '\0'; 4183 } 4184 4185 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 4186 { 4187 u32 size_buf = 1024; 4188 char tmp_buf[192] = {'\0'}; 4189 char *buf; 4190 char *bp; 4191 u32 size_bp = size_buf; 4192 int ret; 4193 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4194 4195 buf = kzalloc(size_buf, GFP_KERNEL); 4196 if (!buf) 4197 return; 4198 4199 bp = buf; 4200 4201 #define CHECK_APPEND_1ARG(a, v1) \ 4202 do { \ 4203 ret = snprintf(bp, size_bp, (a), (v1)); \ 4204 if (ret < 0 || ret >= size_bp) \ 4205 goto out_overflow; \ 4206 size_bp -= ret; \ 4207 bp += ret; \ 4208 } while (0) 4209 4210 if (bctl->flags & BTRFS_BALANCE_FORCE) 4211 CHECK_APPEND_1ARG("%s", "-f "); 4212 4213 if (bctl->flags & BTRFS_BALANCE_DATA) { 4214 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 4215 CHECK_APPEND_1ARG("-d%s ", tmp_buf); 4216 } 4217 4218 if (bctl->flags & BTRFS_BALANCE_METADATA) { 4219 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); 4220 CHECK_APPEND_1ARG("-m%s ", tmp_buf); 4221 } 4222 4223 if (bctl->flags & BTRFS_BALANCE_SYSTEM) { 4224 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); 4225 CHECK_APPEND_1ARG("-s%s ", tmp_buf); 4226 } 4227 4228 #undef CHECK_APPEND_1ARG 4229 4230 out_overflow: 4231 4232 if (size_bp < size_buf) 4233 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ 4234 btrfs_info(fs_info, "balance: %s %s", 4235 (bctl->flags & BTRFS_BALANCE_RESUME) ? 4236 "resume" : "start", buf); 4237 4238 kfree(buf); 4239 } 4240 4241 /* 4242 * Should be called with balance mutexe held 4243 */ 4244 int btrfs_balance(struct btrfs_fs_info *fs_info, 4245 struct btrfs_balance_control *bctl, 4246 struct btrfs_ioctl_balance_args *bargs) 4247 { 4248 u64 meta_target, data_target; 4249 u64 allowed; 4250 int mixed = 0; 4251 int ret; 4252 u64 num_devices; 4253 unsigned seq; 4254 bool reducing_redundancy; 4255 int i; 4256 4257 if (btrfs_fs_closing(fs_info) || 4258 atomic_read(&fs_info->balance_pause_req) || 4259 btrfs_should_cancel_balance(fs_info)) { 4260 ret = -EINVAL; 4261 goto out; 4262 } 4263 4264 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 4265 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 4266 mixed = 1; 4267 4268 /* 4269 * In case of mixed groups both data and meta should be picked, 4270 * and identical options should be given for both of them. 
4271 */ 4272 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 4273 if (mixed && (bctl->flags & allowed)) { 4274 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 4275 !(bctl->flags & BTRFS_BALANCE_METADATA) || 4276 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 4277 btrfs_err(fs_info, 4278 "balance: mixed groups data and metadata options must be the same"); 4279 ret = -EINVAL; 4280 goto out; 4281 } 4282 } 4283 4284 /* 4285 * rw_devices will not change at the moment, device add/delete/replace 4286 * are exclusive 4287 */ 4288 num_devices = fs_info->fs_devices->rw_devices; 4289 4290 /* 4291 * SINGLE profile on-disk has no profile bit, but in-memory we have a 4292 * special bit for it, to make it easier to distinguish. Thus we need 4293 * to set it manually, or balance would refuse the profile. 4294 */ 4295 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 4296 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4297 if (num_devices >= btrfs_raid_array[i].devs_min) 4298 allowed |= btrfs_raid_array[i].bg_flag; 4299 4300 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 4301 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 4302 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 4303 ret = -EINVAL; 4304 goto out; 4305 } 4306 4307 /* 4308 * Allow to reduce metadata or system integrity only if force set for 4309 * profiles with redundancy (copies, parity) 4310 */ 4311 allowed = 0; 4312 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 4313 if (btrfs_raid_array[i].ncopies >= 2 || 4314 btrfs_raid_array[i].tolerated_failures >= 1) 4315 allowed |= btrfs_raid_array[i].bg_flag; 4316 } 4317 do { 4318 seq = read_seqbegin(&fs_info->profiles_lock); 4319 4320 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4321 (fs_info->avail_system_alloc_bits & allowed) && 4322 !(bctl->sys.target & allowed)) || 4323 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4324 (fs_info->avail_metadata_alloc_bits & allowed) && 4325 !(bctl->meta.target & allowed))) 4326 reducing_redundancy = true; 4327 else 4328 reducing_redundancy = false; 4329 4330 /* if we're not converting, the target field is uninitialized */ 4331 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4332 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 4333 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
4334 bctl->data.target : fs_info->avail_data_alloc_bits; 4335 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4336 4337 if (reducing_redundancy) { 4338 if (bctl->flags & BTRFS_BALANCE_FORCE) { 4339 btrfs_info(fs_info, 4340 "balance: force reducing metadata redundancy"); 4341 } else { 4342 btrfs_err(fs_info, 4343 "balance: reduces metadata redundancy, use --force if you want this"); 4344 ret = -EINVAL; 4345 goto out; 4346 } 4347 } 4348 4349 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4350 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4351 btrfs_warn(fs_info, 4352 "balance: metadata profile %s has lower redundancy than data profile %s", 4353 btrfs_bg_type_to_raid_name(meta_target), 4354 btrfs_bg_type_to_raid_name(data_target)); 4355 } 4356 4357 ret = insert_balance_item(fs_info, bctl); 4358 if (ret && ret != -EEXIST) 4359 goto out; 4360 4361 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 4362 BUG_ON(ret == -EEXIST); 4363 BUG_ON(fs_info->balance_ctl); 4364 spin_lock(&fs_info->balance_lock); 4365 fs_info->balance_ctl = bctl; 4366 spin_unlock(&fs_info->balance_lock); 4367 } else { 4368 BUG_ON(ret != -EEXIST); 4369 spin_lock(&fs_info->balance_lock); 4370 update_balance_args(bctl); 4371 spin_unlock(&fs_info->balance_lock); 4372 } 4373 4374 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4375 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4376 describe_balance_start_or_resume(fs_info); 4377 mutex_unlock(&fs_info->balance_mutex); 4378 4379 ret = __btrfs_balance(fs_info); 4380 4381 mutex_lock(&fs_info->balance_mutex); 4382 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { 4383 btrfs_info(fs_info, "balance: paused"); 4384 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); 4385 } 4386 /* 4387 * Balance can be canceled by: 4388 * 4389 * - Regular cancel request 4390 * Then ret == -ECANCELED and balance_cancel_req > 0 4391 * 4392 * - Fatal signal to "btrfs" process 4393 * Either the signal caught by wait_reserve_ticket() and callers 4394 * got -EINTR, or caught by btrfs_should_cancel_balance() and 4395 * got -ECANCELED. 4396 * Either way, in this case balance_cancel_req = 0, and 4397 * ret == -EINTR or ret == -ECANCELED. 4398 * 4399 * So here we only check the return value to catch canceled balance. 
4400 */ 4401 else if (ret == -ECANCELED || ret == -EINTR) 4402 btrfs_info(fs_info, "balance: canceled"); 4403 else 4404 btrfs_info(fs_info, "balance: ended with status: %d", ret); 4405 4406 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4407 4408 if (bargs) { 4409 memset(bargs, 0, sizeof(*bargs)); 4410 btrfs_update_ioctl_balance_args(fs_info, bargs); 4411 } 4412 4413 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4414 balance_need_close(fs_info)) { 4415 reset_balance_state(fs_info); 4416 btrfs_exclop_finish(fs_info); 4417 } 4418 4419 wake_up(&fs_info->balance_wait_q); 4420 4421 return ret; 4422 out: 4423 if (bctl->flags & BTRFS_BALANCE_RESUME) 4424 reset_balance_state(fs_info); 4425 else 4426 kfree(bctl); 4427 btrfs_exclop_finish(fs_info); 4428 4429 return ret; 4430 } 4431 4432 static int balance_kthread(void *data) 4433 { 4434 struct btrfs_fs_info *fs_info = data; 4435 int ret = 0; 4436 4437 sb_start_write(fs_info->sb); 4438 mutex_lock(&fs_info->balance_mutex); 4439 if (fs_info->balance_ctl) 4440 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 4441 mutex_unlock(&fs_info->balance_mutex); 4442 sb_end_write(fs_info->sb); 4443 4444 return ret; 4445 } 4446 4447 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4448 { 4449 struct task_struct *tsk; 4450 4451 mutex_lock(&fs_info->balance_mutex); 4452 if (!fs_info->balance_ctl) { 4453 mutex_unlock(&fs_info->balance_mutex); 4454 return 0; 4455 } 4456 mutex_unlock(&fs_info->balance_mutex); 4457 4458 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4459 btrfs_info(fs_info, "balance: resume skipped"); 4460 return 0; 4461 } 4462 4463 spin_lock(&fs_info->super_lock); 4464 ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); 4465 fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; 4466 spin_unlock(&fs_info->super_lock); 4467 /* 4468 * A ro->rw remount sequence should continue with the paused balance 4469 * regardless of who pauses it, system or the user as of now, so set 4470 * the resume flag. 
4471 */ 4472 spin_lock(&fs_info->balance_lock); 4473 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4474 spin_unlock(&fs_info->balance_lock); 4475 4476 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4477 return PTR_ERR_OR_ZERO(tsk); 4478 } 4479 4480 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4481 { 4482 struct btrfs_balance_control *bctl; 4483 struct btrfs_balance_item *item; 4484 struct btrfs_disk_balance_args disk_bargs; 4485 struct btrfs_path *path; 4486 struct extent_buffer *leaf; 4487 struct btrfs_key key; 4488 int ret; 4489 4490 path = btrfs_alloc_path(); 4491 if (!path) 4492 return -ENOMEM; 4493 4494 key.objectid = BTRFS_BALANCE_OBJECTID; 4495 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4496 key.offset = 0; 4497 4498 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4499 if (ret < 0) 4500 goto out; 4501 if (ret > 0) { /* ret = -ENOENT; */ 4502 ret = 0; 4503 goto out; 4504 } 4505 4506 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4507 if (!bctl) { 4508 ret = -ENOMEM; 4509 goto out; 4510 } 4511 4512 leaf = path->nodes[0]; 4513 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4514 4515 bctl->flags = btrfs_balance_flags(leaf, item); 4516 bctl->flags |= BTRFS_BALANCE_RESUME; 4517 4518 btrfs_balance_data(leaf, item, &disk_bargs); 4519 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4520 btrfs_balance_meta(leaf, item, &disk_bargs); 4521 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4522 btrfs_balance_sys(leaf, item, &disk_bargs); 4523 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4524 4525 /* 4526 * This should never happen, as the paused balance state is recovered 4527 * during mount without any chance of other exclusive ops to collide. 4528 * 4529 * This gives the exclusive op status to balance and keeps in paused 4530 * state until user intervention (cancel or umount). If the ownership 4531 * cannot be assigned, show a message but do not fail. The balance 4532 * is in a paused state and must have fs_info::balance_ctl properly 4533 * set up. 
4534 */ 4535 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) 4536 btrfs_warn(fs_info, 4537 "balance: cannot set exclusive op status, resume manually"); 4538 4539 btrfs_release_path(path); 4540 4541 mutex_lock(&fs_info->balance_mutex); 4542 BUG_ON(fs_info->balance_ctl); 4543 spin_lock(&fs_info->balance_lock); 4544 fs_info->balance_ctl = bctl; 4545 spin_unlock(&fs_info->balance_lock); 4546 mutex_unlock(&fs_info->balance_mutex); 4547 out: 4548 btrfs_free_path(path); 4549 return ret; 4550 } 4551 4552 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4553 { 4554 int ret = 0; 4555 4556 mutex_lock(&fs_info->balance_mutex); 4557 if (!fs_info->balance_ctl) { 4558 mutex_unlock(&fs_info->balance_mutex); 4559 return -ENOTCONN; 4560 } 4561 4562 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4563 atomic_inc(&fs_info->balance_pause_req); 4564 mutex_unlock(&fs_info->balance_mutex); 4565 4566 wait_event(fs_info->balance_wait_q, 4567 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4568 4569 mutex_lock(&fs_info->balance_mutex); 4570 /* we are good with balance_ctl ripped off from under us */ 4571 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4572 atomic_dec(&fs_info->balance_pause_req); 4573 } else { 4574 ret = -ENOTCONN; 4575 } 4576 4577 mutex_unlock(&fs_info->balance_mutex); 4578 return ret; 4579 } 4580 4581 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4582 { 4583 mutex_lock(&fs_info->balance_mutex); 4584 if (!fs_info->balance_ctl) { 4585 mutex_unlock(&fs_info->balance_mutex); 4586 return -ENOTCONN; 4587 } 4588 4589 /* 4590 * A paused balance with the item stored on disk can be resumed at 4591 * mount time if the mount is read-write. Otherwise it's still paused 4592 * and we must not allow cancelling as it deletes the item. 4593 */ 4594 if (sb_rdonly(fs_info->sb)) { 4595 mutex_unlock(&fs_info->balance_mutex); 4596 return -EROFS; 4597 } 4598 4599 atomic_inc(&fs_info->balance_cancel_req); 4600 /* 4601 * if we are running just wait and return, balance item is 4602 * deleted in btrfs_balance in this case 4603 */ 4604 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4605 mutex_unlock(&fs_info->balance_mutex); 4606 wait_event(fs_info->balance_wait_q, 4607 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4608 mutex_lock(&fs_info->balance_mutex); 4609 } else { 4610 mutex_unlock(&fs_info->balance_mutex); 4611 /* 4612 * Lock released to allow other waiters to continue, we'll 4613 * reexamine the status again. 
4614 */ 4615 mutex_lock(&fs_info->balance_mutex); 4616 4617 if (fs_info->balance_ctl) { 4618 reset_balance_state(fs_info); 4619 btrfs_exclop_finish(fs_info); 4620 btrfs_info(fs_info, "balance: canceled"); 4621 } 4622 } 4623 4624 BUG_ON(fs_info->balance_ctl || 4625 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4626 atomic_dec(&fs_info->balance_cancel_req); 4627 mutex_unlock(&fs_info->balance_mutex); 4628 return 0; 4629 } 4630 4631 int btrfs_uuid_scan_kthread(void *data) 4632 { 4633 struct btrfs_fs_info *fs_info = data; 4634 struct btrfs_root *root = fs_info->tree_root; 4635 struct btrfs_key key; 4636 struct btrfs_path *path = NULL; 4637 int ret = 0; 4638 struct extent_buffer *eb; 4639 int slot; 4640 struct btrfs_root_item root_item; 4641 u32 item_size; 4642 struct btrfs_trans_handle *trans = NULL; 4643 bool closing = false; 4644 4645 path = btrfs_alloc_path(); 4646 if (!path) { 4647 ret = -ENOMEM; 4648 goto out; 4649 } 4650 4651 key.objectid = 0; 4652 key.type = BTRFS_ROOT_ITEM_KEY; 4653 key.offset = 0; 4654 4655 while (1) { 4656 if (btrfs_fs_closing(fs_info)) { 4657 closing = true; 4658 break; 4659 } 4660 ret = btrfs_search_forward(root, &key, path, 4661 BTRFS_OLDEST_GENERATION); 4662 if (ret) { 4663 if (ret > 0) 4664 ret = 0; 4665 break; 4666 } 4667 4668 if (key.type != BTRFS_ROOT_ITEM_KEY || 4669 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4670 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4671 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4672 goto skip; 4673 4674 eb = path->nodes[0]; 4675 slot = path->slots[0]; 4676 item_size = btrfs_item_size(eb, slot); 4677 if (item_size < sizeof(root_item)) 4678 goto skip; 4679 4680 read_extent_buffer(eb, &root_item, 4681 btrfs_item_ptr_offset(eb, slot), 4682 (int)sizeof(root_item)); 4683 if (btrfs_root_refs(&root_item) == 0) 4684 goto skip; 4685 4686 if (!btrfs_is_empty_uuid(root_item.uuid) || 4687 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4688 if (trans) 4689 goto update_tree; 4690 4691 btrfs_release_path(path); 4692 /* 4693 * 1 - subvol uuid item 4694 * 1 - received_subvol uuid item 4695 */ 4696 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4697 if (IS_ERR(trans)) { 4698 ret = PTR_ERR(trans); 4699 break; 4700 } 4701 continue; 4702 } else { 4703 goto skip; 4704 } 4705 update_tree: 4706 btrfs_release_path(path); 4707 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4708 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4709 BTRFS_UUID_KEY_SUBVOL, 4710 key.objectid); 4711 if (ret < 0) { 4712 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4713 ret); 4714 break; 4715 } 4716 } 4717 4718 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4719 ret = btrfs_uuid_tree_add(trans, 4720 root_item.received_uuid, 4721 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4722 key.objectid); 4723 if (ret < 0) { 4724 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4725 ret); 4726 break; 4727 } 4728 } 4729 4730 skip: 4731 btrfs_release_path(path); 4732 if (trans) { 4733 ret = btrfs_end_transaction(trans); 4734 trans = NULL; 4735 if (ret) 4736 break; 4737 } 4738 4739 if (key.offset < (u64)-1) { 4740 key.offset++; 4741 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4742 key.offset = 0; 4743 key.type = BTRFS_ROOT_ITEM_KEY; 4744 } else if (key.objectid < (u64)-1) { 4745 key.offset = 0; 4746 key.type = BTRFS_ROOT_ITEM_KEY; 4747 key.objectid++; 4748 } else { 4749 break; 4750 } 4751 cond_resched(); 4752 } 4753 4754 out: 4755 btrfs_free_path(path); 4756 if (trans && !IS_ERR(trans)) 4757 btrfs_end_transaction(trans); 4758 if (ret) 4759 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread 
failed %d", ret); 4760 else if (!closing) 4761 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4762 up(&fs_info->uuid_tree_rescan_sem); 4763 return 0; 4764 } 4765 4766 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4767 { 4768 struct btrfs_trans_handle *trans; 4769 struct btrfs_root *tree_root = fs_info->tree_root; 4770 struct btrfs_root *uuid_root; 4771 struct task_struct *task; 4772 int ret; 4773 4774 /* 4775 * 1 - root node 4776 * 1 - root item 4777 */ 4778 trans = btrfs_start_transaction(tree_root, 2); 4779 if (IS_ERR(trans)) 4780 return PTR_ERR(trans); 4781 4782 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); 4783 if (IS_ERR(uuid_root)) { 4784 ret = PTR_ERR(uuid_root); 4785 btrfs_abort_transaction(trans, ret); 4786 btrfs_end_transaction(trans); 4787 return ret; 4788 } 4789 4790 fs_info->uuid_root = uuid_root; 4791 4792 ret = btrfs_commit_transaction(trans); 4793 if (ret) 4794 return ret; 4795 4796 down(&fs_info->uuid_tree_rescan_sem); 4797 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4798 if (IS_ERR(task)) { 4799 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4800 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4801 up(&fs_info->uuid_tree_rescan_sem); 4802 return PTR_ERR(task); 4803 } 4804 4805 return 0; 4806 } 4807 4808 /* 4809 * shrinking a device means finding all of the device extents past 4810 * the new size, and then following the back refs to the chunks. 4811 * The chunk relocation code actually frees the device extent 4812 */ 4813 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4814 { 4815 struct btrfs_fs_info *fs_info = device->fs_info; 4816 struct btrfs_root *root = fs_info->dev_root; 4817 struct btrfs_trans_handle *trans; 4818 struct btrfs_dev_extent *dev_extent = NULL; 4819 struct btrfs_path *path; 4820 u64 length; 4821 u64 chunk_offset; 4822 int ret; 4823 int slot; 4824 int failed = 0; 4825 bool retried = false; 4826 struct extent_buffer *l; 4827 struct btrfs_key key; 4828 struct btrfs_super_block *super_copy = fs_info->super_copy; 4829 u64 old_total = btrfs_super_total_bytes(super_copy); 4830 u64 old_size = btrfs_device_get_total_bytes(device); 4831 u64 diff; 4832 u64 start; 4833 4834 new_size = round_down(new_size, fs_info->sectorsize); 4835 start = new_size; 4836 diff = round_down(old_size - new_size, fs_info->sectorsize); 4837 4838 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4839 return -EINVAL; 4840 4841 path = btrfs_alloc_path(); 4842 if (!path) 4843 return -ENOMEM; 4844 4845 path->reada = READA_BACK; 4846 4847 trans = btrfs_start_transaction(root, 0); 4848 if (IS_ERR(trans)) { 4849 btrfs_free_path(path); 4850 return PTR_ERR(trans); 4851 } 4852 4853 mutex_lock(&fs_info->chunk_mutex); 4854 4855 btrfs_device_set_total_bytes(device, new_size); 4856 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4857 device->fs_devices->total_rw_bytes -= diff; 4858 atomic64_sub(diff, &fs_info->free_chunk_space); 4859 } 4860 4861 /* 4862 * Once the device's size has been set to the new size, ensure all 4863 * in-memory chunks are synced to disk so that the loop below sees them 4864 * and relocates them accordingly. 
4865 */ 4866 if (contains_pending_extent(device, &start, diff)) { 4867 mutex_unlock(&fs_info->chunk_mutex); 4868 ret = btrfs_commit_transaction(trans); 4869 if (ret) 4870 goto done; 4871 } else { 4872 mutex_unlock(&fs_info->chunk_mutex); 4873 btrfs_end_transaction(trans); 4874 } 4875 4876 again: 4877 key.objectid = device->devid; 4878 key.offset = (u64)-1; 4879 key.type = BTRFS_DEV_EXTENT_KEY; 4880 4881 do { 4882 mutex_lock(&fs_info->reclaim_bgs_lock); 4883 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4884 if (ret < 0) { 4885 mutex_unlock(&fs_info->reclaim_bgs_lock); 4886 goto done; 4887 } 4888 4889 ret = btrfs_previous_item(root, path, 0, key.type); 4890 if (ret) { 4891 mutex_unlock(&fs_info->reclaim_bgs_lock); 4892 if (ret < 0) 4893 goto done; 4894 ret = 0; 4895 btrfs_release_path(path); 4896 break; 4897 } 4898 4899 l = path->nodes[0]; 4900 slot = path->slots[0]; 4901 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4902 4903 if (key.objectid != device->devid) { 4904 mutex_unlock(&fs_info->reclaim_bgs_lock); 4905 btrfs_release_path(path); 4906 break; 4907 } 4908 4909 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4910 length = btrfs_dev_extent_length(l, dev_extent); 4911 4912 if (key.offset + length <= new_size) { 4913 mutex_unlock(&fs_info->reclaim_bgs_lock); 4914 btrfs_release_path(path); 4915 break; 4916 } 4917 4918 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4919 btrfs_release_path(path); 4920 4921 /* 4922 * We may be relocating the only data chunk we have, 4923 * which could potentially end up with losing data's 4924 * raid profile, so lets allocate an empty one in 4925 * advance. 4926 */ 4927 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4928 if (ret < 0) { 4929 mutex_unlock(&fs_info->reclaim_bgs_lock); 4930 goto done; 4931 } 4932 4933 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4934 mutex_unlock(&fs_info->reclaim_bgs_lock); 4935 if (ret == -ENOSPC) { 4936 failed++; 4937 } else if (ret) { 4938 if (ret == -ETXTBSY) { 4939 btrfs_warn(fs_info, 4940 "could not shrink block group %llu due to active swapfile", 4941 chunk_offset); 4942 } 4943 goto done; 4944 } 4945 } while (key.offset-- > 0); 4946 4947 if (failed && !retried) { 4948 failed = 0; 4949 retried = true; 4950 goto again; 4951 } else if (failed && retried) { 4952 ret = -ENOSPC; 4953 goto done; 4954 } 4955 4956 /* Shrinking succeeded, else we would be at "done". */ 4957 trans = btrfs_start_transaction(root, 0); 4958 if (IS_ERR(trans)) { 4959 ret = PTR_ERR(trans); 4960 goto done; 4961 } 4962 4963 mutex_lock(&fs_info->chunk_mutex); 4964 /* Clear all state bits beyond the shrunk device size */ 4965 clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 4966 CHUNK_STATE_MASK); 4967 4968 btrfs_device_set_disk_total_bytes(device, new_size); 4969 if (list_empty(&device->post_commit_list)) 4970 list_add_tail(&device->post_commit_list, 4971 &trans->transaction->dev_update_list); 4972 4973 WARN_ON(diff > old_total); 4974 btrfs_set_super_total_bytes(super_copy, 4975 round_down(old_total - diff, fs_info->sectorsize)); 4976 mutex_unlock(&fs_info->chunk_mutex); 4977 4978 btrfs_reserve_chunk_metadata(trans, false); 4979 /* Now btrfs_update_device() will change the on-disk size. 
*/ 4980 ret = btrfs_update_device(trans, device); 4981 btrfs_trans_release_chunk_metadata(trans); 4982 if (ret < 0) { 4983 btrfs_abort_transaction(trans, ret); 4984 btrfs_end_transaction(trans); 4985 } else { 4986 ret = btrfs_commit_transaction(trans); 4987 } 4988 done: 4989 btrfs_free_path(path); 4990 if (ret) { 4991 mutex_lock(&fs_info->chunk_mutex); 4992 btrfs_device_set_total_bytes(device, old_size); 4993 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 4994 device->fs_devices->total_rw_bytes += diff; 4995 atomic64_add(diff, &fs_info->free_chunk_space); 4996 mutex_unlock(&fs_info->chunk_mutex); 4997 } 4998 return ret; 4999 } 5000 5001 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 5002 struct btrfs_key *key, 5003 struct btrfs_chunk *chunk, int item_size) 5004 { 5005 struct btrfs_super_block *super_copy = fs_info->super_copy; 5006 struct btrfs_disk_key disk_key; 5007 u32 array_size; 5008 u8 *ptr; 5009 5010 lockdep_assert_held(&fs_info->chunk_mutex); 5011 5012 array_size = btrfs_super_sys_array_size(super_copy); 5013 if (array_size + item_size + sizeof(disk_key) 5014 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 5015 return -EFBIG; 5016 5017 ptr = super_copy->sys_chunk_array + array_size; 5018 btrfs_cpu_key_to_disk(&disk_key, key); 5019 memcpy(ptr, &disk_key, sizeof(disk_key)); 5020 ptr += sizeof(disk_key); 5021 memcpy(ptr, chunk, item_size); 5022 item_size += sizeof(disk_key); 5023 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 5024 5025 return 0; 5026 } 5027 5028 /* 5029 * sort the devices in descending order by max_avail, total_avail 5030 */ 5031 static int btrfs_cmp_device_info(const void *a, const void *b) 5032 { 5033 const struct btrfs_device_info *di_a = a; 5034 const struct btrfs_device_info *di_b = b; 5035 5036 if (di_a->max_avail > di_b->max_avail) 5037 return -1; 5038 if (di_a->max_avail < di_b->max_avail) 5039 return 1; 5040 if (di_a->total_avail > di_b->total_avail) 5041 return -1; 5042 if (di_a->total_avail < di_b->total_avail) 5043 return 1; 5044 return 0; 5045 } 5046 5047 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 5048 { 5049 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 5050 return; 5051 5052 btrfs_set_fs_incompat(info, RAID56); 5053 } 5054 5055 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 5056 { 5057 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 5058 return; 5059 5060 btrfs_set_fs_incompat(info, RAID1C34); 5061 } 5062 5063 /* 5064 * Structure used internally for btrfs_create_chunk() function. 5065 * Wraps needed parameters. 
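 *
 * The profile-derived fields (sub_stripes through nparity) are copied from
 * btrfs_raid_array[] by init_alloc_chunk_ctl(), the size limits come from
 * the per-policy helpers (regular or zoned), and stripe_size/chunk_size are
 * finally computed by decide_stripe_size().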
5066 */ 5067 struct alloc_chunk_ctl { 5068 u64 start; 5069 u64 type; 5070 /* Total number of stripes to allocate */ 5071 int num_stripes; 5072 /* sub_stripes info for map */ 5073 int sub_stripes; 5074 /* Stripes per device */ 5075 int dev_stripes; 5076 /* Maximum number of devices to use */ 5077 int devs_max; 5078 /* Minimum number of devices to use */ 5079 int devs_min; 5080 /* ndevs has to be a multiple of this */ 5081 int devs_increment; 5082 /* Number of copies */ 5083 int ncopies; 5084 /* Number of stripes worth of bytes to store parity information */ 5085 int nparity; 5086 u64 max_stripe_size; 5087 u64 max_chunk_size; 5088 u64 dev_extent_min; 5089 u64 stripe_size; 5090 u64 chunk_size; 5091 int ndevs; 5092 }; 5093 5094 static void init_alloc_chunk_ctl_policy_regular( 5095 struct btrfs_fs_devices *fs_devices, 5096 struct alloc_chunk_ctl *ctl) 5097 { 5098 struct btrfs_space_info *space_info; 5099 5100 space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); 5101 ASSERT(space_info); 5102 5103 ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); 5104 ctl->max_stripe_size = ctl->max_chunk_size; 5105 5106 if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) 5107 ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); 5108 5109 /* We don't want a chunk larger than 10% of writable space */ 5110 ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), 5111 ctl->max_chunk_size); 5112 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; 5113 } 5114 5115 static void init_alloc_chunk_ctl_policy_zoned( 5116 struct btrfs_fs_devices *fs_devices, 5117 struct alloc_chunk_ctl *ctl) 5118 { 5119 u64 zone_size = fs_devices->fs_info->zone_size; 5120 u64 limit; 5121 int min_num_stripes = ctl->devs_min * ctl->dev_stripes; 5122 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; 5123 u64 min_chunk_size = min_data_stripes * zone_size; 5124 u64 type = ctl->type; 5125 5126 ctl->max_stripe_size = zone_size; 5127 if (type & BTRFS_BLOCK_GROUP_DATA) { 5128 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, 5129 zone_size); 5130 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5131 ctl->max_chunk_size = ctl->max_stripe_size; 5132 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5133 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5134 ctl->devs_max = min_t(int, ctl->devs_max, 5135 BTRFS_MAX_DEVS_SYS_CHUNK); 5136 } else { 5137 BUG(); 5138 } 5139 5140 /* We don't want a chunk larger than 10% of writable space */ 5141 limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10), 5142 zone_size), 5143 min_chunk_size); 5144 ctl->max_chunk_size = min(limit, ctl->max_chunk_size); 5145 ctl->dev_extent_min = zone_size * ctl->dev_stripes; 5146 } 5147 5148 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 5149 struct alloc_chunk_ctl *ctl) 5150 { 5151 int index = btrfs_bg_flags_to_raid_index(ctl->type); 5152 5153 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 5154 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 5155 ctl->devs_max = btrfs_raid_array[index].devs_max; 5156 if (!ctl->devs_max) 5157 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 5158 ctl->devs_min = btrfs_raid_array[index].devs_min; 5159 ctl->devs_increment = btrfs_raid_array[index].devs_increment; 5160 ctl->ncopies = btrfs_raid_array[index].ncopies; 5161 ctl->nparity = btrfs_raid_array[index].nparity; 5162 ctl->ndevs = 0; 5163 5164 switch (fs_devices->chunk_alloc_policy) { 5165 case BTRFS_CHUNK_ALLOC_REGULAR: 5166 init_alloc_chunk_ctl_policy_regular(fs_devices, 
ctl); 5167 break; 5168 case BTRFS_CHUNK_ALLOC_ZONED: 5169 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); 5170 break; 5171 default: 5172 BUG(); 5173 } 5174 } 5175 5176 static int gather_device_info(struct btrfs_fs_devices *fs_devices, 5177 struct alloc_chunk_ctl *ctl, 5178 struct btrfs_device_info *devices_info) 5179 { 5180 struct btrfs_fs_info *info = fs_devices->fs_info; 5181 struct btrfs_device *device; 5182 u64 total_avail; 5183 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 5184 int ret; 5185 int ndevs = 0; 5186 u64 max_avail; 5187 u64 dev_offset; 5188 5189 /* 5190 * in the first pass through the devices list, we gather information 5191 * about the available holes on each device. 5192 */ 5193 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 5194 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5195 WARN(1, KERN_ERR 5196 "BTRFS: read-only device in alloc_list\n"); 5197 continue; 5198 } 5199 5200 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 5201 &device->dev_state) || 5202 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 5203 continue; 5204 5205 if (device->total_bytes > device->bytes_used) 5206 total_avail = device->total_bytes - device->bytes_used; 5207 else 5208 total_avail = 0; 5209 5210 /* If there is no space on this device, skip it. */ 5211 if (total_avail < ctl->dev_extent_min) 5212 continue; 5213 5214 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 5215 &max_avail); 5216 if (ret && ret != -ENOSPC) 5217 return ret; 5218 5219 if (ret == 0) 5220 max_avail = dev_extent_want; 5221 5222 if (max_avail < ctl->dev_extent_min) { 5223 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5224 btrfs_debug(info, 5225 "%s: devid %llu has no free space, have=%llu want=%llu", 5226 __func__, device->devid, max_avail, 5227 ctl->dev_extent_min); 5228 continue; 5229 } 5230 5231 if (ndevs == fs_devices->rw_devices) { 5232 WARN(1, "%s: found more than %llu devices\n", 5233 __func__, fs_devices->rw_devices); 5234 break; 5235 } 5236 devices_info[ndevs].dev_offset = dev_offset; 5237 devices_info[ndevs].max_avail = max_avail; 5238 devices_info[ndevs].total_avail = total_avail; 5239 devices_info[ndevs].dev = device; 5240 ++ndevs; 5241 } 5242 ctl->ndevs = ndevs; 5243 5244 /* 5245 * now sort the devices by hole size / available space 5246 */ 5247 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 5248 btrfs_cmp_device_info, NULL); 5249 5250 return 0; 5251 } 5252 5253 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 5254 struct btrfs_device_info *devices_info) 5255 { 5256 /* Number of stripes that count for block group size */ 5257 int data_stripes; 5258 5259 /* 5260 * The primary goal is to maximize the number of stripes, so use as 5261 * many devices as possible, even if the stripes are not maximum sized. 5262 * 5263 * The DUP profile stores more than one stripe per device, the 5264 * max_avail is the total size so we have to adjust. 5265 */ 5266 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, 5267 ctl->dev_stripes); 5268 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5269 5270 /* This will have to be fixed for RAID1 and RAID10 over more drives */ 5271 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5272 5273 /* 5274 * Use the number of data stripes to figure out how big this chunk is 5275 * really going to be in terms of logical address space, and compare 5276 * that answer with the max chunk size. If it's higher, we try to 5277 * reduce stripe_size. 
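 *
 * A worked example with assumed numbers: a raid6 data chunk over 6 devices
 * (dev_stripes = 1) gives num_stripes = 6 and data_stripes = (6 - 2) / 1 = 4.
 * Assuming a 10GiB max_chunk_size and large devices, stripe_size is reduced
 * to 10GiB / 4 rounded up to 16MiB granularity, then capped at 1GiB and
 * aligned down to BTRFS_STRIPE_LEN, for a final chunk_size of 4GiB.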
5278 */ 5279 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5280 /* 5281 * Reduce stripe_size, round it up to a 16MB boundary again and 5282 * then use it, unless it ends up being even bigger than the 5283 * previous value we had already. 5284 */ 5285 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, 5286 data_stripes), SZ_16M), 5287 ctl->stripe_size); 5288 } 5289 5290 /* Stripe size should not go beyond 1G. */ 5291 ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G); 5292 5293 /* Align to BTRFS_STRIPE_LEN */ 5294 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); 5295 ctl->chunk_size = ctl->stripe_size * data_stripes; 5296 5297 return 0; 5298 } 5299 5300 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, 5301 struct btrfs_device_info *devices_info) 5302 { 5303 u64 zone_size = devices_info[0].dev->zone_info->zone_size; 5304 /* Number of stripes that count for block group size */ 5305 int data_stripes; 5306 5307 /* 5308 * It should hold because: 5309 * dev_extent_min == dev_extent_want == zone_size * dev_stripes 5310 */ 5311 ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); 5312 5313 ctl->stripe_size = zone_size; 5314 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5315 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5316 5317 /* stripe_size is fixed in zoned filesystems. Reduce ndevs instead. */ 5318 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5319 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, 5320 ctl->stripe_size) + ctl->nparity, 5321 ctl->dev_stripes); 5322 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5323 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5324 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); 5325 } 5326 5327 ctl->chunk_size = ctl->stripe_size * data_stripes; 5328 5329 return 0; 5330 } 5331 5332 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, 5333 struct alloc_chunk_ctl *ctl, 5334 struct btrfs_device_info *devices_info) 5335 { 5336 struct btrfs_fs_info *info = fs_devices->fs_info; 5337 5338 /* 5339 * Round down to the number of usable stripes, devs_increment can be any 5340 * number so we can't use round_down(), which requires a power of 2, while 5341 * rounddown() is safe. 
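 *
 * For example, a raid10 target has devs_increment = 2, so 5 usable devices
 * are rounded down to 4 here.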
5342 */ 5343 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 5344 5345 if (ctl->ndevs < ctl->devs_min) { 5346 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 5347 btrfs_debug(info, 5348 "%s: not enough devices with free space: have=%d minimum required=%d", 5349 __func__, ctl->ndevs, ctl->devs_min); 5350 } 5351 return -ENOSPC; 5352 } 5353 5354 ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 5355 5356 switch (fs_devices->chunk_alloc_policy) { 5357 case BTRFS_CHUNK_ALLOC_REGULAR: 5358 return decide_stripe_size_regular(ctl, devices_info); 5359 case BTRFS_CHUNK_ALLOC_ZONED: 5360 return decide_stripe_size_zoned(ctl, devices_info); 5361 default: 5362 BUG(); 5363 } 5364 } 5365 5366 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, 5367 struct alloc_chunk_ctl *ctl, 5368 struct btrfs_device_info *devices_info) 5369 { 5370 struct btrfs_fs_info *info = trans->fs_info; 5371 struct map_lookup *map = NULL; 5372 struct extent_map_tree *em_tree; 5373 struct btrfs_block_group *block_group; 5374 struct extent_map *em; 5375 u64 start = ctl->start; 5376 u64 type = ctl->type; 5377 int ret; 5378 int i; 5379 int j; 5380 5381 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 5382 if (!map) 5383 return ERR_PTR(-ENOMEM); 5384 map->num_stripes = ctl->num_stripes; 5385 5386 for (i = 0; i < ctl->ndevs; ++i) { 5387 for (j = 0; j < ctl->dev_stripes; ++j) { 5388 int s = i * ctl->dev_stripes + j; 5389 map->stripes[s].dev = devices_info[i].dev; 5390 map->stripes[s].physical = devices_info[i].dev_offset + 5391 j * ctl->stripe_size; 5392 } 5393 } 5394 map->stripe_len = BTRFS_STRIPE_LEN; 5395 map->io_align = BTRFS_STRIPE_LEN; 5396 map->io_width = BTRFS_STRIPE_LEN; 5397 map->type = type; 5398 map->sub_stripes = ctl->sub_stripes; 5399 5400 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5401 5402 em = alloc_extent_map(); 5403 if (!em) { 5404 kfree(map); 5405 return ERR_PTR(-ENOMEM); 5406 } 5407 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5408 em->map_lookup = map; 5409 em->start = start; 5410 em->len = ctl->chunk_size; 5411 em->block_start = 0; 5412 em->block_len = em->len; 5413 em->orig_block_len = ctl->stripe_size; 5414 5415 em_tree = &info->mapping_tree; 5416 write_lock(&em_tree->lock); 5417 ret = add_extent_mapping(em_tree, em, 0); 5418 if (ret) { 5419 write_unlock(&em_tree->lock); 5420 free_extent_map(em); 5421 return ERR_PTR(ret); 5422 } 5423 write_unlock(&em_tree->lock); 5424 5425 block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); 5426 if (IS_ERR(block_group)) 5427 goto error_del_extent; 5428 5429 for (i = 0; i < map->num_stripes; i++) { 5430 struct btrfs_device *dev = map->stripes[i].dev; 5431 5432 btrfs_device_set_bytes_used(dev, 5433 dev->bytes_used + ctl->stripe_size); 5434 if (list_empty(&dev->post_commit_list)) 5435 list_add_tail(&dev->post_commit_list, 5436 &trans->transaction->dev_update_list); 5437 } 5438 5439 atomic64_sub(ctl->stripe_size * map->num_stripes, 5440 &info->free_chunk_space); 5441 5442 free_extent_map(em); 5443 check_raid56_incompat_flag(info, type); 5444 check_raid1c34_incompat_flag(info, type); 5445 5446 return block_group; 5447 5448 error_del_extent: 5449 write_lock(&em_tree->lock); 5450 remove_extent_mapping(em_tree, em); 5451 write_unlock(&em_tree->lock); 5452 5453 /* One for our allocation */ 5454 free_extent_map(em); 5455 /* One for the tree reference */ 5456 free_extent_map(em); 5457 5458 return block_group; 5459 } 5460 5461 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, 5462 u64 
type) 5463 { 5464 struct btrfs_fs_info *info = trans->fs_info; 5465 struct btrfs_fs_devices *fs_devices = info->fs_devices; 5466 struct btrfs_device_info *devices_info = NULL; 5467 struct alloc_chunk_ctl ctl; 5468 struct btrfs_block_group *block_group; 5469 int ret; 5470 5471 lockdep_assert_held(&info->chunk_mutex); 5472 5473 if (!alloc_profile_is_valid(type, 0)) { 5474 ASSERT(0); 5475 return ERR_PTR(-EINVAL); 5476 } 5477 5478 if (list_empty(&fs_devices->alloc_list)) { 5479 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5480 btrfs_debug(info, "%s: no writable device", __func__); 5481 return ERR_PTR(-ENOSPC); 5482 } 5483 5484 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 5485 btrfs_err(info, "invalid chunk type 0x%llx requested", type); 5486 ASSERT(0); 5487 return ERR_PTR(-EINVAL); 5488 } 5489 5490 ctl.start = find_next_chunk(info); 5491 ctl.type = type; 5492 init_alloc_chunk_ctl(fs_devices, &ctl); 5493 5494 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 5495 GFP_NOFS); 5496 if (!devices_info) 5497 return ERR_PTR(-ENOMEM); 5498 5499 ret = gather_device_info(fs_devices, &ctl, devices_info); 5500 if (ret < 0) { 5501 block_group = ERR_PTR(ret); 5502 goto out; 5503 } 5504 5505 ret = decide_stripe_size(fs_devices, &ctl, devices_info); 5506 if (ret < 0) { 5507 block_group = ERR_PTR(ret); 5508 goto out; 5509 } 5510 5511 block_group = create_chunk(trans, &ctl, devices_info); 5512 5513 out: 5514 kfree(devices_info); 5515 return block_group; 5516 } 5517 5518 /* 5519 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the 5520 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system 5521 * chunks. 5522 * 5523 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 5524 * phases. 5525 */ 5526 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, 5527 struct btrfs_block_group *bg) 5528 { 5529 struct btrfs_fs_info *fs_info = trans->fs_info; 5530 struct btrfs_root *chunk_root = fs_info->chunk_root; 5531 struct btrfs_key key; 5532 struct btrfs_chunk *chunk; 5533 struct btrfs_stripe *stripe; 5534 struct extent_map *em; 5535 struct map_lookup *map; 5536 size_t item_size; 5537 int i; 5538 int ret; 5539 5540 /* 5541 * We take the chunk_mutex for 2 reasons: 5542 * 5543 * 1) Updates and insertions in the chunk btree must be done while holding 5544 * the chunk_mutex, as well as updating the system chunk array in the 5545 * superblock. See the comment on top of btrfs_chunk_alloc() for the 5546 * details; 5547 * 5548 * 2) To prevent races with the final phase of a device replace operation 5549 * that replaces the device object associated with the map's stripes, 5550 * because the device object's id can change at any time during that 5551 * final phase of the device replace operation 5552 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 5553 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 5554 * which would cause a failure when updating the device item, which does 5555 * not exists, or persisting a stripe of the chunk item with such ID. 5556 * Here we can't use the device_list_mutex because our caller already 5557 * has locked the chunk_mutex, and the final phase of device replace 5558 * acquires both mutexes - first the device_list_mutex and then the 5559 * chunk_mutex. Using any of those two mutexes protects us from a 5560 * concurrent device replace. 
5561 */ 5562 lockdep_assert_held(&fs_info->chunk_mutex); 5563 5564 em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); 5565 if (IS_ERR(em)) { 5566 ret = PTR_ERR(em); 5567 btrfs_abort_transaction(trans, ret); 5568 return ret; 5569 } 5570 5571 map = em->map_lookup; 5572 item_size = btrfs_chunk_item_size(map->num_stripes); 5573 5574 chunk = kzalloc(item_size, GFP_NOFS); 5575 if (!chunk) { 5576 ret = -ENOMEM; 5577 btrfs_abort_transaction(trans, ret); 5578 goto out; 5579 } 5580 5581 for (i = 0; i < map->num_stripes; i++) { 5582 struct btrfs_device *device = map->stripes[i].dev; 5583 5584 ret = btrfs_update_device(trans, device); 5585 if (ret) 5586 goto out; 5587 } 5588 5589 stripe = &chunk->stripe; 5590 for (i = 0; i < map->num_stripes; i++) { 5591 struct btrfs_device *device = map->stripes[i].dev; 5592 const u64 dev_offset = map->stripes[i].physical; 5593 5594 btrfs_set_stack_stripe_devid(stripe, device->devid); 5595 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5596 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5597 stripe++; 5598 } 5599 5600 btrfs_set_stack_chunk_length(chunk, bg->length); 5601 btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); 5602 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5603 btrfs_set_stack_chunk_type(chunk, map->type); 5604 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5605 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5606 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5607 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5608 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5609 5610 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5611 key.type = BTRFS_CHUNK_ITEM_KEY; 5612 key.offset = bg->start; 5613 5614 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5615 if (ret) 5616 goto out; 5617 5618 set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags); 5619 5620 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5621 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5622 if (ret) 5623 goto out; 5624 } 5625 5626 out: 5627 kfree(chunk); 5628 free_extent_map(em); 5629 return ret; 5630 } 5631 5632 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 5633 { 5634 struct btrfs_fs_info *fs_info = trans->fs_info; 5635 u64 alloc_profile; 5636 struct btrfs_block_group *meta_bg; 5637 struct btrfs_block_group *sys_bg; 5638 5639 /* 5640 * When adding a new device for sprouting, the seed device is read-only 5641 * so we must first allocate a metadata and a system chunk. But before 5642 * adding the block group items to the extent, device and chunk btrees, 5643 * we must first: 5644 * 5645 * 1) Create both chunks without doing any changes to the btrees, as 5646 * otherwise we would get -ENOSPC since the block groups from the 5647 * seed device are read-only; 5648 * 5649 * 2) Add the device item for the new sprout device - finishing the setup 5650 * of a new block group requires updating the device item in the chunk 5651 * btree, so it must exist when we attempt to do it. The previous step 5652 * ensures this does not fail with -ENOSPC. 5653 * 5654 * After that we can add the block group items to their btrees: 5655 * update existing device item in the chunk btree, add a new block group 5656 * item to the extent btree, add a new chunk item to the chunk btree and 5657 * finally add the new device extent items to the devices btree. 
5658 */ 5659 5660 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5661 meta_bg = btrfs_create_chunk(trans, alloc_profile); 5662 if (IS_ERR(meta_bg)) 5663 return PTR_ERR(meta_bg); 5664 5665 alloc_profile = btrfs_system_alloc_profile(fs_info); 5666 sys_bg = btrfs_create_chunk(trans, alloc_profile); 5667 if (IS_ERR(sys_bg)) 5668 return PTR_ERR(sys_bg); 5669 5670 return 0; 5671 } 5672 5673 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5674 { 5675 const int index = btrfs_bg_flags_to_raid_index(map->type); 5676 5677 return btrfs_raid_array[index].tolerated_failures; 5678 } 5679 5680 bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5681 { 5682 struct extent_map *em; 5683 struct map_lookup *map; 5684 int miss_ndevs = 0; 5685 int i; 5686 bool ret = true; 5687 5688 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5689 if (IS_ERR(em)) 5690 return false; 5691 5692 map = em->map_lookup; 5693 for (i = 0; i < map->num_stripes; i++) { 5694 if (test_bit(BTRFS_DEV_STATE_MISSING, 5695 &map->stripes[i].dev->dev_state)) { 5696 miss_ndevs++; 5697 continue; 5698 } 5699 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5700 &map->stripes[i].dev->dev_state)) { 5701 ret = false; 5702 goto end; 5703 } 5704 } 5705 5706 /* 5707 * If the number of missing devices is larger than max errors, we can 5708 * not write the data into that chunk successfully. 5709 */ 5710 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5711 ret = false; 5712 end: 5713 free_extent_map(em); 5714 return ret; 5715 } 5716 5717 void btrfs_mapping_tree_free(struct extent_map_tree *tree) 5718 { 5719 struct extent_map *em; 5720 5721 while (1) { 5722 write_lock(&tree->lock); 5723 em = lookup_extent_mapping(tree, 0, (u64)-1); 5724 if (em) 5725 remove_extent_mapping(tree, em); 5726 write_unlock(&tree->lock); 5727 if (!em) 5728 break; 5729 /* once for us */ 5730 free_extent_map(em); 5731 /* once for the tree */ 5732 free_extent_map(em); 5733 } 5734 } 5735 5736 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5737 { 5738 struct extent_map *em; 5739 struct map_lookup *map; 5740 enum btrfs_raid_types index; 5741 int ret = 1; 5742 5743 em = btrfs_get_chunk_map(fs_info, logical, len); 5744 if (IS_ERR(em)) 5745 /* 5746 * We could return errors for these cases, but that could get 5747 * ugly and we'd probably do the same thing which is just not do 5748 * anything else and exit, so return 1 so the callers don't try 5749 * to use other copies. 5750 */ 5751 return 1; 5752 5753 map = em->map_lookup; 5754 index = btrfs_bg_flags_to_raid_index(map->type); 5755 5756 /* Non-RAID56, use their ncopies from btrfs_raid_array. */ 5757 if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 5758 ret = btrfs_raid_array[index].ncopies; 5759 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5760 ret = 2; 5761 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5762 /* 5763 * There could be two corrupted data stripes, we need 5764 * to loop retry in order to rebuild the correct data. 5765 * 5766 * Fail a stripe at a time on every retry except the 5767 * stripe under reconstruction. 
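 *
 * (For comparison, the raid5 branch above returns 2: the data can either
 * be read directly or rebuilt once from the remaining data and parity.)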
5768 */ 5769 ret = map->num_stripes; 5770 free_extent_map(em); 5771 5772 down_read(&fs_info->dev_replace.rwsem); 5773 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5774 fs_info->dev_replace.tgtdev) 5775 ret++; 5776 up_read(&fs_info->dev_replace.rwsem); 5777 5778 return ret; 5779 } 5780 5781 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5782 u64 logical) 5783 { 5784 struct extent_map *em; 5785 struct map_lookup *map; 5786 unsigned long len = fs_info->sectorsize; 5787 5788 if (!btrfs_fs_incompat(fs_info, RAID56)) 5789 return len; 5790 5791 em = btrfs_get_chunk_map(fs_info, logical, len); 5792 5793 if (!WARN_ON(IS_ERR(em))) { 5794 map = em->map_lookup; 5795 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5796 len = map->stripe_len * nr_data_stripes(map); 5797 free_extent_map(em); 5798 } 5799 return len; 5800 } 5801 5802 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5803 { 5804 struct extent_map *em; 5805 struct map_lookup *map; 5806 int ret = 0; 5807 5808 if (!btrfs_fs_incompat(fs_info, RAID56)) 5809 return 0; 5810 5811 em = btrfs_get_chunk_map(fs_info, logical, len); 5812 5813 if(!WARN_ON(IS_ERR(em))) { 5814 map = em->map_lookup; 5815 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5816 ret = 1; 5817 free_extent_map(em); 5818 } 5819 return ret; 5820 } 5821 5822 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5823 struct map_lookup *map, int first, 5824 int dev_replace_is_ongoing) 5825 { 5826 int i; 5827 int num_stripes; 5828 int preferred_mirror; 5829 int tolerance; 5830 struct btrfs_device *srcdev; 5831 5832 ASSERT((map->type & 5833 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 5834 5835 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5836 num_stripes = map->sub_stripes; 5837 else 5838 num_stripes = map->num_stripes; 5839 5840 switch (fs_info->fs_devices->read_policy) { 5841 default: 5842 /* Shouldn't happen, just warn and use pid instead of failing */ 5843 btrfs_warn_rl(fs_info, 5844 "unknown read_policy type %u, reset to pid", 5845 fs_info->fs_devices->read_policy); 5846 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; 5847 fallthrough; 5848 case BTRFS_READ_POLICY_PID: 5849 preferred_mirror = first + (current->pid % num_stripes); 5850 break; 5851 } 5852 5853 if (dev_replace_is_ongoing && 5854 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5855 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5856 srcdev = fs_info->dev_replace.srcdev; 5857 else 5858 srcdev = NULL; 5859 5860 /* 5861 * try to avoid the drive that is the source drive for a 5862 * dev-replace procedure, only choose it if no other non-missing 5863 * mirror is available 5864 */ 5865 for (tolerance = 0; tolerance < 2; tolerance++) { 5866 if (map->stripes[preferred_mirror].dev->bdev && 5867 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5868 return preferred_mirror; 5869 for (i = first; i < first + num_stripes; i++) { 5870 if (map->stripes[i].dev->bdev && 5871 (tolerance || map->stripes[i].dev != srcdev)) 5872 return i; 5873 } 5874 } 5875 5876 /* we couldn't find one that doesn't fail. 
Just return something 5877 * and the io error handling code will clean up eventually 5878 */ 5879 return preferred_mirror; 5880 } 5881 5882 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5883 static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) 5884 { 5885 int i; 5886 int again = 1; 5887 5888 while (again) { 5889 again = 0; 5890 for (i = 0; i < num_stripes - 1; i++) { 5891 /* Swap if parity is on a smaller index */ 5892 if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { 5893 swap(bioc->stripes[i], bioc->stripes[i + 1]); 5894 swap(bioc->raid_map[i], bioc->raid_map[i + 1]); 5895 again = 1; 5896 } 5897 } 5898 } 5899 } 5900 5901 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, 5902 int total_stripes, 5903 int real_stripes) 5904 { 5905 struct btrfs_io_context *bioc = kzalloc( 5906 /* The size of btrfs_io_context */ 5907 sizeof(struct btrfs_io_context) + 5908 /* Plus the variable array for the stripes */ 5909 sizeof(struct btrfs_io_stripe) * (total_stripes) + 5910 /* Plus the variable array for the tgt dev */ 5911 sizeof(int) * (real_stripes) + 5912 /* 5913 * Plus the raid_map, which includes both the tgt dev 5914 * and the stripes. 5915 */ 5916 sizeof(u64) * (total_stripes), 5917 GFP_NOFS); 5918 5919 if (!bioc) 5920 return NULL; 5921 5922 refcount_set(&bioc->refs, 1); 5923 5924 bioc->fs_info = fs_info; 5925 bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); 5926 bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); 5927 5928 return bioc; 5929 } 5930 5931 void btrfs_get_bioc(struct btrfs_io_context *bioc) 5932 { 5933 WARN_ON(!refcount_read(&bioc->refs)); 5934 refcount_inc(&bioc->refs); 5935 } 5936 5937 void btrfs_put_bioc(struct btrfs_io_context *bioc) 5938 { 5939 if (!bioc) 5940 return; 5941 if (refcount_dec_and_test(&bioc->refs)) 5942 kfree(bioc); 5943 } 5944 5945 /* 5946 * Please note that, discard won't be sent to target device of device 5947 * replace. 
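 *
 * Returns an array of btrfs_discard_stripe describing the physical
 * ranges to discard, or an ERR_PTR(); RAID5/6 chunks are rejected with
 * -EOPNOTSUPP.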
5948 */ 5949 struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, 5950 u64 logical, u64 *length_ret, 5951 u32 *num_stripes) 5952 { 5953 struct extent_map *em; 5954 struct map_lookup *map; 5955 struct btrfs_discard_stripe *stripes; 5956 u64 length = *length_ret; 5957 u64 offset; 5958 u64 stripe_nr; 5959 u64 stripe_nr_end; 5960 u64 stripe_end_offset; 5961 u64 stripe_cnt; 5962 u64 stripe_len; 5963 u64 stripe_offset; 5964 u32 stripe_index; 5965 u32 factor = 0; 5966 u32 sub_stripes = 0; 5967 u64 stripes_per_dev = 0; 5968 u32 remaining_stripes = 0; 5969 u32 last_stripe = 0; 5970 int ret; 5971 int i; 5972 5973 em = btrfs_get_chunk_map(fs_info, logical, length); 5974 if (IS_ERR(em)) 5975 return ERR_CAST(em); 5976 5977 map = em->map_lookup; 5978 5979 /* we don't discard raid56 yet */ 5980 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5981 ret = -EOPNOTSUPP; 5982 goto out_free_map; 5983 } 5984 5985 offset = logical - em->start; 5986 length = min_t(u64, em->start + em->len - logical, length); 5987 *length_ret = length; 5988 5989 stripe_len = map->stripe_len; 5990 /* 5991 * stripe_nr counts the total number of stripes we have to stride 5992 * to get to this block 5993 */ 5994 stripe_nr = div64_u64(offset, stripe_len); 5995 5996 /* stripe_offset is the offset of this block in its stripe */ 5997 stripe_offset = offset - stripe_nr * stripe_len; 5998 5999 stripe_nr_end = round_up(offset + length, map->stripe_len); 6000 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 6001 stripe_cnt = stripe_nr_end - stripe_nr; 6002 stripe_end_offset = stripe_nr_end * map->stripe_len - 6003 (offset + length); 6004 /* 6005 * after this, stripe_nr is the number of stripes on this 6006 * device we have to walk to find the data, and stripe_index is 6007 * the number of our device in the stripe array 6008 */ 6009 *num_stripes = 1; 6010 stripe_index = 0; 6011 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 6012 BTRFS_BLOCK_GROUP_RAID10)) { 6013 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 6014 sub_stripes = 1; 6015 else 6016 sub_stripes = map->sub_stripes; 6017 6018 factor = map->num_stripes / sub_stripes; 6019 *num_stripes = min_t(u64, map->num_stripes, 6020 sub_stripes * stripe_cnt); 6021 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6022 stripe_index *= sub_stripes; 6023 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 6024 &remaining_stripes); 6025 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 6026 last_stripe *= sub_stripes; 6027 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 6028 BTRFS_BLOCK_GROUP_DUP)) { 6029 *num_stripes = map->num_stripes; 6030 } else { 6031 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6032 &stripe_index); 6033 } 6034 6035 stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); 6036 if (!stripes) { 6037 ret = -ENOMEM; 6038 goto out_free_map; 6039 } 6040 6041 for (i = 0; i < *num_stripes; i++) { 6042 stripes[i].physical = 6043 map->stripes[stripe_index].physical + 6044 stripe_offset + stripe_nr * map->stripe_len; 6045 stripes[i].dev = map->stripes[stripe_index].dev; 6046 6047 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 6048 BTRFS_BLOCK_GROUP_RAID10)) { 6049 stripes[i].length = stripes_per_dev * map->stripe_len; 6050 6051 if (i / sub_stripes < remaining_stripes) 6052 stripes[i].length += map->stripe_len; 6053 6054 /* 6055 * Special for the first stripe and 6056 * the last stripe: 6057 * 6058 * |-------|...|-------| 6059 * |----------| 6060 * off end_off 6061 */ 6062 if (i < sub_stripes) 6063 stripes[i].length -= stripe_offset; 6064 
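			/*
			 * Likewise trim the stripes that cover the end of the
			 * discard range so they don't extend past end_off.
			 */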
6065 if (stripe_index >= last_stripe && 6066 stripe_index <= (last_stripe + 6067 sub_stripes - 1)) 6068 stripes[i].length -= stripe_end_offset; 6069 6070 if (i == sub_stripes - 1) 6071 stripe_offset = 0; 6072 } else { 6073 stripes[i].length = length; 6074 } 6075 6076 stripe_index++; 6077 if (stripe_index == map->num_stripes) { 6078 stripe_index = 0; 6079 stripe_nr++; 6080 } 6081 } 6082 6083 free_extent_map(em); 6084 return stripes; 6085 out_free_map: 6086 free_extent_map(em); 6087 return ERR_PTR(ret); 6088 } 6089 6090 /* 6091 * In dev-replace case, for repair case (that's the only case where the mirror 6092 * is selected explicitly when calling btrfs_map_block), blocks left of the 6093 * left cursor can also be read from the target drive. 6094 * 6095 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 6096 * array of stripes. 6097 * For READ, it also needs to be supported using the same mirror number. 6098 * 6099 * If the requested block is not left of the left cursor, EIO is returned. This 6100 * can happen because btrfs_num_copies() returns one more in the dev-replace 6101 * case. 6102 */ 6103 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 6104 u64 logical, u64 length, 6105 u64 srcdev_devid, int *mirror_num, 6106 u64 *physical) 6107 { 6108 struct btrfs_io_context *bioc = NULL; 6109 int num_stripes; 6110 int index_srcdev = 0; 6111 int found = 0; 6112 u64 physical_of_found = 0; 6113 int i; 6114 int ret = 0; 6115 6116 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 6117 logical, &length, &bioc, NULL, NULL, 0); 6118 if (ret) { 6119 ASSERT(bioc == NULL); 6120 return ret; 6121 } 6122 6123 num_stripes = bioc->num_stripes; 6124 if (*mirror_num > num_stripes) { 6125 /* 6126 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 6127 * that means that the requested area is not left of the left 6128 * cursor 6129 */ 6130 btrfs_put_bioc(bioc); 6131 return -EIO; 6132 } 6133 6134 /* 6135 * process the rest of the function using the mirror_num of the source 6136 * drive. Therefore look it up first. At the end, patch the device 6137 * pointer to the one of the target drive. 
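 *
 * The physical offset returned here is what __btrfs_map_block() later
 * patches the first stripe with, pointing it at the target device.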
6138 */ 6139 for (i = 0; i < num_stripes; i++) { 6140 if (bioc->stripes[i].dev->devid != srcdev_devid) 6141 continue; 6142 6143 /* 6144 * In case of DUP, in order to keep it simple, only add the 6145 * mirror with the lowest physical address 6146 */ 6147 if (found && 6148 physical_of_found <= bioc->stripes[i].physical) 6149 continue; 6150 6151 index_srcdev = i; 6152 found = 1; 6153 physical_of_found = bioc->stripes[i].physical; 6154 } 6155 6156 btrfs_put_bioc(bioc); 6157 6158 ASSERT(found); 6159 if (!found) 6160 return -EIO; 6161 6162 *mirror_num = index_srcdev + 1; 6163 *physical = physical_of_found; 6164 return ret; 6165 } 6166 6167 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) 6168 { 6169 struct btrfs_block_group *cache; 6170 bool ret; 6171 6172 /* Non zoned filesystem does not use "to_copy" flag */ 6173 if (!btrfs_is_zoned(fs_info)) 6174 return false; 6175 6176 cache = btrfs_lookup_block_group(fs_info, logical); 6177 6178 ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); 6179 6180 btrfs_put_block_group(cache); 6181 return ret; 6182 } 6183 6184 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 6185 struct btrfs_io_context **bioc_ret, 6186 struct btrfs_dev_replace *dev_replace, 6187 u64 logical, 6188 int *num_stripes_ret, int *max_errors_ret) 6189 { 6190 struct btrfs_io_context *bioc = *bioc_ret; 6191 u64 srcdev_devid = dev_replace->srcdev->devid; 6192 int tgtdev_indexes = 0; 6193 int num_stripes = *num_stripes_ret; 6194 int max_errors = *max_errors_ret; 6195 int i; 6196 6197 if (op == BTRFS_MAP_WRITE) { 6198 int index_where_to_add; 6199 6200 /* 6201 * A block group which have "to_copy" set will eventually 6202 * copied by dev-replace process. We can avoid cloning IO here. 6203 */ 6204 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) 6205 return; 6206 6207 /* 6208 * duplicate the write operations while the dev replace 6209 * procedure is running. Since the copying of the old disk to 6210 * the new disk takes place at run time while the filesystem is 6211 * mounted writable, the regular write operations to the old 6212 * disk have to be duplicated to go to the new disk as well. 6213 * 6214 * Note that device->missing is handled by the caller, and that 6215 * the write to the old disk is already set up in the stripes 6216 * array. 6217 */ 6218 index_where_to_add = num_stripes; 6219 for (i = 0; i < num_stripes; i++) { 6220 if (bioc->stripes[i].dev->devid == srcdev_devid) { 6221 /* write to new disk, too */ 6222 struct btrfs_io_stripe *new = 6223 bioc->stripes + index_where_to_add; 6224 struct btrfs_io_stripe *old = 6225 bioc->stripes + i; 6226 6227 new->physical = old->physical; 6228 new->dev = dev_replace->tgtdev; 6229 bioc->tgtdev_map[i] = index_where_to_add; 6230 index_where_to_add++; 6231 max_errors++; 6232 tgtdev_indexes++; 6233 } 6234 } 6235 num_stripes = index_where_to_add; 6236 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 6237 int index_srcdev = 0; 6238 int found = 0; 6239 u64 physical_of_found = 0; 6240 6241 /* 6242 * During the dev-replace procedure, the target drive can also 6243 * be used to read data in case it is needed to repair a corrupt 6244 * block elsewhere. This is possible if the requested area is 6245 * left of the left cursor. In this area, the target drive is a 6246 * full copy of the source drive. 
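 *
 * Append one extra stripe that mirrors the source device's lowest
 * physical offset but points at the target device, so the read can be
 * served from either copy.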
6247 */ 6248 for (i = 0; i < num_stripes; i++) { 6249 if (bioc->stripes[i].dev->devid == srcdev_devid) { 6250 /* 6251 * In case of DUP, in order to keep it simple, 6252 * only add the mirror with the lowest physical 6253 * address 6254 */ 6255 if (found && 6256 physical_of_found <= bioc->stripes[i].physical) 6257 continue; 6258 index_srcdev = i; 6259 found = 1; 6260 physical_of_found = bioc->stripes[i].physical; 6261 } 6262 } 6263 if (found) { 6264 struct btrfs_io_stripe *tgtdev_stripe = 6265 bioc->stripes + num_stripes; 6266 6267 tgtdev_stripe->physical = physical_of_found; 6268 tgtdev_stripe->dev = dev_replace->tgtdev; 6269 bioc->tgtdev_map[index_srcdev] = num_stripes; 6270 6271 tgtdev_indexes++; 6272 num_stripes++; 6273 } 6274 } 6275 6276 *num_stripes_ret = num_stripes; 6277 *max_errors_ret = max_errors; 6278 bioc->num_tgtdevs = tgtdev_indexes; 6279 *bioc_ret = bioc; 6280 } 6281 6282 static bool need_full_stripe(enum btrfs_map_op op) 6283 { 6284 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 6285 } 6286 6287 static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, 6288 u64 offset, u64 *stripe_nr, u64 *stripe_offset, 6289 u64 *full_stripe_start) 6290 { 6291 u32 stripe_len = map->stripe_len; 6292 6293 ASSERT(op != BTRFS_MAP_DISCARD); 6294 6295 /* 6296 * Stripe_nr is the stripe where this block falls. stripe_offset is 6297 * the offset of this block in its stripe. 6298 */ 6299 *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); 6300 ASSERT(*stripe_offset < U32_MAX); 6301 6302 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6303 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 6304 6305 *full_stripe_start = 6306 div64_u64(offset, full_stripe_len) * full_stripe_len; 6307 6308 /* 6309 * For writes to RAID56, allow to write a full stripe set, but 6310 * no straddling of stripe sets. 6311 */ 6312 if (op == BTRFS_MAP_WRITE) 6313 return full_stripe_len - (offset - *full_stripe_start); 6314 } 6315 6316 /* 6317 * For other RAID types and for RAID56 reads, allow a single stripe (on 6318 * a single disk). 6319 */ 6320 if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) 6321 return stripe_len - *stripe_offset; 6322 return U64_MAX; 6323 } 6324 6325 static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, 6326 u32 stripe_index, u64 stripe_offset, u64 stripe_nr) 6327 { 6328 dst->dev = map->stripes[stripe_index].dev; 6329 dst->physical = map->stripes[stripe_index].physical + 6330 stripe_offset + stripe_nr * map->stripe_len; 6331 } 6332 6333 int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6334 u64 logical, u64 *length, 6335 struct btrfs_io_context **bioc_ret, 6336 struct btrfs_io_stripe *smap, int *mirror_num_ret, 6337 int need_raid_map) 6338 { 6339 struct extent_map *em; 6340 struct map_lookup *map; 6341 u64 map_offset; 6342 u64 stripe_offset; 6343 u64 stripe_nr; 6344 u64 stripe_len; 6345 u32 stripe_index; 6346 int data_stripes; 6347 int i; 6348 int ret = 0; 6349 int mirror_num = (mirror_num_ret ? 
*mirror_num_ret : 0); 6350 int num_stripes; 6351 int max_errors = 0; 6352 int tgtdev_indexes = 0; 6353 struct btrfs_io_context *bioc = NULL; 6354 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6355 int dev_replace_is_ongoing = 0; 6356 int num_alloc_stripes; 6357 int patch_the_first_stripe_for_dev_replace = 0; 6358 u64 physical_to_patch_in_first_stripe = 0; 6359 u64 raid56_full_stripe_start = (u64)-1; 6360 u64 max_len; 6361 6362 ASSERT(bioc_ret); 6363 ASSERT(op != BTRFS_MAP_DISCARD); 6364 6365 em = btrfs_get_chunk_map(fs_info, logical, *length); 6366 ASSERT(!IS_ERR(em)); 6367 6368 map = em->map_lookup; 6369 data_stripes = nr_data_stripes(map); 6370 stripe_len = map->stripe_len; 6371 6372 map_offset = logical - em->start; 6373 max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, 6374 &stripe_offset, &raid56_full_stripe_start); 6375 *length = min_t(u64, em->len - map_offset, max_len); 6376 6377 down_read(&dev_replace->rwsem); 6378 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6379 /* 6380 * Hold the semaphore for read during the whole operation, write is 6381 * requested at commit time but must wait. 6382 */ 6383 if (!dev_replace_is_ongoing) 6384 up_read(&dev_replace->rwsem); 6385 6386 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 6387 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 6388 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 6389 dev_replace->srcdev->devid, 6390 &mirror_num, 6391 &physical_to_patch_in_first_stripe); 6392 if (ret) 6393 goto out; 6394 else 6395 patch_the_first_stripe_for_dev_replace = 1; 6396 } else if (mirror_num > map->num_stripes) { 6397 mirror_num = 0; 6398 } 6399 6400 num_stripes = 1; 6401 stripe_index = 0; 6402 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6403 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6404 &stripe_index); 6405 if (!need_full_stripe(op)) 6406 mirror_num = 1; 6407 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6408 if (need_full_stripe(op)) 6409 num_stripes = map->num_stripes; 6410 else if (mirror_num) 6411 stripe_index = mirror_num - 1; 6412 else { 6413 stripe_index = find_live_mirror(fs_info, map, 0, 6414 dev_replace_is_ongoing); 6415 mirror_num = stripe_index + 1; 6416 } 6417 6418 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6419 if (need_full_stripe(op)) { 6420 num_stripes = map->num_stripes; 6421 } else if (mirror_num) { 6422 stripe_index = mirror_num - 1; 6423 } else { 6424 mirror_num = 1; 6425 } 6426 6427 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6428 u32 factor = map->num_stripes / map->sub_stripes; 6429 6430 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6431 stripe_index *= map->sub_stripes; 6432 6433 if (need_full_stripe(op)) 6434 num_stripes = map->sub_stripes; 6435 else if (mirror_num) 6436 stripe_index += mirror_num - 1; 6437 else { 6438 int old_stripe_index = stripe_index; 6439 stripe_index = find_live_mirror(fs_info, map, 6440 stripe_index, 6441 dev_replace_is_ongoing); 6442 mirror_num = stripe_index - old_stripe_index + 1; 6443 } 6444 6445 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6446 ASSERT(map->stripe_len == BTRFS_STRIPE_LEN); 6447 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6448 /* push stripe_nr back to the start of the full stripe */ 6449 stripe_nr = div64_u64(raid56_full_stripe_start, 6450 stripe_len * data_stripes); 6451 6452 /* RAID[56] write or recovery. 
Return all stripes */ 6453 num_stripes = map->num_stripes; 6454 max_errors = btrfs_chunk_max_errors(map); 6455 6456 /* Return the length to the full stripe end */ 6457 *length = min(logical + *length, 6458 raid56_full_stripe_start + em->start + 6459 data_stripes * stripe_len) - logical; 6460 stripe_index = 0; 6461 stripe_offset = 0; 6462 } else { 6463 /* 6464 * Mirror #0 or #1 means the original data block. 6465 * Mirror #2 is RAID5 parity block. 6466 * Mirror #3 is RAID6 Q block. 6467 */ 6468 stripe_nr = div_u64_rem(stripe_nr, 6469 data_stripes, &stripe_index); 6470 if (mirror_num > 1) 6471 stripe_index = data_stripes + mirror_num - 2; 6472 6473 /* We distribute the parity blocks across stripes */ 6474 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 6475 &stripe_index); 6476 if (!need_full_stripe(op) && mirror_num <= 1) 6477 mirror_num = 1; 6478 } 6479 } else { 6480 /* 6481 * after this, stripe_nr is the number of stripes on this 6482 * device we have to walk to find the data, and stripe_index is 6483 * the number of our device in the stripe array 6484 */ 6485 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6486 &stripe_index); 6487 mirror_num = stripe_index + 1; 6488 } 6489 if (stripe_index >= map->num_stripes) { 6490 btrfs_crit(fs_info, 6491 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6492 stripe_index, map->num_stripes); 6493 ret = -EINVAL; 6494 goto out; 6495 } 6496 6497 num_alloc_stripes = num_stripes; 6498 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 6499 if (op == BTRFS_MAP_WRITE) 6500 num_alloc_stripes <<= 1; 6501 if (op == BTRFS_MAP_GET_READ_MIRRORS) 6502 num_alloc_stripes++; 6503 tgtdev_indexes = num_stripes; 6504 } 6505 6506 /* 6507 * If this I/O maps to a single device, try to return the device and 6508 * physical block information on the stack instead of allocating an 6509 * I/O context structure. 
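 *
 * In that case @smap is filled in, *bioc_ret is left NULL and no
 * btrfs_io_context is allocated at all.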
6510 */ 6511 if (smap && num_alloc_stripes == 1 && 6512 !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && 6513 (!need_full_stripe(op) || !dev_replace_is_ongoing || 6514 !dev_replace->tgtdev)) { 6515 if (patch_the_first_stripe_for_dev_replace) { 6516 smap->dev = dev_replace->tgtdev; 6517 smap->physical = physical_to_patch_in_first_stripe; 6518 *mirror_num_ret = map->num_stripes + 1; 6519 } else { 6520 set_io_stripe(smap, map, stripe_index, stripe_offset, 6521 stripe_nr); 6522 *mirror_num_ret = mirror_num; 6523 } 6524 *bioc_ret = NULL; 6525 ret = 0; 6526 goto out; 6527 } 6528 6529 bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); 6530 if (!bioc) { 6531 ret = -ENOMEM; 6532 goto out; 6533 } 6534 6535 for (i = 0; i < num_stripes; i++) { 6536 set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset, 6537 stripe_nr); 6538 stripe_index++; 6539 } 6540 6541 /* Build raid_map */ 6542 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 6543 (need_full_stripe(op) || mirror_num > 1)) { 6544 u64 tmp; 6545 unsigned rot; 6546 6547 /* Work out the disk rotation on this stripe-set */ 6548 div_u64_rem(stripe_nr, num_stripes, &rot); 6549 6550 /* Fill in the logical address of each stripe */ 6551 tmp = stripe_nr * data_stripes; 6552 for (i = 0; i < data_stripes; i++) 6553 bioc->raid_map[(i + rot) % num_stripes] = 6554 em->start + (tmp + i) * map->stripe_len; 6555 6556 bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; 6557 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 6558 bioc->raid_map[(i + rot + 1) % num_stripes] = 6559 RAID6_Q_STRIPE; 6560 6561 sort_parity_stripes(bioc, num_stripes); 6562 } 6563 6564 if (need_full_stripe(op)) 6565 max_errors = btrfs_chunk_max_errors(map); 6566 6567 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6568 need_full_stripe(op)) { 6569 handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, 6570 &num_stripes, &max_errors); 6571 } 6572 6573 *bioc_ret = bioc; 6574 bioc->map_type = map->type; 6575 bioc->num_stripes = num_stripes; 6576 bioc->max_errors = max_errors; 6577 bioc->mirror_num = mirror_num; 6578 6579 /* 6580 * this is the case that REQ_READ && dev_replace_is_ongoing && 6581 * mirror_num == num_stripes + 1 && dev_replace target drive is 6582 * available as a mirror 6583 */ 6584 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6585 WARN_ON(num_stripes > 1); 6586 bioc->stripes[0].dev = dev_replace->tgtdev; 6587 bioc->stripes[0].physical = physical_to_patch_in_first_stripe; 6588 bioc->mirror_num = map->num_stripes + 1; 6589 } 6590 out: 6591 if (dev_replace_is_ongoing) { 6592 lockdep_assert_held(&dev_replace->rwsem); 6593 /* Unlock and let waiting writers proceed */ 6594 up_read(&dev_replace->rwsem); 6595 } 6596 free_extent_map(em); 6597 return ret; 6598 } 6599 6600 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6601 u64 logical, u64 *length, 6602 struct btrfs_io_context **bioc_ret, int mirror_num) 6603 { 6604 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 6605 NULL, &mirror_num, 0); 6606 } 6607 6608 /* For Scrub/replace */ 6609 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6610 u64 logical, u64 *length, 6611 struct btrfs_io_context **bioc_ret) 6612 { 6613 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 6614 NULL, NULL, 1); 6615 } 6616 6617 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, 6618 const struct btrfs_fs_devices *fs_devices) 6619 { 6620 if 
(args->fsid == NULL) 6621 return true; 6622 if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) 6623 return true; 6624 return false; 6625 } 6626 6627 static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, 6628 const struct btrfs_device *device) 6629 { 6630 if (args->missing) { 6631 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && 6632 !device->bdev) 6633 return true; 6634 return false; 6635 } 6636 6637 if (device->devid != args->devid) 6638 return false; 6639 if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) 6640 return false; 6641 return true; 6642 } 6643 6644 /* 6645 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 6646 * return NULL. 6647 * 6648 * If devid and uuid are both specified, the match must be exact, otherwise 6649 * only devid is used. 6650 */ 6651 struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, 6652 const struct btrfs_dev_lookup_args *args) 6653 { 6654 struct btrfs_device *device; 6655 struct btrfs_fs_devices *seed_devs; 6656 6657 if (dev_args_match_fs_devices(args, fs_devices)) { 6658 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6659 if (dev_args_match_device(args, device)) 6660 return device; 6661 } 6662 } 6663 6664 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 6665 if (!dev_args_match_fs_devices(args, seed_devs)) 6666 continue; 6667 list_for_each_entry(device, &seed_devs->devices, dev_list) { 6668 if (dev_args_match_device(args, device)) 6669 return device; 6670 } 6671 } 6672 6673 return NULL; 6674 } 6675 6676 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6677 u64 devid, u8 *dev_uuid) 6678 { 6679 struct btrfs_device *device; 6680 unsigned int nofs_flag; 6681 6682 /* 6683 * We call this under the chunk_mutex, so we want to use NOFS for this 6684 * allocation, however we don't want to change btrfs_alloc_device() to 6685 * always do NOFS because we use it in a lot of other GFP_KERNEL safe 6686 * places. 6687 */ 6688 6689 nofs_flag = memalloc_nofs_save(); 6690 device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL); 6691 memalloc_nofs_restore(nofs_flag); 6692 if (IS_ERR(device)) 6693 return device; 6694 6695 list_add(&device->dev_list, &fs_devices->devices); 6696 device->fs_devices = fs_devices; 6697 fs_devices->num_devices++; 6698 6699 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6700 fs_devices->missing_devices++; 6701 6702 return device; 6703 } 6704 6705 /* 6706 * Allocate new device struct, set up devid and UUID. 6707 * 6708 * @fs_info: used only for generating a new devid, can be NULL if 6709 * devid is provided (i.e. @devid != NULL). 6710 * @devid: a pointer to devid for this device. If NULL a new devid 6711 * is generated. 6712 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6713 * is generated. 6714 * @path: a pointer to device path if available, NULL otherwise. 6715 * 6716 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6717 * on error. Returned struct is not linked onto any lists and must be 6718 * destroyed with btrfs_free_device. 
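 *
 * The @path string, when provided, is duplicated with rcu_string_strdup(),
 * so the caller's buffer is not referenced after this returns.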
6719 */ 6720 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 6721 const u64 *devid, const u8 *uuid, 6722 const char *path) 6723 { 6724 struct btrfs_device *dev; 6725 u64 tmp; 6726 6727 if (WARN_ON(!devid && !fs_info)) 6728 return ERR_PTR(-EINVAL); 6729 6730 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 6731 if (!dev) 6732 return ERR_PTR(-ENOMEM); 6733 6734 INIT_LIST_HEAD(&dev->dev_list); 6735 INIT_LIST_HEAD(&dev->dev_alloc_list); 6736 INIT_LIST_HEAD(&dev->post_commit_list); 6737 6738 atomic_set(&dev->dev_stats_ccnt, 0); 6739 btrfs_device_data_ordered_init(dev); 6740 extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); 6741 6742 if (devid) 6743 tmp = *devid; 6744 else { 6745 int ret; 6746 6747 ret = find_next_devid(fs_info, &tmp); 6748 if (ret) { 6749 btrfs_free_device(dev); 6750 return ERR_PTR(ret); 6751 } 6752 } 6753 dev->devid = tmp; 6754 6755 if (uuid) 6756 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 6757 else 6758 generate_random_uuid(dev->uuid); 6759 6760 if (path) { 6761 struct rcu_string *name; 6762 6763 name = rcu_string_strdup(path, GFP_KERNEL); 6764 if (!name) { 6765 btrfs_free_device(dev); 6766 return ERR_PTR(-ENOMEM); 6767 } 6768 rcu_assign_pointer(dev->name, name); 6769 } 6770 6771 return dev; 6772 } 6773 6774 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 6775 u64 devid, u8 *uuid, bool error) 6776 { 6777 if (error) 6778 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 6779 devid, uuid); 6780 else 6781 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 6782 devid, uuid); 6783 } 6784 6785 u64 btrfs_calc_stripe_length(const struct extent_map *em) 6786 { 6787 const struct map_lookup *map = em->map_lookup; 6788 const int data_stripes = calc_data_stripes(map->type, map->num_stripes); 6789 6790 return div_u64(em->len, data_stripes); 6791 } 6792 6793 #if BITS_PER_LONG == 32 6794 /* 6795 * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE 6796 * can't be accessed on 32bit systems. 6797 * 6798 * This function do mount time check to reject the fs if it already has 6799 * metadata chunk beyond that limit. 6800 */ 6801 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 6802 u64 logical, u64 length, u64 type) 6803 { 6804 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 6805 return 0; 6806 6807 if (logical + length < MAX_LFS_FILESIZE) 6808 return 0; 6809 6810 btrfs_err_32bit_limit(fs_info); 6811 return -EOVERFLOW; 6812 } 6813 6814 /* 6815 * This is to give early warning for any metadata chunk reaching 6816 * BTRFS_32BIT_EARLY_WARN_THRESHOLD. 6817 * Although we can still access the metadata, it's not going to be possible 6818 * once the limit is reached. 
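 *
 * Unlike check_32bit_meta_chunk() above, this only warns and the chunk
 * is still accepted.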
6819 */ 6820 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 6821 u64 logical, u64 length, u64 type) 6822 { 6823 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 6824 return; 6825 6826 if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) 6827 return; 6828 6829 btrfs_warn_32bit_limit(fs_info); 6830 } 6831 #endif 6832 6833 static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, 6834 u64 devid, u8 *uuid) 6835 { 6836 struct btrfs_device *dev; 6837 6838 if (!btrfs_test_opt(fs_info, DEGRADED)) { 6839 btrfs_report_missing_device(fs_info, devid, uuid, true); 6840 return ERR_PTR(-ENOENT); 6841 } 6842 6843 dev = add_missing_dev(fs_info->fs_devices, devid, uuid); 6844 if (IS_ERR(dev)) { 6845 btrfs_err(fs_info, "failed to init missing device %llu: %ld", 6846 devid, PTR_ERR(dev)); 6847 return dev; 6848 } 6849 btrfs_report_missing_device(fs_info, devid, uuid, false); 6850 6851 return dev; 6852 } 6853 6854 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 6855 struct btrfs_chunk *chunk) 6856 { 6857 BTRFS_DEV_LOOKUP_ARGS(args); 6858 struct btrfs_fs_info *fs_info = leaf->fs_info; 6859 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 6860 struct map_lookup *map; 6861 struct extent_map *em; 6862 u64 logical; 6863 u64 length; 6864 u64 devid; 6865 u64 type; 6866 u8 uuid[BTRFS_UUID_SIZE]; 6867 int index; 6868 int num_stripes; 6869 int ret; 6870 int i; 6871 6872 logical = key->offset; 6873 length = btrfs_chunk_length(leaf, chunk); 6874 type = btrfs_chunk_type(leaf, chunk); 6875 index = btrfs_bg_flags_to_raid_index(type); 6876 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 6877 6878 #if BITS_PER_LONG == 32 6879 ret = check_32bit_meta_chunk(fs_info, logical, length, type); 6880 if (ret < 0) 6881 return ret; 6882 warn_32bit_meta_chunk(fs_info, logical, length, type); 6883 #endif 6884 6885 /* 6886 * Only need to verify chunk item if we're reading from sys chunk array, 6887 * as chunk item in tree block is already verified by tree-checker. 6888 */ 6889 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 6890 ret = btrfs_check_chunk_valid(leaf, chunk, logical); 6891 if (ret) 6892 return ret; 6893 } 6894 6895 read_lock(&map_tree->lock); 6896 em = lookup_extent_mapping(map_tree, logical, 1); 6897 read_unlock(&map_tree->lock); 6898 6899 /* already mapped? */ 6900 if (em && em->start <= logical && em->start + em->len > logical) { 6901 free_extent_map(em); 6902 return 0; 6903 } else if (em) { 6904 free_extent_map(em); 6905 } 6906 6907 em = alloc_extent_map(); 6908 if (!em) 6909 return -ENOMEM; 6910 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 6911 if (!map) { 6912 free_extent_map(em); 6913 return -ENOMEM; 6914 } 6915 6916 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 6917 em->map_lookup = map; 6918 em->start = logical; 6919 em->len = length; 6920 em->orig_start = 0; 6921 em->block_start = 0; 6922 em->block_len = em->len; 6923 6924 map->num_stripes = num_stripes; 6925 map->io_width = btrfs_chunk_io_width(leaf, chunk); 6926 map->io_align = btrfs_chunk_io_align(leaf, chunk); 6927 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 6928 map->type = type; 6929 /* 6930 * We can't use the sub_stripes value, as for profiles other than 6931 * RAID10, they may have 0 as sub_stripes for filesystems created by 6932 * older mkfs (<v5.4). 6933 * In that case, it can cause divide-by-zero errors later. 6934 * Since currently sub_stripes is fixed for each profile, let's 6935 * use the trusted value instead. 
6936 */ 6937 map->sub_stripes = btrfs_raid_array[index].sub_stripes; 6938 map->verified_stripes = 0; 6939 em->orig_block_len = btrfs_calc_stripe_length(em); 6940 for (i = 0; i < num_stripes; i++) { 6941 map->stripes[i].physical = 6942 btrfs_stripe_offset_nr(leaf, chunk, i); 6943 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 6944 args.devid = devid; 6945 read_extent_buffer(leaf, uuid, (unsigned long) 6946 btrfs_stripe_dev_uuid_nr(chunk, i), 6947 BTRFS_UUID_SIZE); 6948 args.uuid = uuid; 6949 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); 6950 if (!map->stripes[i].dev) { 6951 map->stripes[i].dev = handle_missing_device(fs_info, 6952 devid, uuid); 6953 if (IS_ERR(map->stripes[i].dev)) { 6954 ret = PTR_ERR(map->stripes[i].dev); 6955 free_extent_map(em); 6956 return ret; 6957 } 6958 } 6959 6960 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 6961 &(map->stripes[i].dev->dev_state)); 6962 } 6963 6964 write_lock(&map_tree->lock); 6965 ret = add_extent_mapping(map_tree, em, 0); 6966 write_unlock(&map_tree->lock); 6967 if (ret < 0) { 6968 btrfs_err(fs_info, 6969 "failed to add chunk map, start=%llu len=%llu: %d", 6970 em->start, em->len, ret); 6971 } 6972 free_extent_map(em); 6973 6974 return ret; 6975 } 6976 6977 static void fill_device_from_item(struct extent_buffer *leaf, 6978 struct btrfs_dev_item *dev_item, 6979 struct btrfs_device *device) 6980 { 6981 unsigned long ptr; 6982 6983 device->devid = btrfs_device_id(leaf, dev_item); 6984 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6985 device->total_bytes = device->disk_total_bytes; 6986 device->commit_total_bytes = device->disk_total_bytes; 6987 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6988 device->commit_bytes_used = device->bytes_used; 6989 device->type = btrfs_device_type(leaf, dev_item); 6990 device->io_align = btrfs_device_io_align(leaf, dev_item); 6991 device->io_width = btrfs_device_io_width(leaf, dev_item); 6992 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 6993 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 6994 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 6995 6996 ptr = btrfs_device_uuid(dev_item); 6997 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6998 } 6999 7000 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 7001 u8 *fsid) 7002 { 7003 struct btrfs_fs_devices *fs_devices; 7004 int ret; 7005 7006 lockdep_assert_held(&uuid_mutex); 7007 ASSERT(fsid); 7008 7009 /* This will match only for multi-device seed fs */ 7010 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) 7011 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 7012 return fs_devices; 7013 7014 7015 fs_devices = find_fsid(fsid, NULL); 7016 if (!fs_devices) { 7017 if (!btrfs_test_opt(fs_info, DEGRADED)) 7018 return ERR_PTR(-ENOENT); 7019 7020 fs_devices = alloc_fs_devices(fsid, NULL); 7021 if (IS_ERR(fs_devices)) 7022 return fs_devices; 7023 7024 fs_devices->seeding = true; 7025 fs_devices->opened = 1; 7026 return fs_devices; 7027 } 7028 7029 /* 7030 * Upon first call for a seed fs fsid, just create a private copy of the 7031 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list 7032 */ 7033 fs_devices = clone_fs_devices(fs_devices); 7034 if (IS_ERR(fs_devices)) 7035 return fs_devices; 7036 7037 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 7038 if (ret) { 7039 free_fs_devices(fs_devices); 7040 return ERR_PTR(ret); 7041 } 7042 7043 if (!fs_devices->seeding) { 7044 
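		/*
		 * The devices opened fine but this is not actually a seed
		 * filesystem, which is what a sprouted fs expects here.
		 */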
close_fs_devices(fs_devices); 7045 free_fs_devices(fs_devices); 7046 return ERR_PTR(-EINVAL); 7047 } 7048 7049 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); 7050 7051 return fs_devices; 7052 } 7053 7054 static int read_one_dev(struct extent_buffer *leaf, 7055 struct btrfs_dev_item *dev_item) 7056 { 7057 BTRFS_DEV_LOOKUP_ARGS(args); 7058 struct btrfs_fs_info *fs_info = leaf->fs_info; 7059 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7060 struct btrfs_device *device; 7061 u64 devid; 7062 int ret; 7063 u8 fs_uuid[BTRFS_FSID_SIZE]; 7064 u8 dev_uuid[BTRFS_UUID_SIZE]; 7065 7066 devid = btrfs_device_id(leaf, dev_item); 7067 args.devid = devid; 7068 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 7069 BTRFS_UUID_SIZE); 7070 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 7071 BTRFS_FSID_SIZE); 7072 args.uuid = dev_uuid; 7073 args.fsid = fs_uuid; 7074 7075 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 7076 fs_devices = open_seed_devices(fs_info, fs_uuid); 7077 if (IS_ERR(fs_devices)) 7078 return PTR_ERR(fs_devices); 7079 } 7080 7081 device = btrfs_find_device(fs_info->fs_devices, &args); 7082 if (!device) { 7083 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7084 btrfs_report_missing_device(fs_info, devid, 7085 dev_uuid, true); 7086 return -ENOENT; 7087 } 7088 7089 device = add_missing_dev(fs_devices, devid, dev_uuid); 7090 if (IS_ERR(device)) { 7091 btrfs_err(fs_info, 7092 "failed to add missing dev %llu: %ld", 7093 devid, PTR_ERR(device)); 7094 return PTR_ERR(device); 7095 } 7096 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 7097 } else { 7098 if (!device->bdev) { 7099 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7100 btrfs_report_missing_device(fs_info, 7101 devid, dev_uuid, true); 7102 return -ENOENT; 7103 } 7104 btrfs_report_missing_device(fs_info, devid, 7105 dev_uuid, false); 7106 } 7107 7108 if (!device->bdev && 7109 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 7110 /* 7111 * this happens when a device that was properly setup 7112 * in the device info lists suddenly goes bad. 
7113 * device->bdev is NULL, and so we have to set 7114 * device->missing to one here 7115 */ 7116 device->fs_devices->missing_devices++; 7117 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 7118 } 7119 7120 /* Move the device to its own fs_devices */ 7121 if (device->fs_devices != fs_devices) { 7122 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 7123 &device->dev_state)); 7124 7125 list_move(&device->dev_list, &fs_devices->devices); 7126 device->fs_devices->num_devices--; 7127 fs_devices->num_devices++; 7128 7129 device->fs_devices->missing_devices--; 7130 fs_devices->missing_devices++; 7131 7132 device->fs_devices = fs_devices; 7133 } 7134 } 7135 7136 if (device->fs_devices != fs_info->fs_devices) { 7137 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 7138 if (device->generation != 7139 btrfs_device_generation(leaf, dev_item)) 7140 return -EINVAL; 7141 } 7142 7143 fill_device_from_item(leaf, dev_item, device); 7144 if (device->bdev) { 7145 u64 max_total_bytes = bdev_nr_bytes(device->bdev); 7146 7147 if (device->total_bytes > max_total_bytes) { 7148 btrfs_err(fs_info, 7149 "device total_bytes should be at most %llu but found %llu", 7150 max_total_bytes, device->total_bytes); 7151 return -EINVAL; 7152 } 7153 } 7154 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 7155 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 7156 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 7157 device->fs_devices->total_rw_bytes += device->total_bytes; 7158 atomic64_add(device->total_bytes - device->bytes_used, 7159 &fs_info->free_chunk_space); 7160 } 7161 ret = 0; 7162 return ret; 7163 } 7164 7165 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 7166 { 7167 struct btrfs_super_block *super_copy = fs_info->super_copy; 7168 struct extent_buffer *sb; 7169 struct btrfs_disk_key *disk_key; 7170 struct btrfs_chunk *chunk; 7171 u8 *array_ptr; 7172 unsigned long sb_array_offset; 7173 int ret = 0; 7174 u32 num_stripes; 7175 u32 array_size; 7176 u32 len = 0; 7177 u32 cur_offset; 7178 u64 type; 7179 struct btrfs_key key; 7180 7181 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 7182 7183 /* 7184 * We allocated a dummy extent, just to use extent buffer accessors. 7185 * There will be unused space after BTRFS_SUPER_INFO_SIZE, but 7186 * that's fine, we will not go beyond system chunk array anyway. 
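 *
 * The array is then walked with two cursors kept in sync: array_ptr
 * into the superblock copy for the disk keys, and sb_array_offset into
 * the dummy extent buffer for the chunk item accessors.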
7187 */ 7188 sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET); 7189 if (!sb) 7190 return -ENOMEM; 7191 set_extent_buffer_uptodate(sb); 7192 7193 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 7194 array_size = btrfs_super_sys_array_size(super_copy); 7195 7196 array_ptr = super_copy->sys_chunk_array; 7197 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 7198 cur_offset = 0; 7199 7200 while (cur_offset < array_size) { 7201 disk_key = (struct btrfs_disk_key *)array_ptr; 7202 len = sizeof(*disk_key); 7203 if (cur_offset + len > array_size) 7204 goto out_short_read; 7205 7206 btrfs_disk_key_to_cpu(&key, disk_key); 7207 7208 array_ptr += len; 7209 sb_array_offset += len; 7210 cur_offset += len; 7211 7212 if (key.type != BTRFS_CHUNK_ITEM_KEY) { 7213 btrfs_err(fs_info, 7214 "unexpected item type %u in sys_array at offset %u", 7215 (u32)key.type, cur_offset); 7216 ret = -EIO; 7217 break; 7218 } 7219 7220 chunk = (struct btrfs_chunk *)sb_array_offset; 7221 /* 7222 * At least one btrfs_chunk with one stripe must be present, 7223 * exact stripe count check comes afterwards 7224 */ 7225 len = btrfs_chunk_item_size(1); 7226 if (cur_offset + len > array_size) 7227 goto out_short_read; 7228 7229 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 7230 if (!num_stripes) { 7231 btrfs_err(fs_info, 7232 "invalid number of stripes %u in sys_array at offset %u", 7233 num_stripes, cur_offset); 7234 ret = -EIO; 7235 break; 7236 } 7237 7238 type = btrfs_chunk_type(sb, chunk); 7239 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 7240 btrfs_err(fs_info, 7241 "invalid chunk type %llu in sys_array at offset %u", 7242 type, cur_offset); 7243 ret = -EIO; 7244 break; 7245 } 7246 7247 len = btrfs_chunk_item_size(num_stripes); 7248 if (cur_offset + len > array_size) 7249 goto out_short_read; 7250 7251 ret = read_one_chunk(&key, sb, chunk); 7252 if (ret) 7253 break; 7254 7255 array_ptr += len; 7256 sb_array_offset += len; 7257 cur_offset += len; 7258 } 7259 clear_extent_buffer_uptodate(sb); 7260 free_extent_buffer_stale(sb); 7261 return ret; 7262 7263 out_short_read: 7264 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 7265 len, cur_offset); 7266 clear_extent_buffer_uptodate(sb); 7267 free_extent_buffer_stale(sb); 7268 return -EIO; 7269 } 7270 7271 /* 7272 * Check if all chunks in the fs are OK for read-write degraded mount 7273 * 7274 * If the @failing_dev is specified, it's accounted as missing. 7275 * 7276 * Return true if all chunks meet the minimal RW mount requirements. 7277 * Return false if any chunk doesn't meet the minimal RW mount requirements. 7278 */ 7279 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7280 struct btrfs_device *failing_dev) 7281 { 7282 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7283 struct extent_map *em; 7284 u64 next_start = 0; 7285 bool ret = true; 7286 7287 read_lock(&map_tree->lock); 7288 em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7289 read_unlock(&map_tree->lock); 7290 /* No chunk at all? 
Return false anyway */ 7291 if (!em) { 7292 ret = false; 7293 goto out; 7294 } 7295 while (em) { 7296 struct map_lookup *map; 7297 int missing = 0; 7298 int max_tolerated; 7299 int i; 7300 7301 map = em->map_lookup; 7302 max_tolerated = 7303 btrfs_get_num_tolerated_disk_barrier_failures( 7304 map->type); 7305 for (i = 0; i < map->num_stripes; i++) { 7306 struct btrfs_device *dev = map->stripes[i].dev; 7307 7308 if (!dev || !dev->bdev || 7309 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 7310 dev->last_flush_error) 7311 missing++; 7312 else if (failing_dev && failing_dev == dev) 7313 missing++; 7314 } 7315 if (missing > max_tolerated) { 7316 if (!failing_dev) 7317 btrfs_warn(fs_info, 7318 "chunk %llu missing %d devices, max tolerance is %d for writable mount", 7319 em->start, missing, max_tolerated); 7320 free_extent_map(em); 7321 ret = false; 7322 goto out; 7323 } 7324 next_start = extent_map_end(em); 7325 free_extent_map(em); 7326 7327 read_lock(&map_tree->lock); 7328 em = lookup_extent_mapping(map_tree, next_start, 7329 (u64)(-1) - next_start); 7330 read_unlock(&map_tree->lock); 7331 } 7332 out: 7333 return ret; 7334 } 7335 7336 static void readahead_tree_node_children(struct extent_buffer *node) 7337 { 7338 int i; 7339 const int nr_items = btrfs_header_nritems(node); 7340 7341 for (i = 0; i < nr_items; i++) 7342 btrfs_readahead_node_child(node, i); 7343 } 7344 7345 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 7346 { 7347 struct btrfs_root *root = fs_info->chunk_root; 7348 struct btrfs_path *path; 7349 struct extent_buffer *leaf; 7350 struct btrfs_key key; 7351 struct btrfs_key found_key; 7352 int ret; 7353 int slot; 7354 int iter_ret = 0; 7355 u64 total_dev = 0; 7356 u64 last_ra_node = 0; 7357 7358 path = btrfs_alloc_path(); 7359 if (!path) 7360 return -ENOMEM; 7361 7362 /* 7363 * uuid_mutex is needed only if we are mounting a sprout FS 7364 * otherwise we don't need it. 7365 */ 7366 mutex_lock(&uuid_mutex); 7367 7368 /* 7369 * It is possible for mount and umount to race in such a way that 7370 * we execute this code path, but open_fs_devices failed to clear 7371 * total_rw_bytes. We certainly want it cleared before reading the 7372 * device items, so clear it here. 7373 */ 7374 fs_info->fs_devices->total_rw_bytes = 0; 7375 7376 /* 7377 * Lockdep complains about possible circular locking dependency between 7378 * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores 7379 * used for freeze procection of a fs (struct super_block.s_writers), 7380 * which we take when starting a transaction, and extent buffers of the 7381 * chunk tree if we call read_one_dev() while holding a lock on an 7382 * extent buffer of the chunk tree. Since we are mounting the filesystem 7383 * and at this point there can't be any concurrent task modifying the 7384 * chunk tree, to keep it simple, just skip locking on the chunk tree. 7385 */ 7386 ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); 7387 path->skip_locking = 1; 7388 7389 /* 7390 * Read all device items, and then all the chunk items. All 7391 * device items are found before any chunk item (their object id 7392 * is smaller than the lowest possible object id for a chunk 7393 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
7394 */ 7395 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 7396 key.offset = 0; 7397 key.type = 0; 7398 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 7399 struct extent_buffer *node = path->nodes[1]; 7400 7401 leaf = path->nodes[0]; 7402 slot = path->slots[0]; 7403 7404 if (node) { 7405 if (last_ra_node != node->start) { 7406 readahead_tree_node_children(node); 7407 last_ra_node = node->start; 7408 } 7409 } 7410 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 7411 struct btrfs_dev_item *dev_item; 7412 dev_item = btrfs_item_ptr(leaf, slot, 7413 struct btrfs_dev_item); 7414 ret = read_one_dev(leaf, dev_item); 7415 if (ret) 7416 goto error; 7417 total_dev++; 7418 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 7419 struct btrfs_chunk *chunk; 7420 7421 /* 7422 * We are only called at mount time, so no need to take 7423 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, 7424 * we always lock first fs_info->chunk_mutex before 7425 * acquiring any locks on the chunk tree. This is a 7426 * requirement for chunk allocation, see the comment on 7427 * top of btrfs_chunk_alloc() for details. 7428 */ 7429 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 7430 ret = read_one_chunk(&found_key, leaf, chunk); 7431 if (ret) 7432 goto error; 7433 } 7434 } 7435 /* Catch error found during iteration */ 7436 if (iter_ret < 0) { 7437 ret = iter_ret; 7438 goto error; 7439 } 7440 7441 /* 7442 * After loading chunk tree, we've got all device information, 7443 * do another round of validation checks. 7444 */ 7445 if (total_dev != fs_info->fs_devices->total_devices) { 7446 btrfs_warn(fs_info, 7447 "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", 7448 btrfs_super_num_devices(fs_info->super_copy), 7449 total_dev); 7450 fs_info->fs_devices->total_devices = total_dev; 7451 btrfs_set_super_num_devices(fs_info->super_copy, total_dev); 7452 } 7453 if (btrfs_super_total_bytes(fs_info->super_copy) < 7454 fs_info->fs_devices->total_rw_bytes) { 7455 btrfs_err(fs_info, 7456 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 7457 btrfs_super_total_bytes(fs_info->super_copy), 7458 fs_info->fs_devices->total_rw_bytes); 7459 ret = -EINVAL; 7460 goto error; 7461 } 7462 ret = 0; 7463 error: 7464 mutex_unlock(&uuid_mutex); 7465 7466 btrfs_free_path(path); 7467 return ret; 7468 } 7469 7470 int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 7471 { 7472 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7473 struct btrfs_device *device; 7474 int ret = 0; 7475 7476 fs_devices->fs_info = fs_info; 7477 7478 mutex_lock(&fs_devices->device_list_mutex); 7479 list_for_each_entry(device, &fs_devices->devices, dev_list) 7480 device->fs_info = fs_info; 7481 7482 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7483 list_for_each_entry(device, &seed_devs->devices, dev_list) { 7484 device->fs_info = fs_info; 7485 ret = btrfs_get_dev_zone_info(device, false); 7486 if (ret) 7487 break; 7488 } 7489 7490 seed_devs->fs_info = fs_info; 7491 } 7492 mutex_unlock(&fs_devices->device_list_mutex); 7493 7494 return ret; 7495 } 7496 7497 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, 7498 const struct btrfs_dev_stats_item *ptr, 7499 int index) 7500 { 7501 u64 val; 7502 7503 read_extent_buffer(eb, &val, 7504 offsetof(struct btrfs_dev_stats_item, values) + 7505 ((unsigned long)ptr) + (index * sizeof(u64)), 7506 sizeof(val)); 7507 return val; 7508 } 7509 7510 static void 
btrfs_set_dev_stats_value(struct extent_buffer *eb, 7511 struct btrfs_dev_stats_item *ptr, 7512 int index, u64 val) 7513 { 7514 write_extent_buffer(eb, &val, 7515 offsetof(struct btrfs_dev_stats_item, values) + 7516 ((unsigned long)ptr) + (index * sizeof(u64)), 7517 sizeof(val)); 7518 } 7519 7520 static int btrfs_device_init_dev_stats(struct btrfs_device *device, 7521 struct btrfs_path *path) 7522 { 7523 struct btrfs_dev_stats_item *ptr; 7524 struct extent_buffer *eb; 7525 struct btrfs_key key; 7526 int item_size; 7527 int i, ret, slot; 7528 7529 if (!device->fs_info->dev_root) 7530 return 0; 7531 7532 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7533 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7534 key.offset = device->devid; 7535 ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); 7536 if (ret) { 7537 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7538 btrfs_dev_stat_set(device, i, 0); 7539 device->dev_stats_valid = 1; 7540 btrfs_release_path(path); 7541 return ret < 0 ? ret : 0; 7542 } 7543 slot = path->slots[0]; 7544 eb = path->nodes[0]; 7545 item_size = btrfs_item_size(eb, slot); 7546 7547 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); 7548 7549 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7550 if (item_size >= (1 + i) * sizeof(__le64)) 7551 btrfs_dev_stat_set(device, i, 7552 btrfs_dev_stats_value(eb, ptr, i)); 7553 else 7554 btrfs_dev_stat_set(device, i, 0); 7555 } 7556 7557 device->dev_stats_valid = 1; 7558 btrfs_dev_stat_print_on_load(device); 7559 btrfs_release_path(path); 7560 7561 return 0; 7562 } 7563 7564 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7565 { 7566 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7567 struct btrfs_device *device; 7568 struct btrfs_path *path = NULL; 7569 int ret = 0; 7570 7571 path = btrfs_alloc_path(); 7572 if (!path) 7573 return -ENOMEM; 7574 7575 mutex_lock(&fs_devices->device_list_mutex); 7576 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7577 ret = btrfs_device_init_dev_stats(device, path); 7578 if (ret) 7579 goto out; 7580 } 7581 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7582 list_for_each_entry(device, &seed_devs->devices, dev_list) { 7583 ret = btrfs_device_init_dev_stats(device, path); 7584 if (ret) 7585 goto out; 7586 } 7587 } 7588 out: 7589 mutex_unlock(&fs_devices->device_list_mutex); 7590 7591 btrfs_free_path(path); 7592 return ret; 7593 } 7594 7595 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7596 struct btrfs_device *device) 7597 { 7598 struct btrfs_fs_info *fs_info = trans->fs_info; 7599 struct btrfs_root *dev_root = fs_info->dev_root; 7600 struct btrfs_path *path; 7601 struct btrfs_key key; 7602 struct extent_buffer *eb; 7603 struct btrfs_dev_stats_item *ptr; 7604 int ret; 7605 int i; 7606 7607 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7608 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7609 key.offset = device->devid; 7610 7611 path = btrfs_alloc_path(); 7612 if (!path) 7613 return -ENOMEM; 7614 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7615 if (ret < 0) { 7616 btrfs_warn_in_rcu(fs_info, 7617 "error %d while searching for dev_stats item for device %s", 7618 ret, btrfs_dev_name(device)); 7619 goto out; 7620 } 7621 7622 if (ret == 0 && 7623 btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7624 /* need to delete old one and insert a new one */ 7625 ret = btrfs_del_item(trans, dev_root, path); 7626 if (ret != 0) { 7627 btrfs_warn_in_rcu(fs_info, 7628 "delete too small dev_stats 
item for device %s failed %d", 7629 btrfs_dev_name(device), ret); 7630 goto out; 7631 } 7632 ret = 1; 7633 } 7634 7635 if (ret == 1) { 7636 /* need to insert a new item */ 7637 btrfs_release_path(path); 7638 ret = btrfs_insert_empty_item(trans, dev_root, path, 7639 &key, sizeof(*ptr)); 7640 if (ret < 0) { 7641 btrfs_warn_in_rcu(fs_info, 7642 "insert dev_stats item for device %s failed %d", 7643 btrfs_dev_name(device), ret); 7644 goto out; 7645 } 7646 } 7647 7648 eb = path->nodes[0]; 7649 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7650 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7651 btrfs_set_dev_stats_value(eb, ptr, i, 7652 btrfs_dev_stat_read(device, i)); 7653 btrfs_mark_buffer_dirty(eb); 7654 7655 out: 7656 btrfs_free_path(path); 7657 return ret; 7658 } 7659 7660 /* 7661 * Called from commit_transaction(). Writes all changed device stats to disk. 7662 */ 7663 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) 7664 { 7665 struct btrfs_fs_info *fs_info = trans->fs_info; 7666 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7667 struct btrfs_device *device; 7668 int stats_cnt; 7669 int ret = 0; 7670 7671 mutex_lock(&fs_devices->device_list_mutex); 7672 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7673 stats_cnt = atomic_read(&device->dev_stats_ccnt); 7674 if (!device->dev_stats_valid || stats_cnt == 0) 7675 continue; 7676 7677 7678 /* 7679 * There is a LOAD-LOAD control dependency between the value of 7680 * dev_stats_ccnt and updating the on-disk values which requires 7681 * reading the in-memory counters. Such control dependencies 7682 * require explicit read memory barriers. 7683 * 7684 * This memory barrier pairs with smp_mb__before_atomic in 7685 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full 7686 * barrier implied by atomic_xchg in 7687 * btrfs_dev_stats_read_and_reset(). 7688 */ 7689 smp_rmb(); 7690 7691 ret = update_dev_stat_item(trans, device); 7692 if (!ret) 7693 atomic_sub(stats_cnt, &device->dev_stats_ccnt); 7694 } 7695 mutex_unlock(&fs_devices->device_list_mutex); 7696 7697 return ret; 7698 } 7699 7700 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 7701 { 7702 btrfs_dev_stat_inc(dev, index); 7703 7704 if (!dev->dev_stats_valid) 7705 return; 7706 btrfs_err_rl_in_rcu(dev->fs_info, 7707 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7708 btrfs_dev_name(dev), 7709 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7710 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7711 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7712 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7713 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7714 } 7715 7716 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 7717 { 7718 int i; 7719 7720 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7721 if (btrfs_dev_stat_read(dev, i) != 0) 7722 break; 7723 if (i == BTRFS_DEV_STAT_VALUES_MAX) 7724 return; /* all values == 0, suppress message */ 7725 7726 btrfs_info_in_rcu(dev->fs_info, 7727 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 7728 btrfs_dev_name(dev), 7729 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 7730 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 7731 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 7732 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 7733 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 7734 } 7735 7736 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, 7737 struct 
btrfs_ioctl_get_dev_stats *stats) 7738 { 7739 BTRFS_DEV_LOOKUP_ARGS(args); 7740 struct btrfs_device *dev; 7741 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7742 int i; 7743 7744 mutex_lock(&fs_devices->device_list_mutex); 7745 args.devid = stats->devid; 7746 dev = btrfs_find_device(fs_info->fs_devices, &args); 7747 mutex_unlock(&fs_devices->device_list_mutex); 7748 7749 if (!dev) { 7750 btrfs_warn(fs_info, "get dev_stats failed, device not found"); 7751 return -ENODEV; 7752 } else if (!dev->dev_stats_valid) { 7753 btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); 7754 return -ENODEV; 7755 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 7756 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7757 if (stats->nr_items > i) 7758 stats->values[i] = 7759 btrfs_dev_stat_read_and_reset(dev, i); 7760 else 7761 btrfs_dev_stat_set(dev, i, 0); 7762 } 7763 btrfs_info(fs_info, "device stats zeroed by %s (%d)", 7764 current->comm, task_pid_nr(current)); 7765 } else { 7766 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7767 if (stats->nr_items > i) 7768 stats->values[i] = btrfs_dev_stat_read(dev, i); 7769 } 7770 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 7771 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 7772 return 0; 7773 } 7774 7775 /* 7776 * Update the size and bytes used for each device where it changed. This is 7777 * delayed since we would otherwise get errors while writing out the 7778 * superblocks. 7779 * 7780 * Must be invoked during transaction commit. 7781 */ 7782 void btrfs_commit_device_sizes(struct btrfs_transaction *trans) 7783 { 7784 struct btrfs_device *curr, *next; 7785 7786 ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); 7787 7788 if (list_empty(&trans->dev_update_list)) 7789 return; 7790 7791 /* 7792 * We don't need the device_list_mutex here. This list is owned by the 7793 * transaction and the transaction must complete before the device is 7794 * released. 7795 */ 7796 mutex_lock(&trans->fs_info->chunk_mutex); 7797 list_for_each_entry_safe(curr, next, &trans->dev_update_list, 7798 post_commit_list) { 7799 list_del_init(&curr->post_commit_list); 7800 curr->commit_total_bytes = curr->disk_total_bytes; 7801 curr->commit_bytes_used = curr->bytes_used; 7802 } 7803 mutex_unlock(&trans->fs_info->chunk_mutex); 7804 } 7805 7806 /* 7807 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. 
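 *
 * The factor is the profile's ncopies value, i.e. how many copies of each
 * byte of data the profile stores (e.g. 2 for DUP), so callers can convert
 * between raw device bytes and logical bytes.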
7808 */ 7809 int btrfs_bg_type_to_factor(u64 flags) 7810 { 7811 const int index = btrfs_bg_flags_to_raid_index(flags); 7812 7813 return btrfs_raid_array[index].ncopies; 7814 } 7815 7816 7817 7818 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, 7819 u64 chunk_offset, u64 devid, 7820 u64 physical_offset, u64 physical_len) 7821 { 7822 struct btrfs_dev_lookup_args args = { .devid = devid }; 7823 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 7824 struct extent_map *em; 7825 struct map_lookup *map; 7826 struct btrfs_device *dev; 7827 u64 stripe_len; 7828 bool found = false; 7829 int ret = 0; 7830 int i; 7831 7832 read_lock(&em_tree->lock); 7833 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 7834 read_unlock(&em_tree->lock); 7835 7836 if (!em) { 7837 btrfs_err(fs_info, 7838 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", 7839 physical_offset, devid); 7840 ret = -EUCLEAN; 7841 goto out; 7842 } 7843 7844 map = em->map_lookup; 7845 stripe_len = btrfs_calc_stripe_length(em); 7846 if (physical_len != stripe_len) { 7847 btrfs_err(fs_info, 7848 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", 7849 physical_offset, devid, em->start, physical_len, 7850 stripe_len); 7851 ret = -EUCLEAN; 7852 goto out; 7853 } 7854 7855 /* 7856 * Very old mkfs.btrfs (before v4.1) will not respect the reserved 7857 * space. Although kernel can handle it without problem, better to warn 7858 * the users. 7859 */ 7860 if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED) 7861 btrfs_warn(fs_info, 7862 "devid %llu physical %llu len %llu inside the reserved space", 7863 devid, physical_offset, physical_len); 7864 7865 for (i = 0; i < map->num_stripes; i++) { 7866 if (map->stripes[i].dev->devid == devid && 7867 map->stripes[i].physical == physical_offset) { 7868 found = true; 7869 if (map->verified_stripes >= map->num_stripes) { 7870 btrfs_err(fs_info, 7871 "too many dev extents for chunk %llu found", 7872 em->start); 7873 ret = -EUCLEAN; 7874 goto out; 7875 } 7876 map->verified_stripes++; 7877 break; 7878 } 7879 } 7880 if (!found) { 7881 btrfs_err(fs_info, 7882 "dev extent physical offset %llu devid %llu has no corresponding chunk", 7883 physical_offset, devid); 7884 ret = -EUCLEAN; 7885 } 7886 7887 /* Make sure no dev extent is beyond device boundary */ 7888 dev = btrfs_find_device(fs_info->fs_devices, &args); 7889 if (!dev) { 7890 btrfs_err(fs_info, "failed to find devid %llu", devid); 7891 ret = -EUCLEAN; 7892 goto out; 7893 } 7894 7895 if (physical_offset + physical_len > dev->disk_total_bytes) { 7896 btrfs_err(fs_info, 7897 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", 7898 devid, physical_offset, physical_len, 7899 dev->disk_total_bytes); 7900 ret = -EUCLEAN; 7901 goto out; 7902 } 7903 7904 if (dev->zone_info) { 7905 u64 zone_size = dev->zone_info->zone_size; 7906 7907 if (!IS_ALIGNED(physical_offset, zone_size) || 7908 !IS_ALIGNED(physical_len, zone_size)) { 7909 btrfs_err(fs_info, 7910 "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", 7911 devid, physical_offset, physical_len); 7912 ret = -EUCLEAN; 7913 goto out; 7914 } 7915 } 7916 7917 out: 7918 free_extent_map(em); 7919 return ret; 7920 } 7921 7922 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) 7923 { 7924 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 7925 struct extent_map *em; 7926 struct rb_node *node; 7927 int ret = 0; 7928 7929 
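	/*
	 * Walk every cached chunk mapping and make sure each of its stripes
	 * was matched to a dev extent by verify_one_dev_extent(), i.e.
	 * verified_stripes has reached num_stripes for every chunk.
	 */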
read_lock(&em_tree->lock); 7930 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { 7931 em = rb_entry(node, struct extent_map, rb_node); 7932 if (em->map_lookup->num_stripes != 7933 em->map_lookup->verified_stripes) { 7934 btrfs_err(fs_info, 7935 "chunk %llu has missing dev extent, have %d expect %d", 7936 em->start, em->map_lookup->verified_stripes, 7937 em->map_lookup->num_stripes); 7938 ret = -EUCLEAN; 7939 goto out; 7940 } 7941 } 7942 out: 7943 read_unlock(&em_tree->lock); 7944 return ret; 7945 } 7946 7947 /* 7948 * Ensure that all dev extents are mapped to the correct chunk, otherwise 7949 * later chunk allocation/free would cause unexpected behavior. 7950 * 7951 * NOTE: This will iterate through the whole device tree, which should be 7952 * roughly the same size as the chunk tree, so this only slightly increases mount time. 7953 */ 7954 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) 7955 { 7956 struct btrfs_path *path; 7957 struct btrfs_root *root = fs_info->dev_root; 7958 struct btrfs_key key; 7959 u64 prev_devid = 0; 7960 u64 prev_dev_ext_end = 0; 7961 int ret = 0; 7962 7963 /* 7964 * We don't have a dev_root because we mounted with ignorebadroots and 7965 * failed to load the root, so we want to skip the verification in this 7966 * case for sure. 7967 * 7968 * However, if the dev root is fine but the tree itself is corrupted, 7969 * we'd still fail to mount. This verification is only to make sure 7970 * writes can happen safely, so instead just bypass this check 7971 * completely in the case of IGNOREBADROOTS. 7972 */ 7973 if (btrfs_test_opt(fs_info, IGNOREBADROOTS)) 7974 return 0; 7975 7976 key.objectid = 1; 7977 key.type = BTRFS_DEV_EXTENT_KEY; 7978 key.offset = 0; 7979 7980 path = btrfs_alloc_path(); 7981 if (!path) 7982 return -ENOMEM; 7983 7984 path->reada = READA_FORWARD; 7985 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 7986 if (ret < 0) 7987 goto out; 7988 7989 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 7990 ret = btrfs_next_leaf(root, path); 7991 if (ret < 0) 7992 goto out; 7993 /* No dev extents at all? 
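		 * Every filesystem has at least one dev extent backing its
		 * first chunks, so an empty device tree means corrupted
		 * metadata.
		 *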
Not good */ 7994 if (ret > 0) { 7995 ret = -EUCLEAN; 7996 goto out; 7997 } 7998 } 7999 while (1) { 8000 struct extent_buffer *leaf = path->nodes[0]; 8001 struct btrfs_dev_extent *dext; 8002 int slot = path->slots[0]; 8003 u64 chunk_offset; 8004 u64 physical_offset; 8005 u64 physical_len; 8006 u64 devid; 8007 8008 btrfs_item_key_to_cpu(leaf, &key, slot); 8009 if (key.type != BTRFS_DEV_EXTENT_KEY) 8010 break; 8011 devid = key.objectid; 8012 physical_offset = key.offset; 8013 8014 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); 8015 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); 8016 physical_len = btrfs_dev_extent_length(leaf, dext); 8017 8018 /* Check if this dev extent overlaps with the previous one */ 8019 if (devid == prev_devid && physical_offset < prev_dev_ext_end) { 8020 btrfs_err(fs_info, 8021 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", 8022 devid, physical_offset, prev_dev_ext_end); 8023 ret = -EUCLEAN; 8024 goto out; 8025 } 8026 8027 ret = verify_one_dev_extent(fs_info, chunk_offset, devid, 8028 physical_offset, physical_len); 8029 if (ret < 0) 8030 goto out; 8031 prev_devid = devid; 8032 prev_dev_ext_end = physical_offset + physical_len; 8033 8034 ret = btrfs_next_item(root, path); 8035 if (ret < 0) 8036 goto out; 8037 if (ret > 0) { 8038 ret = 0; 8039 break; 8040 } 8041 } 8042 8043 /* Ensure all chunks have corresponding dev extents */ 8044 ret = verify_chunk_dev_extent_mapping(fs_info); 8045 out: 8046 btrfs_free_path(path); 8047 return ret; 8048 } 8049 8050 /* 8051 * Check whether the given block group or device is pinned by any inode being 8052 * used as a swapfile. 8053 */ 8054 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) 8055 { 8056 struct btrfs_swapfile_pin *sp; 8057 struct rb_node *node; 8058 8059 spin_lock(&fs_info->swapfile_pins_lock); 8060 node = fs_info->swapfile_pins.rb_node; 8061 while (node) { 8062 sp = rb_entry(node, struct btrfs_swapfile_pin, node); 8063 if (ptr < sp->ptr) 8064 node = node->rb_left; 8065 else if (ptr > sp->ptr) 8066 node = node->rb_right; 8067 else 8068 break; 8069 } 8070 spin_unlock(&fs_info->swapfile_pins_lock); 8071 return node != NULL; 8072 } 8073 8074 static int relocating_repair_kthread(void *data) 8075 { 8076 struct btrfs_block_group *cache = data; 8077 struct btrfs_fs_info *fs_info = cache->fs_info; 8078 u64 target; 8079 int ret = 0; 8080 8081 target = cache->start; 8082 btrfs_put_block_group(cache); 8083 8084 sb_start_write(fs_info->sb); 8085 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { 8086 btrfs_info(fs_info, 8087 "zoned: skip relocating block group %llu to repair: EBUSY", 8088 target); 8089 sb_end_write(fs_info->sb); 8090 return -EBUSY; 8091 } 8092 8093 mutex_lock(&fs_info->reclaim_bgs_lock); 8094 8095 /* Ensure block group still exists */ 8096 cache = btrfs_lookup_block_group(fs_info, target); 8097 if (!cache) 8098 goto out; 8099 8100 if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) 8101 goto out; 8102 8103 ret = btrfs_may_alloc_data_chunk(fs_info, target); 8104 if (ret < 0) 8105 goto out; 8106 8107 btrfs_info(fs_info, 8108 "zoned: relocating block group %llu to repair IO failure", 8109 target); 8110 ret = btrfs_relocate_chunk(fs_info, target); 8111 8112 out: 8113 if (cache) 8114 btrfs_put_block_group(cache); 8115 mutex_unlock(&fs_info->reclaim_bgs_lock); 8116 btrfs_exclop_finish(fs_info); 8117 sb_end_write(fs_info->sb); 8118 8119 return ret; 8120 } 8121 8122 bool btrfs_repair_one_zone(struct btrfs_fs_info 
*fs_info, u64 logical) 8123 { 8124 struct btrfs_block_group *cache; 8125 8126 if (!btrfs_is_zoned(fs_info)) 8127 return false; 8128 8129 /* Do not attempt to repair in degraded state */ 8130 if (btrfs_test_opt(fs_info, DEGRADED)) 8131 return true; 8132 8133 cache = btrfs_lookup_block_group(fs_info, logical); 8134 if (!cache) 8135 return true; 8136 8137 if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) { 8138 btrfs_put_block_group(cache); 8139 return true; 8140 } 8141 8142 kthread_run(relocating_repair_kthread, cache, 8143 "btrfs-relocating-repair"); 8144 8145 return true; 8146 } 8147
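
/*
 * Illustration only, not kernel code: a minimal userspace sketch of how the
 * statistics filled in by btrfs_get_dev_stats() above can be queried through
 * the BTRFS_IOC_GET_DEV_STATS ioctl. It assumes the uapi definitions from
 * <linux/btrfs.h> (struct btrfs_ioctl_get_dev_stats and the BTRFS_DEV_STAT_*
 * indexes) and an open file descriptor anywhere on the mounted filesystem;
 * error handling is reduced to a bare minimum. Setting BTRFS_DEV_STATS_RESET
 * in the flags field would additionally make btrfs_get_dev_stats() zero the
 * counters after reading them.
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/btrfs.h>
 *
 *	static int print_dev_stats(const char *mnt_path, __u64 devid)
 *	{
 *		struct btrfs_ioctl_get_dev_stats args;
 *		int fd = open(mnt_path, O_RDONLY);
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&args, 0, sizeof(args));
 *		args.devid = devid;
 *		args.nr_items = BTRFS_DEV_STAT_VALUES_MAX;
 *		if (ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &args) < 0) {
 *			close(fd);
 *			return -1;
 *		}
 *		printf("wr %llu rd %llu flush %llu corrupt %llu gen %llu\n",
 *		       (unsigned long long)args.values[BTRFS_DEV_STAT_WRITE_ERRS],
 *		       (unsigned long long)args.values[BTRFS_DEV_STAT_READ_ERRS],
 *		       (unsigned long long)args.values[BTRFS_DEV_STAT_FLUSH_ERRS],
 *		       (unsigned long long)args.values[BTRFS_DEV_STAT_CORRUPTION_ERRS],
 *		       (unsigned long long)args.values[BTRFS_DEV_STAT_GENERATION_ERRS]);
 *		close(fd);
 *		return 0;
 *	}
 */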