// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"

static struct bio_set btrfs_bioset;

#define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
					 BTRFS_BLOCK_GROUP_RAID10 | \
					 BTRFS_BLOCK_GROUP_RAID56_MASK)

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity	= 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity	= 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "single",
		.bg_flag	= 0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};
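
/*
 * Editorial note (illustrative, not from the original source): the table
 * above drives all of the profile math in this file. For example, RAID1 has
 * ncopies == 2 and nparity == 0, so a 1 GiB logical chunk consumes 2 GiB of
 * raw device space, while RAID5 (ncopies == 1, nparity == 1) pays only one
 * stripe of parity per horizontal stripe:
 *
 *	const struct btrfs_raid_attr *attr = &btrfs_raid_array[BTRFS_RAID_RAID1];
 *	u64 raw_bytes = logical_bytes * attr->ncopies;	// 2x for RAID1
 */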
"raid5", 145 .bg_flag = BTRFS_BLOCK_GROUP_RAID5, 146 .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, 147 }, 148 [BTRFS_RAID_RAID6] = { 149 .sub_stripes = 1, 150 .dev_stripes = 1, 151 .devs_max = 0, 152 .devs_min = 3, 153 .tolerated_failures = 2, 154 .devs_increment = 1, 155 .ncopies = 1, 156 .nparity = 2, 157 .raid_name = "raid6", 158 .bg_flag = BTRFS_BLOCK_GROUP_RAID6, 159 .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, 160 }, 161 }; 162 163 /* 164 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which 165 * can be used as index to access btrfs_raid_array[]. 166 */ 167 enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) 168 { 169 const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); 170 171 if (!profile) 172 return BTRFS_RAID_SINGLE; 173 174 return BTRFS_BG_FLAG_TO_INDEX(profile); 175 } 176 177 const char *btrfs_bg_type_to_raid_name(u64 flags) 178 { 179 const int index = btrfs_bg_flags_to_raid_index(flags); 180 181 if (index >= BTRFS_NR_RAID_TYPES) 182 return NULL; 183 184 return btrfs_raid_array[index].raid_name; 185 } 186 187 int btrfs_nr_parity_stripes(u64 type) 188 { 189 enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); 190 191 return btrfs_raid_array[index].nparity; 192 } 193 194 /* 195 * Fill @buf with textual description of @bg_flags, no more than @size_buf 196 * bytes including terminating null byte. 197 */ 198 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) 199 { 200 int i; 201 int ret; 202 char *bp = buf; 203 u64 flags = bg_flags; 204 u32 size_bp = size_buf; 205 206 if (!flags) { 207 strcpy(bp, "NONE"); 208 return; 209 } 210 211 #define DESCRIBE_FLAG(flag, desc) \ 212 do { \ 213 if (flags & (flag)) { \ 214 ret = snprintf(bp, size_bp, "%s|", (desc)); \ 215 if (ret < 0 || ret >= size_bp) \ 216 goto out_overflow; \ 217 size_bp -= ret; \ 218 bp += ret; \ 219 flags &= ~(flag); \ 220 } \ 221 } while (0) 222 223 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); 224 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); 225 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); 226 227 DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); 228 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 229 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, 230 btrfs_raid_array[i].raid_name); 231 #undef DESCRIBE_FLAG 232 233 if (flags) { 234 ret = snprintf(bp, size_bp, "0x%llx|", flags); 235 size_bp -= ret; 236 } 237 238 if (size_bp < size_buf) 239 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ 240 241 /* 242 * The text is trimmed, it's up to the caller to provide sufficiently 243 * large buffer 244 */ 245 out_overflow:; 246 } 247 248 static int init_first_rw_device(struct btrfs_trans_handle *trans); 249 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); 250 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 251 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 252 enum btrfs_map_op op, u64 logical, u64 *length, 253 struct btrfs_io_context **bioc_ret, 254 struct btrfs_io_stripe *smap, 255 int *mirror_num_ret, int need_raid_map); 256 257 /* 258 * Device locking 259 * ============== 260 * 261 * There are several mutexes that protect manipulation of devices and low-level 262 * structures like chunks but not block groups, extents or files 263 * 264 * uuid_mutex (global lock) 265 * ------------------------ 266 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from 267 * the SCAN_DEV ioctl registration or from mount either implicitly (the 

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op, u64 logical, u64 *length,
			     struct btrfs_io_context **bioc_ret,
			     struct btrfs_io_stripe *smap,
			     int *mirror_num_ret, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by eg.
 * scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */
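
/*
 * Sketch of the nesting order documented above (illustrative, not part of the
 * original source). A caller that needs several of these locks must acquire
 * them top-down:
 *
 *	mutex_lock(&uuid_mutex);
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	...
 *	mutex_unlock(&fs_devices->device_list_mutex);
 *	mutex_unlock(&uuid_mutex);
 *
 * Taking uuid_mutex while already holding device_list_mutex would invert the
 * documented order and risk a deadlock.
 */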

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}
	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by a device that
	 * has an outdated pair of fsid/metadata_uuid and
	 * CHANGING_FSID_V2 flag set.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (fs_devices->fsid_change &&
		    memcmp(fs_devices->metadata_uuid,
			   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
		    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
		      int flush, struct block_device **bdev,
		      struct btrfs_super_block **disk_super)
{
	int ret;

	*bdev = blkdev_get_by_path(device_path, flags, holder);

	if (IS_ERR(*bdev)) {
		ret = PTR_ERR(*bdev);
		goto error;
	}

	if (flush)
		sync_blockdev(*bdev);
	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
	if (ret) {
		blkdev_put(*bdev, flags);
		goto error;
	}
	invalidate_bdev(*bdev);
	*disk_super = btrfs_read_dev_super(*bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		blkdev_put(*bdev, flags);
		goto error;
	}

	return 0;

error:
	*bdev = NULL;
	return ret;
}

/**
 * Search and remove all stale devices (which are not mounted). When both
 * inputs are NULL, it will search and release all stale devices.
 *
 * @devt:	 Optional. When provided, it will release all unmounted
 *		 devices matching this devt only.
 * @skip_device: Optional. Will skip this device when searching for the stale
 *		 devices.
 *
 * Return:	0 for success or if @devt is 0.
 *		-EBUSY if @devt is a mounted device.
 *		-ENOENT if @devt does not match any device in the list.
 */
static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	if (devt)
		ret = -ENOENT;

	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (devt && devt != device->devt)
				continue;
			if (fs_devices->opened) {
				/* for an already deleted device return 0 */
				if (devt && ret != 0)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			ret = 0;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	return ret;
}

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, fmode_t flags,
			void *holder)
{
	struct block_device *bdev;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev, &disk_super);
	if (ret)
		return ret;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(bdev))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	if (!bdev_nonrot(bdev))
		fs_devices->rotating = true;

	if (bdev_max_discard_sectors(bdev))
		fs_devices->discardable = true;

	device->bdev = bdev;
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
	device->mode = flags;

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	blkdev_put(bdev, flags);

	return -EINVAL;
}

/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the
 * fs_devices being created with a disk that has already completed its fsid
 * change. Such a disk can belong to an fs which has its FSID changed or to
 * one which doesn't. Handle both cases here.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
			return fs_devices;
		}
	}

	return find_fsid(disk_super->fsid, NULL);
}

static struct btrfs_fs_devices *find_fsid_changed(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handles the case where the scanned device is part of an fs that had
	 * multiple successful changes of FSID but the currently scanned device
	 * didn't observe it. Meaning our fsid will be different from theirs.
	 * We need to handle two subcases:
	 *  1 - The fs still continues to have different METADATA/FSID uuids.
	 *  2 - The fs is switched back to its original FSID (METADATA/FSID
	 *      are equal).
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		/* Changed UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->fsid,
			   BTRFS_FSID_SIZE) != 0)
			return fs_devices;

		/* Unchanged UUIDs */
		if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
			   BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}

	return NULL;
}

static struct btrfs_fs_devices *find_fsid_reverted_metadata(
					struct btrfs_super_block *disk_super)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle the case where the scanned device is part of an fs whose last
	 * metadata UUID change reverted it to the original FSID. At the same
	 * time fs_devices was first created by another constituent device
	 * which didn't fully observe the operation. This results in a
	 * btrfs_fs_devices created with metadata/fsid different AND
	 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
	 * fs_devices equal to the FSID of the disk.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
			   BTRFS_FSID_SIZE) != 0 &&
		    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
			   BTRFS_FSID_SIZE) == 0 &&
		    fs_devices->fsid_change)
			return fs_devices;
	}

	return NULL;
}
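
/*
 * Editorial summary of the fsid lookup helpers above (added for clarity,
 * derived from their comments and from the dispatch in device_list_add()):
 *
 *   scanned super block state                   helper consulted
 *   ------------------------------------------  ------------------------------
 *   CHANGING_FSID_V2 set, no METADATA_UUID      find_fsid_inprogress()
 *   CHANGING_FSID_V2 set, METADATA_UUID set     find_fsid_changed()
 *   flag clear, METADATA_UUID set               find_fsid_with_metadata_uuid()
 *   flag clear, no METADATA_UUID                find_fsid_reverted_metadata(),
 *                                               then find_fsid() as fallback
 */
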
/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_t path_devt;
	int error;
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
					BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

	error = lookup_bdev(path, &path_devt);
	if (error)
		return ERR_PTR(error);

	if (fsid_change_in_progress) {
		if (!has_metadata_uuid)
			fs_devices = find_fsid_inprogress(disk_super);
		else
			fs_devices = find_fsid_changed(disk_super);
	} else if (has_metadata_uuid) {
		fs_devices = find_fsid_with_metadata_uuid(disk_super);
	} else {
		fs_devices = find_fsid_reverted_metadata(disk_super);
		if (!fs_devices)
			fs_devices = find_fsid(disk_super->fsid, NULL);
	}

	if (!fs_devices) {
		if (has_metadata_uuid)
			fs_devices = alloc_fs_devices(disk_super->fsid,
						      disk_super->metadata_uuid);
		else
			fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		fs_devices->fsid_change = fsid_change_in_progress;

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		struct btrfs_dev_lookup_args args = {
			.devid = devid,
			.uuid = disk_super->dev_item.uuid,
		};

		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, &args);

		/*
		 * If this disk has been pulled into an fs devices created by
		 * a device which had the CHANGING_FSID_V2 flag then replace the
		 * metadata_uuid/fsid values of the fs_devices.
		 */
		if (fs_devices->fsid_change &&
		    found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);

			if (has_metadata_uuid)
				memcpy(fs_devices->metadata_uuid,
				       disk_super->metadata_uuid,
				       BTRFS_FSID_SIZE);
			else
				memcpy(fs_devices->metadata_uuid,
				       disk_super->fsid, BTRFS_FSID_SIZE);

			fs_devices->fsid_change = false;
		}
	}

	if (!device) {
		if (fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			btrfs_free_device(device);
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(device->name, name);
		device->devt = path_devt;

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
"BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				current->comm, task_pid_nr(current));
		else
			pr_info(
"BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *	a. The same device disappeared and reappeared with
		 *	   different name. or
		 *	b. The missing-disk-which-was-replaced, has
		 *	   reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at any time.
		 */

		/*
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted. We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with same uuid and devid. We keep the one
			 * with larger generation number or the last-in if
			 * generations are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 *
		 * NOTE: the device->fs_info may not be reliable here so pass
		 * in a NULL to message helpers instead. This avoids a possible
		 * use-after-free when the fs_info and fs_info->sb are already
		 * torn down.
		 */
		if (device->bdev) {
			if (device->devt != path_devt) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(NULL,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, rcu_str_deref(device->name),
					  path, current->comm,
					  task_pid_nr(current));
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
		device->devt = path_devt;
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}
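
/*
 * Illustrative summary of device_list_add() outcomes (not part of the
 * original source):
 *
 *	bool new_device_added = false;
 *	device = device_list_add(path, disk_super, &new_device_added);
 *
 * - unknown fsid: a new fs_devices and device are allocated and registered,
 *   and new_device_added is set to true;
 * - known fsid but new devid: the device is added to the existing fs_devices;
 * - known devid with the same path: generation bookkeeping is refreshed;
 * - known devid with a different path while mounted: ERR_PTR(-EEXIST);
 * - lower generation than the registered copy while unmounted:
 *   ERR_PTR(-EEXIST), the newer copy is kept.
 */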

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	fs_devices = alloc_fs_devices(orig->fsid, NULL);
	if (IS_ERR(fs_devices))
		return fs_devices;

	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		struct rcu_string *name;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		/*
		 * This is ok to do without rcu read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name) {
			name = rcu_string_strdup(orig_dev->name->str,
						 GFP_KERNEL);
			if (!name) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			rcu_assign_pointer(device->name, name);
		}

		if (orig_dev->zone_info) {
			struct btrfs_zoned_device_info *zone_info;

			zone_info = btrfs_clone_dev_zone_info(orig_dev);
			if (!zone_info) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			device->zone_info = zone_info;
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}

static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
		 * in btrfs_init_dev_replace() so just continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}
}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_fs_devices *seed_dev;

	mutex_lock(&uuid_mutex);
	__btrfs_free_extra_devids(fs_devices, &latest_dev);

	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
		__btrfs_free_extra_devids(seed_dev, &latest_dev);

	fs_devices->latest_dev = latest_dev;

	mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
		clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		fs_devices->missing_devices--;
	}

	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	btrfs_destroy_dev_zone_info(device);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/*
	 * Reset the flush error record. We might have a transient flush error
	 * in this mount, and if so we aborted the current transaction and set
	 * the fs to an error state, guaranteeing no super blocks can be
	 * further committed. However that error might be transient and if we
	 * unmount the filesystem and mount it again, we should allow the
	 * mount to succeed (btrfs_check_rw_degradable() should not fail) - if
	 * after mounting the filesystem again we still get flush errors, then
	 * we will again abort any transaction and set the error state,
	 * guaranteeing no commits of unsafe super blocks.
	 */
	device->last_flush_error = 0;

	/* Verify the device is back in a pristine state */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
}

static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	lockdep_assert_held(&uuid_mutex);

	if (--fs_devices->opened > 0)
		return;

	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = false;
	fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	LIST_HEAD(list);
	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
	close_fs_devices(fs_devices);
	if (!fs_devices->opened)
		list_splice_init(&fs_devices->seed_list, &list);

	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
			   fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret;

		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
		} else if (ret == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
	}
	if (fs_devices->open_devices == 0)
		return -EINVAL;

	fs_devices->opened = 1;
	fs_devices->latest_dev = latest_dev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;

	return 0;
}

static int devid_cmp(void *priv, const struct list_head *a,
		     const struct list_head *b)
{
	const struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like open_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex
	 */

	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}

	return ret;
}

void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	struct page *page = virt_to_page(super);

	put_page(page);
}

static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr, u64 bytenr_orig)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}
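
/*
 * Worked example for the page arithmetic above (illustrative only): with
 * 4 KiB pages, the primary super block at bytenr 65536 maps to page index
 * 65536 >> PAGE_SHIFT == 16 with offset_in_page(65536) == 0, so the whole
 * structure is read through the block device's page cache without any call
 * to set_blocksize().
 */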

int btrfs_forget_devices(dev_t devt)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(devt, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
					   void *holder)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	u64 bytenr, bytenr_orig;
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	bytenr_orig = btrfs_sb_offset(0);
	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
	if (ret) {
		device = ERR_PTR(ret);
		goto error_bdev_put;
	}

	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device) && new_device_added)
		btrfs_free_stale_devices(device->devt, device);

	btrfs_release_disk_super(disk_super);

error_bdev_put:
	blkdev_put(bdev, flags);

	return device;
}

/*
 * Try to find a chunk that intersects [start, start + len] range and when one
 * such is found, record the end of it in *start
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
				    u64 len)
{
	u64 physical_start, physical_end;

	lockdep_assert_held(&device->fs_info->chunk_mutex);

	if (!find_first_extent_bit(&device->alloc_state, *start,
				   &physical_start, &physical_end,
				   CHUNK_ALLOCATED, NULL)) {
		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
			     physical_end - physical_start)) {
			*start = physical_end + 1;
			return true;
		}
	}
	return false;
}

static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED);
	case BTRFS_CHUNK_ALLOC_ZONED:
		/*
		 * We don't care about the starting region like the regular
		 * allocator, because we anyway use/reserve the first two
		 * zones for superblock logging.
		 */
		return ALIGN(start, device->zone_info->zone_size);
	default:
		BUG();
	}
}
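
/*
 * Example (illustrative only): on a regular device a search starting at
 * offset 0 is bumped to BTRFS_DEVICE_RANGE_RESERVED so dev extents never
 * overlap the primary super block area; on a zoned device with 256 MiB
 * zones, a start of 300 MiB is rounded up to the next zone boundary,
 * ALIGN(300 MiB, 256 MiB) == 512 MiB.
 */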

static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
					u64 *hole_start, u64 *hole_size,
					u64 num_bytes)
{
	u64 zone_size = device->zone_info->zone_size;
	u64 pos;
	int ret;
	bool changed = false;

	ASSERT(IS_ALIGNED(*hole_start, zone_size));

	while (*hole_size > 0) {
		pos = btrfs_find_allocatable_zones(device, *hole_start,
						   *hole_start + *hole_size,
						   num_bytes);
		if (pos != *hole_start) {
			*hole_size = *hole_start + *hole_size - pos;
			*hole_start = pos;
			changed = true;
			if (*hole_size < num_bytes)
				break;
		}

		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

		/* Range is ensured to be empty */
		if (!ret)
			return changed;

		/* Given hole range was invalid (outside of device) */
		if (ret == -ERANGE) {
			*hole_start += *hole_size;
			*hole_size = 0;
			return true;
		}

		*hole_start += zone_size;
		*hole_size -= zone_size;
		changed = true;
	}

	return changed;
}

/**
 * dev_extent_hole_check - check if specified hole is suitable for allocation
 * @device:	the device which we have the hole
 * @hole_start: starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns true if the hole position was updated,
 * false otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	for (;;) {
		/*
		 * Check before we set max_hole_start, otherwise we could end up
		 * sending back this offset anyway.
		 */
		if (contains_pending_extent(device, hole_start, *hole_size)) {
			if (hole_end >= *hole_start)
				*hole_size = hole_end - *hole_start;
			else
				*hole_size = 0;
			changed = true;
		}

		switch (device->fs_devices->chunk_alloc_policy) {
		case BTRFS_CHUNK_ALLOC_REGULAR:
			/* No extra check */
			break;
		case BTRFS_CHUNK_ALLOC_ZONED:
			if (dev_extent_hole_check_zoned(device, hole_start,
							hole_size, num_bytes)) {
				changed = true;
				/*
				 * The changed hole can contain pending extent.
				 * Loop again to check that.
				 */
				continue;
			}
			break;
		default:
			BUG();
		}

		break;
	}

	return changed;
}
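
/*
 * Worked example (illustrative only): given a hole [16 MiB, 48 MiB) and a
 * pending chunk recorded in device->alloc_state at [24 MiB, 32 MiB),
 * contains_pending_extent() advances *hole_start past the pending chunk and
 * dev_extent_hole_check() shrinks the hole to [32 MiB, 48 MiB), returning
 * true because the position changed.
 */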

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space
 * @len:	  the size of the free space that we find, or the size of the
 *		  max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to ensure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 */
static int find_free_dev_extent_start(struct btrfs_device *device,
				u64 num_bytes, u64 search_start, u64 *start,
				u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	search_start = dev_extent_search_start(device, search_start);

	WARN_ON(device->zone_info &&
		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	max_hole_start = search_start;
	max_hole_size = 0;

again:
	if (search_start >= search_end ||
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_backwards(root, &key, path);
	if (ret < 0)
		goto out;

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
			btrfs_release_path(path);
			goto again;
		}

		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
	}

	/* See above. */
	if (max_hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	/* FIXME use last free of some kind */
	return find_free_dev_extent_start(device, num_bytes, 0, start, len);
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret == 0)
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
out:
	btrfs_free_path(path);
	return ret;
}

static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct rb_node *n;
	u64 ret = 0;

	em_tree = &fs_info->mapping_tree;
	read_lock(&em_tree->lock);
	n = rb_last(&em_tree->map.rb_root);
	if (n) {
		em = rb_entry(n, struct extent_map, rb_node);
		ret = em->start + em->len;
	}
	read_unlock(&em_tree->lock);

	return ret;
}
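
/*
 * Example (illustrative only): if the chunk mapping tree currently contains
 * chunks at logical [1 GiB, 1 GiB + 256 MiB) and [2 GiB, 2 GiB + 1 GiB),
 * rb_last() finds the latter and find_next_chunk() returns 3 GiB, the
 * logical start used for the next chunk allocation.
 */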

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	if (ret == 0) {
		/* Corruption */
		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
		ret = -EUCLEAN;
		goto error;
	}

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * The device information is stored in the chunk root.
 * The btrfs_device struct should be fully filled in.
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
			      struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	btrfs_reserve_chunk_metadata(trans, true);
	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				      &key, sizeof(*dev_item));
	btrfs_trans_release_chunk_metadata(trans);
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
			    ptr, BTRFS_FSID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probe like libblkid.
 *
 * We don't care about errors here, this is just to be kind to userspace.
 */
static void update_dev_time(const char *device_path)
{
	struct path path;
	struct timespec64 now;
	int ret;

	ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
	if (ret)
		return;

	now = current_time(d_inode(path.dentry));
	inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
	path_put(&path);
}

static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
			     struct btrfs_device *device)
{
	struct btrfs_root *root = device->fs_info->chunk_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	btrfs_reserve_chunk_metadata(trans, false);
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	btrfs_trans_release_chunk_metadata(trans);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding eg. device
 * replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
{
	u64 all_avail;
	unsigned seq;
	int i;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
		if (!(all_avail & btrfs_raid_array[i].bg_flag))
			continue;

		if (num_devices < btrfs_raid_array[i].devs_min)
			return btrfs_raid_array[i].mindev_error;
	}

	return 0;
}
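
/*
 * Example (illustrative only): on a two-device filesystem using RAID1
 * (devs_min == 2 in btrfs_raid_array), btrfs_rm_device() below calls this
 * with num_devices - 1 == 1, so the check fails with
 * BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET before any data is relocated.
 */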
 */
static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
{
	u64 num_devices = fs_info->fs_devices->num_devices;

	down_read(&fs_info->dev_replace.rwsem);
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		ASSERT(num_devices > 1);
		num_devices--;
	}
	up_read(&fs_info->dev_replace.rwsem);

	return num_devices;
}

void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
			       struct block_device *bdev,
			       const char *device_path)
{
	struct btrfs_super_block *disk_super;
	int copy_num;

	if (!bdev)
		return;

	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
		struct page *page;
		int ret;

		disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
		if (IS_ERR(disk_super))
			continue;

		if (bdev_is_zoned(bdev)) {
			btrfs_reset_sb_log_zones(bdev, copy_num);
			continue;
		}

		memset(&disk_super->magic, 0, sizeof(disk_super->magic));

		page = virt_to_page(disk_super);
		set_page_dirty(page);
		lock_page(page);
		/* write_one_page() unlocks the page */
		ret = write_one_page(page);
		if (ret)
			btrfs_warn(fs_info,
				"error clearing superblock number %d (%d)",
				copy_num, ret);
		btrfs_release_disk_super(disk_super);

	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime for device path for libblkid */
	update_dev_time(device_path);
}

int btrfs_rm_device(struct btrfs_fs_info *fs_info,
		    struct btrfs_dev_lookup_args *args,
		    struct block_device **bdev, fmode_t *mode)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct btrfs_fs_devices *cur_devices;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u64 num_devices;
	int ret = 0;

	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
		btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
		return -EINVAL;
	}

	/*
	 * The device list in fs_devices is accessed without locks (neither
	 * uuid_mutex nor device_list_mutex) as it won't change on a mounted
	 * filesystem and another device rm cannot run.
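	 *
	 * The exclusion is established by the ioctl layer, roughly (sketch):
	 *
	 *	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE))
	 *		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
	 *	ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
	 *	btrfs_exclop_finish(fs_info);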
2088 */ 2089 num_devices = btrfs_num_devices(fs_info); 2090 2091 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2092 if (ret) 2093 return ret; 2094 2095 device = btrfs_find_device(fs_info->fs_devices, args); 2096 if (!device) { 2097 if (args->missing) 2098 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2099 else 2100 ret = -ENOENT; 2101 return ret; 2102 } 2103 2104 if (btrfs_pinned_by_swapfile(fs_info, device)) { 2105 btrfs_warn_in_rcu(fs_info, 2106 "cannot remove device %s (devid %llu) due to active swapfile", 2107 rcu_str_deref(device->name), device->devid); 2108 return -ETXTBSY; 2109 } 2110 2111 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 2112 return BTRFS_ERROR_DEV_TGT_REPLACE; 2113 2114 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2115 fs_info->fs_devices->rw_devices == 1) 2116 return BTRFS_ERROR_DEV_ONLY_WRITABLE; 2117 2118 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2119 mutex_lock(&fs_info->chunk_mutex); 2120 list_del_init(&device->dev_alloc_list); 2121 device->fs_devices->rw_devices--; 2122 mutex_unlock(&fs_info->chunk_mutex); 2123 } 2124 2125 ret = btrfs_shrink_device(device, 0); 2126 if (ret) 2127 goto error_undo; 2128 2129 trans = btrfs_start_transaction(fs_info->chunk_root, 0); 2130 if (IS_ERR(trans)) { 2131 ret = PTR_ERR(trans); 2132 goto error_undo; 2133 } 2134 2135 ret = btrfs_rm_dev_item(trans, device); 2136 if (ret) { 2137 /* Any error in dev item removal is critical */ 2138 btrfs_crit(fs_info, 2139 "failed to remove device item for devid %llu: %d", 2140 device->devid, ret); 2141 btrfs_abort_transaction(trans, ret); 2142 btrfs_end_transaction(trans); 2143 return ret; 2144 } 2145 2146 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2147 btrfs_scrub_cancel_dev(device); 2148 2149 /* 2150 * the device list mutex makes sure that we don't change 2151 * the device list while someone else is writing out all 2152 * the device supers. Whoever is writing all supers, should 2153 * lock the device list mutex before getting the number of 2154 * devices in the super block (super_copy). Conversely, 2155 * whoever updates the number of devices in the super block 2156 * (super_copy) should hold the device list mutex. 2157 */ 2158 2159 /* 2160 * In normal cases the cur_devices == fs_devices. But in case 2161 * of deleting a seed device, the cur_devices should point to 2162 * its own fs_devices listed under the fs_devices->seed_list. 2163 */ 2164 cur_devices = device->fs_devices; 2165 mutex_lock(&fs_devices->device_list_mutex); 2166 list_del_rcu(&device->dev_list); 2167 2168 cur_devices->num_devices--; 2169 cur_devices->total_devices--; 2170 /* Update total_devices of the parent fs_devices if it's seed */ 2171 if (cur_devices != fs_devices) 2172 fs_devices->total_devices--; 2173 2174 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 2175 cur_devices->missing_devices--; 2176 2177 btrfs_assign_next_active_device(device, NULL); 2178 2179 if (device->bdev) { 2180 cur_devices->open_devices--; 2181 /* remove sysfs entry */ 2182 btrfs_sysfs_remove_device(device); 2183 } 2184 2185 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 2186 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2187 mutex_unlock(&fs_devices->device_list_mutex); 2188 2189 /* 2190 * At this point, the device is zero sized and detached from the 2191 * devices list. All that's left is to zero out the old supers and 2192 * free the device. 
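	 *
	 * The caller is then expected to finish the teardown, roughly
	 * (sketch of the ioctl side):
	 *
	 *	ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
	 *	...
	 *	if (bdev)
	 *		blkdev_put(bdev, mode);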
	 *
	 * We cannot call btrfs_close_bdev() here because we're holding the sb
	 * write lock, and blkdev_put() will pull in the ->open_mutex on the
	 * block device and its dependencies. Instead just flush the device
	 * and let the caller do the final blkdev_put.
	 */
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		btrfs_scratch_superblocks(fs_info, device->bdev,
					  device->name->str);
		if (device->bdev) {
			sync_blockdev(device->bdev);
			invalidate_bdev(device->bdev);
		}
	}

	*bdev = device->bdev;
	*mode = device->mode;
	synchronize_rcu();
	btrfs_free_device(device);

	/*
	 * This can happen if cur_devices is the private seed devices list. We
	 * cannot call close_fs_devices() here because it expects the uuid_mutex
	 * to be held, but in fact we don't need that for the private
	 * seed_devices, we can simply decrement cur_devices->opened and then
	 * remove it from our list and free the fs_devices.
	 */
	if (cur_devices->num_devices == 0) {
		list_del_init(&cur_devices->seed_list);
		ASSERT(cur_devices->opened == 1);
		cur_devices->opened--;
		free_fs_devices(cur_devices);
	}

	ret = btrfs_commit_transaction(trans);

	return ret;

error_undo:
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		mutex_lock(&fs_info->chunk_mutex);
		list_add(&device->dev_alloc_list,
			 &fs_devices->alloc_list);
		device->fs_devices->rw_devices++;
		mutex_unlock(&fs_info->chunk_mutex);
	}
	return ret;
}

void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices;

	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);

	/*
	 * In case of a fs with no seed, srcdev->fs_devices will point to the
	 * fs_devices of fs_info. However when the dev being replaced is
	 * a seed dev it will point to the seed's local fs_devices. In short
	 * srcdev will have its correct fs_devices in both cases.
	 */
	fs_devices = srcdev->fs_devices;

	list_del_rcu(&srcdev->dev_list);
	list_del(&srcdev->dev_alloc_list);
	fs_devices->num_devices--;
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
		fs_devices->missing_devices--;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
		fs_devices->rw_devices--;

	if (srcdev->bdev)
		fs_devices->open_devices--;
}

void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;

	mutex_lock(&uuid_mutex);

	btrfs_close_bdev(srcdev);
	synchronize_rcu();
	btrfs_free_device(srcdev);

	/* If there are no devices left, delete the fs_devices. */
	if (!fs_devices->num_devices) {
		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target is added to the sprout FS, so there will be no more
		 * device left under the seed FS.
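		 *
		 * Illustrative example: a single-device seed on /dev/sdb
		 * sprouted onto /dev/sdc; once sdb is replaced, the seed's
		 * fs_devices reaches num_devices == 0 and is torn down here.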
2286 */ 2287 ASSERT(fs_devices->seeding); 2288 2289 list_del_init(&fs_devices->seed_list); 2290 close_fs_devices(fs_devices); 2291 free_fs_devices(fs_devices); 2292 } 2293 mutex_unlock(&uuid_mutex); 2294 } 2295 2296 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2297 { 2298 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2299 2300 mutex_lock(&fs_devices->device_list_mutex); 2301 2302 btrfs_sysfs_remove_device(tgtdev); 2303 2304 if (tgtdev->bdev) 2305 fs_devices->open_devices--; 2306 2307 fs_devices->num_devices--; 2308 2309 btrfs_assign_next_active_device(tgtdev, NULL); 2310 2311 list_del_rcu(&tgtdev->dev_list); 2312 2313 mutex_unlock(&fs_devices->device_list_mutex); 2314 2315 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, 2316 tgtdev->name->str); 2317 2318 btrfs_close_bdev(tgtdev); 2319 synchronize_rcu(); 2320 btrfs_free_device(tgtdev); 2321 } 2322 2323 /** 2324 * Populate args from device at path 2325 * 2326 * @fs_info: the filesystem 2327 * @args: the args to populate 2328 * @path: the path to the device 2329 * 2330 * This will read the super block of the device at @path and populate @args with 2331 * the devid, fsid, and uuid. This is meant to be used for ioctls that need to 2332 * lookup a device to operate on, but need to do it before we take any locks. 2333 * This properly handles the special case of "missing" that a user may pass in, 2334 * and does some basic sanity checks. The caller must make sure that @path is 2335 * properly NUL terminated before calling in, and must call 2336 * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and 2337 * uuid buffers. 2338 * 2339 * Return: 0 for success, -errno for failure 2340 */ 2341 int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, 2342 struct btrfs_dev_lookup_args *args, 2343 const char *path) 2344 { 2345 struct btrfs_super_block *disk_super; 2346 struct block_device *bdev; 2347 int ret; 2348 2349 if (!path || !path[0]) 2350 return -EINVAL; 2351 if (!strcmp(path, "missing")) { 2352 args->missing = true; 2353 return 0; 2354 } 2355 2356 args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); 2357 args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); 2358 if (!args->uuid || !args->fsid) { 2359 btrfs_put_dev_args_from_path(args); 2360 return -ENOMEM; 2361 } 2362 2363 ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, 2364 &bdev, &disk_super); 2365 if (ret) { 2366 btrfs_put_dev_args_from_path(args); 2367 return ret; 2368 } 2369 2370 args->devid = btrfs_stack_device_id(&disk_super->dev_item); 2371 memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); 2372 if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2373 memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); 2374 else 2375 memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 2376 btrfs_release_disk_super(disk_super); 2377 blkdev_put(bdev, FMODE_READ); 2378 return 0; 2379 } 2380 2381 /* 2382 * Only use this jointly with btrfs_get_dev_args_from_path() because we will 2383 * allocate our ->uuid and ->fsid pointers, everybody else uses local variables 2384 * that don't need to be freed. 
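 *
 * Typical pairing (sketch, mirroring btrfs_find_device_by_devspec() below):
 *
 *	BTRFS_DEV_LOOKUP_ARGS(args);
 *
 *	ret = btrfs_get_dev_args_from_path(fs_info, &args, path);
 *	if (ret)
 *		return ERR_PTR(ret);
 *	device = btrfs_find_device(fs_info->fs_devices, &args);
 *	btrfs_put_dev_args_from_path(&args);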
 */
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
{
	kfree(args->uuid);
	kfree(args->fsid);
	args->uuid = NULL;
	args->fsid = NULL;
}

struct btrfs_device *btrfs_find_device_by_devspec(
		struct btrfs_fs_info *fs_info, u64 devid,
		const char *device_path)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_device *device;
	int ret;

	if (devid) {
		args.devid = devid;
		device = btrfs_find_device(fs_info->fs_devices, &args);
		if (!device)
			return ERR_PTR(-ENOENT);
		return device;
	}

	ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
	if (ret)
		return ERR_PTR(ret);
	device = btrfs_find_device(fs_info->fs_devices, &args);
	btrfs_put_dev_args_from_path(&args);
	if (!device)
		return ERR_PTR(-ENOENT);
	return device;
}

static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;

	lockdep_assert_held(&uuid_mutex);
	if (!fs_devices->seeding)
		return ERR_PTR(-EINVAL);

	/*
	 * Private copy of the seed devices, anchored at
	 * fs_info->fs_devices->seed_list
	 */
	seed_devices = alloc_fs_devices(NULL, NULL);
	if (IS_ERR(seed_devices))
		return seed_devices;

	/*
	 * It's necessary to retain a copy of the original seed fs_devices in
	 * fs_uuids so that filesystems which have been seeded can successfully
	 * reference the seed device from open_seed_devices. This also supports
	 * multiple seed filesystems.
	 */
	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return old_devices;
	}

	list_add(&old_devices->fs_list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	return seed_devices;
}

/*
 * Splice seed devices into the sprout fs_devices.
 * Generate a new fsid for the sprouted read-write filesystem.
 */
static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
			       struct btrfs_fs_devices *seed_devices)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	/*
	 * We are updating the fsid, the thread leading to device_list_add()
	 * could race, so uuid_mutex is needed.
	 */
	lockdep_assert_held(&uuid_mutex);

	/*
	 * The threads listed below may traverse dev_list but can do that without
	 * device_list_mutex:
	 * - All device ops and balance - as we are in btrfs_exclop_start.
	 * - Various dev_list readers - are using RCU.
	 * - btrfs_ioctl_fitrim() - is using RCU.
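	 *   Those RCU readers follow the usual pattern, e.g. (sketch):
	 *	rcu_read_lock();
	 *	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list)
	 *		...;
	 *	rcu_read_unlock();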
2485 * 2486 * For-read threads as below are using device_list_mutex: 2487 * - Readonly scrub btrfs_scrub_dev() 2488 * - Readonly scrub btrfs_scrub_progress() 2489 * - btrfs_get_dev_stats() 2490 */ 2491 lockdep_assert_held(&fs_devices->device_list_mutex); 2492 2493 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2494 synchronize_rcu); 2495 list_for_each_entry(device, &seed_devices->devices, dev_list) 2496 device->fs_devices = seed_devices; 2497 2498 fs_devices->seeding = false; 2499 fs_devices->num_devices = 0; 2500 fs_devices->open_devices = 0; 2501 fs_devices->missing_devices = 0; 2502 fs_devices->rotating = false; 2503 list_add(&seed_devices->seed_list, &fs_devices->seed_list); 2504 2505 generate_random_uuid(fs_devices->fsid); 2506 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 2507 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2508 2509 super_flags = btrfs_super_flags(disk_super) & 2510 ~BTRFS_SUPER_FLAG_SEEDING; 2511 btrfs_set_super_flags(disk_super, super_flags); 2512 } 2513 2514 /* 2515 * Store the expected generation for seed devices in device items. 2516 */ 2517 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 2518 { 2519 BTRFS_DEV_LOOKUP_ARGS(args); 2520 struct btrfs_fs_info *fs_info = trans->fs_info; 2521 struct btrfs_root *root = fs_info->chunk_root; 2522 struct btrfs_path *path; 2523 struct extent_buffer *leaf; 2524 struct btrfs_dev_item *dev_item; 2525 struct btrfs_device *device; 2526 struct btrfs_key key; 2527 u8 fs_uuid[BTRFS_FSID_SIZE]; 2528 u8 dev_uuid[BTRFS_UUID_SIZE]; 2529 int ret; 2530 2531 path = btrfs_alloc_path(); 2532 if (!path) 2533 return -ENOMEM; 2534 2535 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2536 key.offset = 0; 2537 key.type = BTRFS_DEV_ITEM_KEY; 2538 2539 while (1) { 2540 btrfs_reserve_chunk_metadata(trans, false); 2541 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2542 btrfs_trans_release_chunk_metadata(trans); 2543 if (ret < 0) 2544 goto error; 2545 2546 leaf = path->nodes[0]; 2547 next_slot: 2548 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2549 ret = btrfs_next_leaf(root, path); 2550 if (ret > 0) 2551 break; 2552 if (ret < 0) 2553 goto error; 2554 leaf = path->nodes[0]; 2555 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2556 btrfs_release_path(path); 2557 continue; 2558 } 2559 2560 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2561 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2562 key.type != BTRFS_DEV_ITEM_KEY) 2563 break; 2564 2565 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2566 struct btrfs_dev_item); 2567 args.devid = btrfs_device_id(leaf, dev_item); 2568 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2569 BTRFS_UUID_SIZE); 2570 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2571 BTRFS_FSID_SIZE); 2572 args.uuid = dev_uuid; 2573 args.fsid = fs_uuid; 2574 device = btrfs_find_device(fs_info->fs_devices, &args); 2575 BUG_ON(!device); /* Logic error */ 2576 2577 if (device->fs_devices->seeding) { 2578 btrfs_set_device_generation(leaf, dev_item, 2579 device->generation); 2580 btrfs_mark_buffer_dirty(leaf); 2581 } 2582 2583 path->slots[0]++; 2584 goto next_slot; 2585 } 2586 ret = 0; 2587 error: 2588 btrfs_free_path(path); 2589 return ret; 2590 } 2591 2592 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2593 { 2594 struct btrfs_root *root = fs_info->dev_root; 2595 struct btrfs_trans_handle *trans; 2596 struct btrfs_device *device; 2597 struct block_device *bdev; 2598 struct 
super_block *sb = fs_info->sb; 2599 struct rcu_string *name; 2600 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2601 struct btrfs_fs_devices *seed_devices; 2602 u64 orig_super_total_bytes; 2603 u64 orig_super_num_devices; 2604 int ret = 0; 2605 bool seeding_dev = false; 2606 bool locked = false; 2607 2608 if (sb_rdonly(sb) && !fs_devices->seeding) 2609 return -EROFS; 2610 2611 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2612 fs_info->bdev_holder); 2613 if (IS_ERR(bdev)) 2614 return PTR_ERR(bdev); 2615 2616 if (!btrfs_check_device_zone_type(fs_info, bdev)) { 2617 ret = -EINVAL; 2618 goto error; 2619 } 2620 2621 if (fs_devices->seeding) { 2622 seeding_dev = true; 2623 down_write(&sb->s_umount); 2624 mutex_lock(&uuid_mutex); 2625 locked = true; 2626 } 2627 2628 sync_blockdev(bdev); 2629 2630 rcu_read_lock(); 2631 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 2632 if (device->bdev == bdev) { 2633 ret = -EEXIST; 2634 rcu_read_unlock(); 2635 goto error; 2636 } 2637 } 2638 rcu_read_unlock(); 2639 2640 device = btrfs_alloc_device(fs_info, NULL, NULL); 2641 if (IS_ERR(device)) { 2642 /* we can safely leave the fs_devices entry around */ 2643 ret = PTR_ERR(device); 2644 goto error; 2645 } 2646 2647 name = rcu_string_strdup(device_path, GFP_KERNEL); 2648 if (!name) { 2649 ret = -ENOMEM; 2650 goto error_free_device; 2651 } 2652 rcu_assign_pointer(device->name, name); 2653 2654 device->fs_info = fs_info; 2655 device->bdev = bdev; 2656 ret = lookup_bdev(device_path, &device->devt); 2657 if (ret) 2658 goto error_free_device; 2659 2660 ret = btrfs_get_dev_zone_info(device, false); 2661 if (ret) 2662 goto error_free_device; 2663 2664 trans = btrfs_start_transaction(root, 0); 2665 if (IS_ERR(trans)) { 2666 ret = PTR_ERR(trans); 2667 goto error_free_zone; 2668 } 2669 2670 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2671 device->generation = trans->transid; 2672 device->io_width = fs_info->sectorsize; 2673 device->io_align = fs_info->sectorsize; 2674 device->sector_size = fs_info->sectorsize; 2675 device->total_bytes = 2676 round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); 2677 device->disk_total_bytes = device->total_bytes; 2678 device->commit_total_bytes = device->total_bytes; 2679 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2680 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2681 device->mode = FMODE_EXCL; 2682 device->dev_stats_valid = 1; 2683 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2684 2685 if (seeding_dev) { 2686 btrfs_clear_sb_rdonly(sb); 2687 2688 /* GFP_KERNEL allocation must not be under device_list_mutex */ 2689 seed_devices = btrfs_init_sprout(fs_info); 2690 if (IS_ERR(seed_devices)) { 2691 ret = PTR_ERR(seed_devices); 2692 btrfs_abort_transaction(trans, ret); 2693 goto error_trans; 2694 } 2695 } 2696 2697 mutex_lock(&fs_devices->device_list_mutex); 2698 if (seeding_dev) { 2699 btrfs_setup_sprout(fs_info, seed_devices); 2700 btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, 2701 device); 2702 } 2703 2704 device->fs_devices = fs_devices; 2705 2706 mutex_lock(&fs_info->chunk_mutex); 2707 list_add_rcu(&device->dev_list, &fs_devices->devices); 2708 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2709 fs_devices->num_devices++; 2710 fs_devices->open_devices++; 2711 fs_devices->rw_devices++; 2712 fs_devices->total_devices++; 2713 fs_devices->total_rw_bytes += device->total_bytes; 2714 2715 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2716 2717 if 
(!bdev_nonrot(bdev))
		fs_devices->rotating = true;

	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	btrfs_set_super_total_bytes(fs_info->super_copy,
		round_down(orig_super_total_bytes + device->total_bytes,
			   fs_info->sectorsize));

	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices + 1);

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(fs_info);

	mutex_unlock(&fs_info->chunk_mutex);

	/* Add sysfs device entry */
	btrfs_sysfs_add_device(device);

	mutex_unlock(&fs_devices->device_list_mutex);

	if (seeding_dev) {
		mutex_lock(&fs_info->chunk_mutex);
		ret = init_first_rw_device(trans);
		mutex_unlock(&fs_info->chunk_mutex);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}
	}

	ret = btrfs_add_dev_item(trans, device);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto error_sysfs;
	}

	if (seeding_dev) {
		ret = btrfs_finish_sprout(trans);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto error_sysfs;
		}

		/*
		 * fs_devices now represents the newly sprouted filesystem and
		 * its fsid has been changed by btrfs_setup_sprout().
		 */
		btrfs_sysfs_update_sprout_fsid(fs_devices);
	}

	ret = btrfs_commit_transaction(trans);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
		locked = false;

		if (ret) /* transaction commit */
			return ret;

		ret = btrfs_relocate_sys_chunks(fs_info);
		if (ret < 0)
			btrfs_handle_fs_error(fs_info, ret,
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
		}
		ret = btrfs_commit_transaction(trans);
	}

	/*
	 * Now that we have written a new super block to this device, check all
	 * other fs_devices list if device_path alienates any other scanned
	 * device.
	 * We can ignore the return value as it typically returns -EINVAL and
	 * only succeeds if the device was an alien.
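	 * (Presumably an "alien" is a stale record of this device node left
	 * on another fs_devices list by an earlier scan.)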
2803 */ 2804 btrfs_forget_devices(device->devt); 2805 2806 /* Update ctime/mtime for blkid or udev */ 2807 update_dev_time(device_path); 2808 2809 return ret; 2810 2811 error_sysfs: 2812 btrfs_sysfs_remove_device(device); 2813 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2814 mutex_lock(&fs_info->chunk_mutex); 2815 list_del_rcu(&device->dev_list); 2816 list_del(&device->dev_alloc_list); 2817 fs_info->fs_devices->num_devices--; 2818 fs_info->fs_devices->open_devices--; 2819 fs_info->fs_devices->rw_devices--; 2820 fs_info->fs_devices->total_devices--; 2821 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2822 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2823 btrfs_set_super_total_bytes(fs_info->super_copy, 2824 orig_super_total_bytes); 2825 btrfs_set_super_num_devices(fs_info->super_copy, 2826 orig_super_num_devices); 2827 mutex_unlock(&fs_info->chunk_mutex); 2828 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2829 error_trans: 2830 if (seeding_dev) 2831 btrfs_set_sb_rdonly(sb); 2832 if (trans) 2833 btrfs_end_transaction(trans); 2834 error_free_zone: 2835 btrfs_destroy_dev_zone_info(device); 2836 error_free_device: 2837 btrfs_free_device(device); 2838 error: 2839 blkdev_put(bdev, FMODE_EXCL); 2840 if (locked) { 2841 mutex_unlock(&uuid_mutex); 2842 up_write(&sb->s_umount); 2843 } 2844 return ret; 2845 } 2846 2847 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2848 struct btrfs_device *device) 2849 { 2850 int ret; 2851 struct btrfs_path *path; 2852 struct btrfs_root *root = device->fs_info->chunk_root; 2853 struct btrfs_dev_item *dev_item; 2854 struct extent_buffer *leaf; 2855 struct btrfs_key key; 2856 2857 path = btrfs_alloc_path(); 2858 if (!path) 2859 return -ENOMEM; 2860 2861 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2862 key.type = BTRFS_DEV_ITEM_KEY; 2863 key.offset = device->devid; 2864 2865 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2866 if (ret < 0) 2867 goto out; 2868 2869 if (ret > 0) { 2870 ret = -ENOENT; 2871 goto out; 2872 } 2873 2874 leaf = path->nodes[0]; 2875 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2876 2877 btrfs_set_device_id(leaf, dev_item, device->devid); 2878 btrfs_set_device_type(leaf, dev_item, device->type); 2879 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2880 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2881 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2882 btrfs_set_device_total_bytes(leaf, dev_item, 2883 btrfs_device_get_disk_total_bytes(device)); 2884 btrfs_set_device_bytes_used(leaf, dev_item, 2885 btrfs_device_get_bytes_used(device)); 2886 btrfs_mark_buffer_dirty(leaf); 2887 2888 out: 2889 btrfs_free_path(path); 2890 return ret; 2891 } 2892 2893 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2894 struct btrfs_device *device, u64 new_size) 2895 { 2896 struct btrfs_fs_info *fs_info = device->fs_info; 2897 struct btrfs_super_block *super_copy = fs_info->super_copy; 2898 u64 old_total; 2899 u64 diff; 2900 int ret; 2901 2902 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2903 return -EACCES; 2904 2905 new_size = round_down(new_size, fs_info->sectorsize); 2906 2907 mutex_lock(&fs_info->chunk_mutex); 2908 old_total = btrfs_super_total_bytes(super_copy); 2909 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2910 2911 if (new_size <= device->total_bytes || 2912 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2913 
mutex_unlock(&fs_info->chunk_mutex); 2914 return -EINVAL; 2915 } 2916 2917 btrfs_set_super_total_bytes(super_copy, 2918 round_down(old_total + diff, fs_info->sectorsize)); 2919 device->fs_devices->total_rw_bytes += diff; 2920 2921 btrfs_device_set_total_bytes(device, new_size); 2922 btrfs_device_set_disk_total_bytes(device, new_size); 2923 btrfs_clear_space_info_full(device->fs_info); 2924 if (list_empty(&device->post_commit_list)) 2925 list_add_tail(&device->post_commit_list, 2926 &trans->transaction->dev_update_list); 2927 mutex_unlock(&fs_info->chunk_mutex); 2928 2929 btrfs_reserve_chunk_metadata(trans, false); 2930 ret = btrfs_update_device(trans, device); 2931 btrfs_trans_release_chunk_metadata(trans); 2932 2933 return ret; 2934 } 2935 2936 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2937 { 2938 struct btrfs_fs_info *fs_info = trans->fs_info; 2939 struct btrfs_root *root = fs_info->chunk_root; 2940 int ret; 2941 struct btrfs_path *path; 2942 struct btrfs_key key; 2943 2944 path = btrfs_alloc_path(); 2945 if (!path) 2946 return -ENOMEM; 2947 2948 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2949 key.offset = chunk_offset; 2950 key.type = BTRFS_CHUNK_ITEM_KEY; 2951 2952 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2953 if (ret < 0) 2954 goto out; 2955 else if (ret > 0) { /* Logic error or corruption */ 2956 btrfs_handle_fs_error(fs_info, -ENOENT, 2957 "Failed lookup while freeing chunk."); 2958 ret = -ENOENT; 2959 goto out; 2960 } 2961 2962 ret = btrfs_del_item(trans, root, path); 2963 if (ret < 0) 2964 btrfs_handle_fs_error(fs_info, ret, 2965 "Failed to delete chunk item."); 2966 out: 2967 btrfs_free_path(path); 2968 return ret; 2969 } 2970 2971 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2972 { 2973 struct btrfs_super_block *super_copy = fs_info->super_copy; 2974 struct btrfs_disk_key *disk_key; 2975 struct btrfs_chunk *chunk; 2976 u8 *ptr; 2977 int ret = 0; 2978 u32 num_stripes; 2979 u32 array_size; 2980 u32 len = 0; 2981 u32 cur; 2982 struct btrfs_key key; 2983 2984 lockdep_assert_held(&fs_info->chunk_mutex); 2985 array_size = btrfs_super_sys_array_size(super_copy); 2986 2987 ptr = super_copy->sys_chunk_array; 2988 cur = 0; 2989 2990 while (cur < array_size) { 2991 disk_key = (struct btrfs_disk_key *)ptr; 2992 btrfs_disk_key_to_cpu(&key, disk_key); 2993 2994 len = sizeof(*disk_key); 2995 2996 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2997 chunk = (struct btrfs_chunk *)(ptr + len); 2998 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2999 len += btrfs_chunk_item_size(num_stripes); 3000 } else { 3001 ret = -EIO; 3002 break; 3003 } 3004 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 3005 key.offset == chunk_offset) { 3006 memmove(ptr, ptr + len, array_size - (cur + len)); 3007 array_size -= len; 3008 btrfs_set_super_sys_array_size(super_copy, array_size); 3009 } else { 3010 ptr += len; 3011 cur += len; 3012 } 3013 } 3014 return ret; 3015 } 3016 3017 /* 3018 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. 3019 * @logical: Logical block offset in bytes. 3020 * @length: Length of extent in bytes. 3021 * 3022 * Return: Chunk mapping or ERR_PTR. 
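 *
 * Typical use (sketch, as in btrfs_remove_chunk() below):
 *
 *	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	map = em->map_lookup;
 *	...
 *	free_extent_map(em);	// drop the ref taken by the lookup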
3023 */ 3024 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 3025 u64 logical, u64 length) 3026 { 3027 struct extent_map_tree *em_tree; 3028 struct extent_map *em; 3029 3030 em_tree = &fs_info->mapping_tree; 3031 read_lock(&em_tree->lock); 3032 em = lookup_extent_mapping(em_tree, logical, length); 3033 read_unlock(&em_tree->lock); 3034 3035 if (!em) { 3036 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 3037 logical, length); 3038 return ERR_PTR(-EINVAL); 3039 } 3040 3041 if (em->start > logical || em->start + em->len < logical) { 3042 btrfs_crit(fs_info, 3043 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 3044 logical, length, em->start, em->start + em->len); 3045 free_extent_map(em); 3046 return ERR_PTR(-EINVAL); 3047 } 3048 3049 /* callers are responsible for dropping em's ref. */ 3050 return em; 3051 } 3052 3053 static int remove_chunk_item(struct btrfs_trans_handle *trans, 3054 struct map_lookup *map, u64 chunk_offset) 3055 { 3056 int i; 3057 3058 /* 3059 * Removing chunk items and updating the device items in the chunks btree 3060 * requires holding the chunk_mutex. 3061 * See the comment at btrfs_chunk_alloc() for the details. 3062 */ 3063 lockdep_assert_held(&trans->fs_info->chunk_mutex); 3064 3065 for (i = 0; i < map->num_stripes; i++) { 3066 int ret; 3067 3068 ret = btrfs_update_device(trans, map->stripes[i].dev); 3069 if (ret) 3070 return ret; 3071 } 3072 3073 return btrfs_free_chunk(trans, chunk_offset); 3074 } 3075 3076 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 3077 { 3078 struct btrfs_fs_info *fs_info = trans->fs_info; 3079 struct extent_map *em; 3080 struct map_lookup *map; 3081 u64 dev_extent_len = 0; 3082 int i, ret = 0; 3083 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 3084 3085 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 3086 if (IS_ERR(em)) { 3087 /* 3088 * This is a logic error, but we don't want to just rely on the 3089 * user having built with ASSERT enabled, so if ASSERT doesn't 3090 * do anything we still error out. 3091 */ 3092 ASSERT(0); 3093 return PTR_ERR(em); 3094 } 3095 map = em->map_lookup; 3096 3097 /* 3098 * First delete the device extent items from the devices btree. 3099 * We take the device_list_mutex to avoid racing with the finishing phase 3100 * of a device replace operation. See the comment below before acquiring 3101 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex 3102 * because that can result in a deadlock when deleting the device extent 3103 * items from the devices btree - COWing an extent buffer from the btree 3104 * may result in allocating a new metadata chunk, which would attempt to 3105 * lock again fs_info->chunk_mutex. 
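	 *
	 * I.e. the recursion we avoid would look like this (sketch):
	 *
	 *	mutex_lock(&fs_info->chunk_mutex);
	 *	btrfs_free_dev_extent()
	 *	  -> COW a devices btree extent buffer
	 *	     -> allocate a new metadata chunk
	 *	        -> mutex_lock(&fs_info->chunk_mutex)	// deadlock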
	 */
	mutex_lock(&fs_devices->device_list_mutex);
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
		if (ret) {
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		if (device->bytes_used > 0) {
			mutex_lock(&fs_info->chunk_mutex);
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
			btrfs_clear_space_info_full(fs_info);
			mutex_unlock(&fs_info->chunk_mutex);
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/*
	 * We acquire fs_info->chunk_mutex for 2 reasons:
	 *
	 * 1) Just like with the first phase of the chunk allocation, we must
	 * reserve system space, do all chunk btree updates and deletions, and
	 * update the system chunk array in the superblock while holding this
	 * mutex. This is for similar reasons as explained on the comment at
	 * the top of btrfs_chunk_alloc();
	 *
	 * 2) Prevent races with the final phase of a device replace operation
	 * that replaces the device object associated with the map's stripes,
	 * because the device object's id can change at any time during that
	 * final phase of the device replace operation
	 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
	 * replaced device and then see it with an ID of
	 * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
	 * the device item, which does not exist on the chunk btree.
	 * The finishing phase of device replace acquires both the
	 * device_list_mutex and the chunk_mutex, in that order, so we are
	 * safe by just acquiring the chunk_mutex.
	 */
	trans->removing_chunk = true;
	mutex_lock(&fs_info->chunk_mutex);

	check_system_chunk(trans, map->type);

	ret = remove_chunk_item(trans, map, chunk_offset);
	/*
	 * Normally we should not get -ENOSPC since we reserved space before
	 * through the call to check_system_chunk().
	 *
	 * Despite our system space_info having enough free space, we may not
	 * be able to allocate extents from its block groups, because all have
	 * an incompatible profile, which will force us to allocate a new system
	 * block group with the right profile, or right after we called
	 * check_system_chunk() above, a scrub turned the only system block group
	 * with enough free space into RO mode.
	 * This is explained with more detail at do_chunk_alloc().
	 *
	 * So if we get -ENOSPC, allocate a new system chunk and retry once.
3170 */ 3171 if (ret == -ENOSPC) { 3172 const u64 sys_flags = btrfs_system_alloc_profile(fs_info); 3173 struct btrfs_block_group *sys_bg; 3174 3175 sys_bg = btrfs_create_chunk(trans, sys_flags); 3176 if (IS_ERR(sys_bg)) { 3177 ret = PTR_ERR(sys_bg); 3178 btrfs_abort_transaction(trans, ret); 3179 goto out; 3180 } 3181 3182 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 3183 if (ret) { 3184 btrfs_abort_transaction(trans, ret); 3185 goto out; 3186 } 3187 3188 ret = remove_chunk_item(trans, map, chunk_offset); 3189 if (ret) { 3190 btrfs_abort_transaction(trans, ret); 3191 goto out; 3192 } 3193 } else if (ret) { 3194 btrfs_abort_transaction(trans, ret); 3195 goto out; 3196 } 3197 3198 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 3199 3200 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3201 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 3202 if (ret) { 3203 btrfs_abort_transaction(trans, ret); 3204 goto out; 3205 } 3206 } 3207 3208 mutex_unlock(&fs_info->chunk_mutex); 3209 trans->removing_chunk = false; 3210 3211 /* 3212 * We are done with chunk btree updates and deletions, so release the 3213 * system space we previously reserved (with check_system_chunk()). 3214 */ 3215 btrfs_trans_release_chunk_metadata(trans); 3216 3217 ret = btrfs_remove_block_group(trans, chunk_offset, em); 3218 if (ret) { 3219 btrfs_abort_transaction(trans, ret); 3220 goto out; 3221 } 3222 3223 out: 3224 if (trans->removing_chunk) { 3225 mutex_unlock(&fs_info->chunk_mutex); 3226 trans->removing_chunk = false; 3227 } 3228 /* once for us */ 3229 free_extent_map(em); 3230 return ret; 3231 } 3232 3233 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3234 { 3235 struct btrfs_root *root = fs_info->chunk_root; 3236 struct btrfs_trans_handle *trans; 3237 struct btrfs_block_group *block_group; 3238 u64 length; 3239 int ret; 3240 3241 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 3242 btrfs_err(fs_info, 3243 "relocate: not supported on extent tree v2 yet"); 3244 return -EINVAL; 3245 } 3246 3247 /* 3248 * Prevent races with automatic removal of unused block groups. 3249 * After we relocate and before we remove the chunk with offset 3250 * chunk_offset, automatic removal of the block group can kick in, 3251 * resulting in a failure when calling btrfs_remove_chunk() below. 3252 * 3253 * Make sure to acquire this mutex before doing a tree search (dev 3254 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 3255 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 3256 * we release the path used to search the chunk/dev tree and before 3257 * the current task acquires this mutex and calls us. 3258 */ 3259 lockdep_assert_held(&fs_info->reclaim_bgs_lock); 3260 3261 /* step one, relocate all the extents inside this chunk */ 3262 btrfs_scrub_pause(fs_info); 3263 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 3264 btrfs_scrub_continue(fs_info); 3265 if (ret) 3266 return ret; 3267 3268 block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 3269 if (!block_group) 3270 return -ENOENT; 3271 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 3272 length = block_group->length; 3273 btrfs_put_block_group(block_group); 3274 3275 /* 3276 * On a zoned file system, discard the whole block group, this will 3277 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If 3278 * resetting the zone fails, don't treat it as a fatal problem from the 3279 * filesystem's point of view. 
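	 * (On zoned devices the reset rewinds the zone's write pointer back
	 * to the start of the zone, making its space sequentially writable
	 * again.)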
3280 */ 3281 if (btrfs_is_zoned(fs_info)) { 3282 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); 3283 if (ret) 3284 btrfs_info(fs_info, 3285 "failed to reset zone %llu after relocation", 3286 chunk_offset); 3287 } 3288 3289 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3290 chunk_offset); 3291 if (IS_ERR(trans)) { 3292 ret = PTR_ERR(trans); 3293 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3294 return ret; 3295 } 3296 3297 /* 3298 * step two, delete the device extents and the 3299 * chunk tree entries 3300 */ 3301 ret = btrfs_remove_chunk(trans, chunk_offset); 3302 btrfs_end_transaction(trans); 3303 return ret; 3304 } 3305 3306 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3307 { 3308 struct btrfs_root *chunk_root = fs_info->chunk_root; 3309 struct btrfs_path *path; 3310 struct extent_buffer *leaf; 3311 struct btrfs_chunk *chunk; 3312 struct btrfs_key key; 3313 struct btrfs_key found_key; 3314 u64 chunk_type; 3315 bool retried = false; 3316 int failed = 0; 3317 int ret; 3318 3319 path = btrfs_alloc_path(); 3320 if (!path) 3321 return -ENOMEM; 3322 3323 again: 3324 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3325 key.offset = (u64)-1; 3326 key.type = BTRFS_CHUNK_ITEM_KEY; 3327 3328 while (1) { 3329 mutex_lock(&fs_info->reclaim_bgs_lock); 3330 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3331 if (ret < 0) { 3332 mutex_unlock(&fs_info->reclaim_bgs_lock); 3333 goto error; 3334 } 3335 BUG_ON(ret == 0); /* Corruption */ 3336 3337 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3338 key.type); 3339 if (ret) 3340 mutex_unlock(&fs_info->reclaim_bgs_lock); 3341 if (ret < 0) 3342 goto error; 3343 if (ret > 0) 3344 break; 3345 3346 leaf = path->nodes[0]; 3347 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3348 3349 chunk = btrfs_item_ptr(leaf, path->slots[0], 3350 struct btrfs_chunk); 3351 chunk_type = btrfs_chunk_type(leaf, chunk); 3352 btrfs_release_path(path); 3353 3354 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3355 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3356 if (ret == -ENOSPC) 3357 failed++; 3358 else 3359 BUG_ON(ret); 3360 } 3361 mutex_unlock(&fs_info->reclaim_bgs_lock); 3362 3363 if (found_key.offset == 0) 3364 break; 3365 key.offset = found_key.offset - 1; 3366 } 3367 ret = 0; 3368 if (failed && !retried) { 3369 failed = 0; 3370 retried = true; 3371 goto again; 3372 } else if (WARN_ON(failed && retried)) { 3373 ret = -ENOSPC; 3374 } 3375 error: 3376 btrfs_free_path(path); 3377 return ret; 3378 } 3379 3380 /* 3381 * return 1 : allocate a data chunk successfully, 3382 * return <0: errors during allocating a data chunk, 3383 * return 0 : no need to allocate a data chunk. 
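 *
 * Caller pattern (sketch, as used by __btrfs_balance() below):
 *
 *	ret = btrfs_may_alloc_data_chunk(fs_info, found_key.offset);
 *	if (ret < 0)
 *		goto error;
 *	else if (ret == 1)
 *		chunk_reserved = 1;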
3384 */ 3385 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3386 u64 chunk_offset) 3387 { 3388 struct btrfs_block_group *cache; 3389 u64 bytes_used; 3390 u64 chunk_type; 3391 3392 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3393 ASSERT(cache); 3394 chunk_type = cache->flags; 3395 btrfs_put_block_group(cache); 3396 3397 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 3398 return 0; 3399 3400 spin_lock(&fs_info->data_sinfo->lock); 3401 bytes_used = fs_info->data_sinfo->bytes_used; 3402 spin_unlock(&fs_info->data_sinfo->lock); 3403 3404 if (!bytes_used) { 3405 struct btrfs_trans_handle *trans; 3406 int ret; 3407 3408 trans = btrfs_join_transaction(fs_info->tree_root); 3409 if (IS_ERR(trans)) 3410 return PTR_ERR(trans); 3411 3412 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 3413 btrfs_end_transaction(trans); 3414 if (ret < 0) 3415 return ret; 3416 return 1; 3417 } 3418 3419 return 0; 3420 } 3421 3422 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3423 struct btrfs_balance_control *bctl) 3424 { 3425 struct btrfs_root *root = fs_info->tree_root; 3426 struct btrfs_trans_handle *trans; 3427 struct btrfs_balance_item *item; 3428 struct btrfs_disk_balance_args disk_bargs; 3429 struct btrfs_path *path; 3430 struct extent_buffer *leaf; 3431 struct btrfs_key key; 3432 int ret, err; 3433 3434 path = btrfs_alloc_path(); 3435 if (!path) 3436 return -ENOMEM; 3437 3438 trans = btrfs_start_transaction(root, 0); 3439 if (IS_ERR(trans)) { 3440 btrfs_free_path(path); 3441 return PTR_ERR(trans); 3442 } 3443 3444 key.objectid = BTRFS_BALANCE_OBJECTID; 3445 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3446 key.offset = 0; 3447 3448 ret = btrfs_insert_empty_item(trans, root, path, &key, 3449 sizeof(*item)); 3450 if (ret) 3451 goto out; 3452 3453 leaf = path->nodes[0]; 3454 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3455 3456 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3457 3458 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3459 btrfs_set_balance_data(leaf, item, &disk_bargs); 3460 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3461 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3462 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3463 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3464 3465 btrfs_set_balance_flags(leaf, item, bctl->flags); 3466 3467 btrfs_mark_buffer_dirty(leaf); 3468 out: 3469 btrfs_free_path(path); 3470 err = btrfs_commit_transaction(trans); 3471 if (err && !ret) 3472 ret = err; 3473 return ret; 3474 } 3475 3476 static int del_balance_item(struct btrfs_fs_info *fs_info) 3477 { 3478 struct btrfs_root *root = fs_info->tree_root; 3479 struct btrfs_trans_handle *trans; 3480 struct btrfs_path *path; 3481 struct btrfs_key key; 3482 int ret, err; 3483 3484 path = btrfs_alloc_path(); 3485 if (!path) 3486 return -ENOMEM; 3487 3488 trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 3489 if (IS_ERR(trans)) { 3490 btrfs_free_path(path); 3491 return PTR_ERR(trans); 3492 } 3493 3494 key.objectid = BTRFS_BALANCE_OBJECTID; 3495 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3496 key.offset = 0; 3497 3498 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3499 if (ret < 0) 3500 goto out; 3501 if (ret > 0) { 3502 ret = -ENOENT; 3503 goto out; 3504 } 3505 3506 ret = btrfs_del_item(trans, root, path); 3507 out: 3508 btrfs_free_path(path); 3509 err = btrfs_commit_transaction(trans); 3510 if (err && !ret) 3511 ret = err; 3512 return ret; 3513 } 3514 3515 /* 3516 * This is a 
heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on usage filter if it is not already used. The idea is
	 * that chunks that we have already balanced should be
	 * reasonably full. Don't do it for chunks that are being
	 * converted - that will keep us from relocating unconverted
	 * (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}

/*
 * Clear the balance status in fs_info and delete the balance item from disk.
 */
static void reset_balance_state(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	int ret;

	BUG_ON(!fs_info->balance_ctl);

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
	ret = del_balance_item(fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret, NULL);
}

/*
 * Balance filters. Return 1 if chunk should be filtered out
 * (should not be balanced).
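 *
 * For example (illustrative): balancing with "-dprofiles=raid1" makes
 * chunk_profiles_filter() return 0 (keep) for raid1 data chunks and 1
 * (filter out) for all other profiles.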
3581 */ 3582 static int chunk_profiles_filter(u64 chunk_type, 3583 struct btrfs_balance_args *bargs) 3584 { 3585 chunk_type = chunk_to_extended(chunk_type) & 3586 BTRFS_EXTENDED_PROFILE_MASK; 3587 3588 if (bargs->profiles & chunk_type) 3589 return 0; 3590 3591 return 1; 3592 } 3593 3594 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3595 struct btrfs_balance_args *bargs) 3596 { 3597 struct btrfs_block_group *cache; 3598 u64 chunk_used; 3599 u64 user_thresh_min; 3600 u64 user_thresh_max; 3601 int ret = 1; 3602 3603 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3604 chunk_used = cache->used; 3605 3606 if (bargs->usage_min == 0) 3607 user_thresh_min = 0; 3608 else 3609 user_thresh_min = div_factor_fine(cache->length, 3610 bargs->usage_min); 3611 3612 if (bargs->usage_max == 0) 3613 user_thresh_max = 1; 3614 else if (bargs->usage_max > 100) 3615 user_thresh_max = cache->length; 3616 else 3617 user_thresh_max = div_factor_fine(cache->length, 3618 bargs->usage_max); 3619 3620 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3621 ret = 0; 3622 3623 btrfs_put_block_group(cache); 3624 return ret; 3625 } 3626 3627 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3628 u64 chunk_offset, struct btrfs_balance_args *bargs) 3629 { 3630 struct btrfs_block_group *cache; 3631 u64 chunk_used, user_thresh; 3632 int ret = 1; 3633 3634 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3635 chunk_used = cache->used; 3636 3637 if (bargs->usage_min == 0) 3638 user_thresh = 1; 3639 else if (bargs->usage > 100) 3640 user_thresh = cache->length; 3641 else 3642 user_thresh = div_factor_fine(cache->length, bargs->usage); 3643 3644 if (chunk_used < user_thresh) 3645 ret = 0; 3646 3647 btrfs_put_block_group(cache); 3648 return ret; 3649 } 3650 3651 static int chunk_devid_filter(struct extent_buffer *leaf, 3652 struct btrfs_chunk *chunk, 3653 struct btrfs_balance_args *bargs) 3654 { 3655 struct btrfs_stripe *stripe; 3656 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3657 int i; 3658 3659 for (i = 0; i < num_stripes; i++) { 3660 stripe = btrfs_stripe_nr(chunk, i); 3661 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3662 return 0; 3663 } 3664 3665 return 1; 3666 } 3667 3668 static u64 calc_data_stripes(u64 type, int num_stripes) 3669 { 3670 const int index = btrfs_bg_flags_to_raid_index(type); 3671 const int ncopies = btrfs_raid_array[index].ncopies; 3672 const int nparity = btrfs_raid_array[index].nparity; 3673 3674 return (num_stripes - nparity) / ncopies; 3675 } 3676 3677 /* [pstart, pend) */ 3678 static int chunk_drange_filter(struct extent_buffer *leaf, 3679 struct btrfs_chunk *chunk, 3680 struct btrfs_balance_args *bargs) 3681 { 3682 struct btrfs_stripe *stripe; 3683 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3684 u64 stripe_offset; 3685 u64 stripe_length; 3686 u64 type; 3687 int factor; 3688 int i; 3689 3690 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3691 return 0; 3692 3693 type = btrfs_chunk_type(leaf, chunk); 3694 factor = calc_data_stripes(type, num_stripes); 3695 3696 for (i = 0; i < num_stripes; i++) { 3697 stripe = btrfs_stripe_nr(chunk, i); 3698 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3699 continue; 3700 3701 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3702 stripe_length = btrfs_chunk_length(leaf, chunk); 3703 stripe_length = div_u64(stripe_length, factor); 3704 3705 if (stripe_offset < bargs->pend && 3706 stripe_offset + stripe_length > bargs->pstart) 3707 return 0; 
3708 } 3709 3710 return 1; 3711 } 3712 3713 /* [vstart, vend) */ 3714 static int chunk_vrange_filter(struct extent_buffer *leaf, 3715 struct btrfs_chunk *chunk, 3716 u64 chunk_offset, 3717 struct btrfs_balance_args *bargs) 3718 { 3719 if (chunk_offset < bargs->vend && 3720 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3721 /* at least part of the chunk is inside this vrange */ 3722 return 0; 3723 3724 return 1; 3725 } 3726 3727 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3728 struct btrfs_chunk *chunk, 3729 struct btrfs_balance_args *bargs) 3730 { 3731 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3732 3733 if (bargs->stripes_min <= num_stripes 3734 && num_stripes <= bargs->stripes_max) 3735 return 0; 3736 3737 return 1; 3738 } 3739 3740 static int chunk_soft_convert_filter(u64 chunk_type, 3741 struct btrfs_balance_args *bargs) 3742 { 3743 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3744 return 0; 3745 3746 chunk_type = chunk_to_extended(chunk_type) & 3747 BTRFS_EXTENDED_PROFILE_MASK; 3748 3749 if (bargs->target == chunk_type) 3750 return 1; 3751 3752 return 0; 3753 } 3754 3755 static int should_balance_chunk(struct extent_buffer *leaf, 3756 struct btrfs_chunk *chunk, u64 chunk_offset) 3757 { 3758 struct btrfs_fs_info *fs_info = leaf->fs_info; 3759 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3760 struct btrfs_balance_args *bargs = NULL; 3761 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3762 3763 /* type filter */ 3764 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3765 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3766 return 0; 3767 } 3768 3769 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3770 bargs = &bctl->data; 3771 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3772 bargs = &bctl->sys; 3773 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3774 bargs = &bctl->meta; 3775 3776 /* profiles filter */ 3777 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3778 chunk_profiles_filter(chunk_type, bargs)) { 3779 return 0; 3780 } 3781 3782 /* usage filter */ 3783 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3784 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3785 return 0; 3786 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3787 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3788 return 0; 3789 } 3790 3791 /* devid filter */ 3792 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3793 chunk_devid_filter(leaf, chunk, bargs)) { 3794 return 0; 3795 } 3796 3797 /* drange filter, makes sense only with devid filter */ 3798 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3799 chunk_drange_filter(leaf, chunk, bargs)) { 3800 return 0; 3801 } 3802 3803 /* vrange filter */ 3804 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3805 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3806 return 0; 3807 } 3808 3809 /* stripes filter */ 3810 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3811 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3812 return 0; 3813 } 3814 3815 /* soft profile changing mode */ 3816 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3817 chunk_soft_convert_filter(chunk_type, bargs)) { 3818 return 0; 3819 } 3820 3821 /* 3822 * limited by count, must be the last filter 3823 */ 3824 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3825 if (bargs->limit == 0) 3826 return 0; 3827 else 3828 bargs->limit--; 3829 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3830 /* 3831 * Same logic as the 'limit' filter; the minimum cannot be 3832 * determined here 
because we do not have the global information
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}

static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	u64 chunk_type;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
	bool counting = true;
	/*
	 * The single value limit and min/max limits use the same bytes in the
	 * balance args (they share a union), so stash the single values here
	 * and restore them after the counting pass.
	 */
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
	int chunk_reserved = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
	if (!counting) {
		/*
		 * The single value limit and min/max limits use the same bytes
		 * in the balance args union, so restore the single values now
		 * that the counting pass is done.
		 */
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto error;
		}

		/*
		 * This shouldn't happen, it means the last relocate
		 * failed.
		 */
		if (ret == 0)
			BUG(); /* FIXME break ?
*/ 3911 3912 ret = btrfs_previous_item(chunk_root, path, 0, 3913 BTRFS_CHUNK_ITEM_KEY); 3914 if (ret) { 3915 mutex_unlock(&fs_info->reclaim_bgs_lock); 3916 ret = 0; 3917 break; 3918 } 3919 3920 leaf = path->nodes[0]; 3921 slot = path->slots[0]; 3922 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3923 3924 if (found_key.objectid != key.objectid) { 3925 mutex_unlock(&fs_info->reclaim_bgs_lock); 3926 break; 3927 } 3928 3929 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3930 chunk_type = btrfs_chunk_type(leaf, chunk); 3931 3932 if (!counting) { 3933 spin_lock(&fs_info->balance_lock); 3934 bctl->stat.considered++; 3935 spin_unlock(&fs_info->balance_lock); 3936 } 3937 3938 ret = should_balance_chunk(leaf, chunk, found_key.offset); 3939 3940 btrfs_release_path(path); 3941 if (!ret) { 3942 mutex_unlock(&fs_info->reclaim_bgs_lock); 3943 goto loop; 3944 } 3945 3946 if (counting) { 3947 mutex_unlock(&fs_info->reclaim_bgs_lock); 3948 spin_lock(&fs_info->balance_lock); 3949 bctl->stat.expected++; 3950 spin_unlock(&fs_info->balance_lock); 3951 3952 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3953 count_data++; 3954 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3955 count_sys++; 3956 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3957 count_meta++; 3958 3959 goto loop; 3960 } 3961 3962 /* 3963 * Apply limit_min filter, no need to check if the LIMITS 3964 * filter is used, limit_min is 0 by default 3965 */ 3966 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3967 count_data < bctl->data.limit_min) 3968 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3969 count_meta < bctl->meta.limit_min) 3970 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3971 count_sys < bctl->sys.limit_min)) { 3972 mutex_unlock(&fs_info->reclaim_bgs_lock); 3973 goto loop; 3974 } 3975 3976 if (!chunk_reserved) { 3977 /* 3978 * We may be relocating the only data chunk we have, 3979 * which could potentially end up losing the data 3980 * raid profile, so let's allocate an empty one in 3981 * advance. 3982 */ 3983 ret = btrfs_may_alloc_data_chunk(fs_info, 3984 found_key.offset); 3985 if (ret < 0) { 3986 mutex_unlock(&fs_info->reclaim_bgs_lock); 3987 goto error; 3988 } else if (ret == 1) { 3989 chunk_reserved = 1; 3990 } 3991 } 3992 3993 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3994 mutex_unlock(&fs_info->reclaim_bgs_lock); 3995 if (ret == -ENOSPC) { 3996 enospc_errors++; 3997 } else if (ret == -ETXTBSY) { 3998 btrfs_info(fs_info, 3999 "skipping relocation of block group %llu due to active swapfile", 4000 found_key.offset); 4001 ret = 0; 4002 } else if (ret) { 4003 goto error; 4004 } else { 4005 spin_lock(&fs_info->balance_lock); 4006 bctl->stat.completed++; 4007 spin_unlock(&fs_info->balance_lock); 4008 } 4009 loop: 4010 if (found_key.offset == 0) 4011 break; 4012 key.offset = found_key.offset - 1; 4013 } 4014 4015 if (counting) { 4016 btrfs_release_path(path); 4017 counting = false; 4018 goto again; 4019 } 4020 error: 4021 btrfs_free_path(path); 4022 if (enospc_errors) { 4023 btrfs_info(fs_info, "%d enospc errors during balance", 4024 enospc_errors); 4025 if (!ret) 4026 ret = -ENOSPC; 4027 } 4028 4029 return ret; 4030 } 4031 4032 /** 4033 * alloc_profile_is_valid - see if a given profile is valid and reduced 4034 * @flags: profile to validate 4035 * @extended: if true @flags is treated as an extended profile 4036 */ 4037 static int alloc_profile_is_valid(u64 flags, int extended) 4038 { 4039 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 4040 BTRFS_BLOCK_GROUP_PROFILE_MASK); 4041 4042 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 4043 4044 /* 1) check that all other bits are zeroed */ 4045 if (flags & ~mask) 4046 return 0; 4047 4048 /* 2) see if profile is reduced */ 4049 if (flags == 0) 4050 return !extended; /* "0" is valid for usual profiles */ 4051 4052 return has_single_bit_set(flags); 4053 } 4054 4055 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 4056 { 4057 /* cancel requested || normal exit path */ 4058 return atomic_read(&fs_info->balance_cancel_req) || 4059 (atomic_read(&fs_info->balance_pause_req) == 0 && 4060 atomic_read(&fs_info->balance_cancel_req) == 0); 4061 } 4062 4063 /* 4064 * Validate target profile against allowed profiles and return true if it's OK. 4065 * Otherwise print the error message and return false. 4066 */ 4067 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 4068 const struct btrfs_balance_args *bargs, 4069 u64 allowed, const char *type) 4070 { 4071 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 4072 return true; 4073 4074 /* Profile is valid and does not have bits outside of the allowed set */ 4075 if (alloc_profile_is_valid(bargs->target, 1) && 4076 (bargs->target & ~allowed) == 0) 4077 return true; 4078 4079 btrfs_err(fs_info, "balance: invalid convert %s profile %s", 4080 type, btrfs_bg_type_to_raid_name(bargs->target)); 4081 return false; 4082 } 4083 4084 /* 4085 * Fill @buf with textual description of balance filter flags @bargs, up to 4086 * @size_buf including the terminating null. The output may be trimmed if it 4087 * does not fit into the provided buffer. 4088 */ 4089 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 4090 u32 size_buf) 4091 { 4092 int ret; 4093 u32 size_bp = size_buf; 4094 char *bp = buf; 4095 u64 flags = bargs->flags; 4096 char tmp_buf[128] = {'\0'}; 4097 4098 if (!flags) 4099 return; 4100 4101 #define CHECK_APPEND_NOARG(a) \ 4102 do { \ 4103 ret = snprintf(bp, size_bp, (a)); \ 4104 if (ret < 0 || ret >= size_bp) \ 4105 goto out_overflow; \ 4106 size_bp -= ret; \ 4107 bp += ret; \ 4108 } while (0) 4109 4110 #define CHECK_APPEND_1ARG(a, v1) \ 4111 do { \ 4112 ret = snprintf(bp, size_bp, (a), (v1)); \ 4113 if (ret < 0 || ret >= size_bp) \ 4114 goto out_overflow; \ 4115 size_bp -= ret; \ 4116 bp += ret; \ 4117 } while (0) 4118 4119 #define CHECK_APPEND_2ARG(a, v1, v2) \ 4120 do { \ 4121 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 4122 if (ret < 0 || ret >= size_bp) \ 4123 goto out_overflow; \ 4124 size_bp -= ret; \ 4125 bp += ret; \ 4126 } while (0) 4127 4128 if (flags & BTRFS_BALANCE_ARGS_CONVERT) 4129 CHECK_APPEND_1ARG("convert=%s,", 4130 btrfs_bg_type_to_raid_name(bargs->target)); 4131 4132 if (flags & BTRFS_BALANCE_ARGS_SOFT) 4133 CHECK_APPEND_NOARG("soft,"); 4134 4135 if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 4136 btrfs_describe_block_groups(bargs->profiles, tmp_buf, 4137 sizeof(tmp_buf)); 4138 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 4139 } 4140 4141 if (flags & BTRFS_BALANCE_ARGS_USAGE) 4142 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 4143 4144 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 4145 CHECK_APPEND_2ARG("usage=%u..%u,", 4146 bargs->usage_min, bargs->usage_max); 4147 4148 if (flags & BTRFS_BALANCE_ARGS_DEVID) 4149 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 4150 4151 if (flags & BTRFS_BALANCE_ARGS_DRANGE) 4152 CHECK_APPEND_2ARG("drange=%llu..%llu,", 4153 bargs->pstart, bargs->pend); 4154 4155 if (flags & BTRFS_BALANCE_ARGS_VRANGE) 4156 
CHECK_APPEND_2ARG("vrange=%llu..%llu,", 4157 bargs->vstart, bargs->vend); 4158 4159 if (flags & BTRFS_BALANCE_ARGS_LIMIT) 4160 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 4161 4162 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 4163 CHECK_APPEND_2ARG("limit=%u..%u,", 4164 bargs->limit_min, bargs->limit_max); 4165 4166 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 4167 CHECK_APPEND_2ARG("stripes=%u..%u,", 4168 bargs->stripes_min, bargs->stripes_max); 4169 4170 #undef CHECK_APPEND_2ARG 4171 #undef CHECK_APPEND_1ARG 4172 #undef CHECK_APPEND_NOARG 4173 4174 out_overflow: 4175 4176 if (size_bp < size_buf) 4177 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 4178 else 4179 buf[0] = '\0'; 4180 } 4181 4182 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 4183 { 4184 u32 size_buf = 1024; 4185 char tmp_buf[192] = {'\0'}; 4186 char *buf; 4187 char *bp; 4188 u32 size_bp = size_buf; 4189 int ret; 4190 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4191 4192 buf = kzalloc(size_buf, GFP_KERNEL); 4193 if (!buf) 4194 return; 4195 4196 bp = buf; 4197 4198 #define CHECK_APPEND_1ARG(a, v1) \ 4199 do { \ 4200 ret = snprintf(bp, size_bp, (a), (v1)); \ 4201 if (ret < 0 || ret >= size_bp) \ 4202 goto out_overflow; \ 4203 size_bp -= ret; \ 4204 bp += ret; \ 4205 } while (0) 4206 4207 if (bctl->flags & BTRFS_BALANCE_FORCE) 4208 CHECK_APPEND_1ARG("%s", "-f "); 4209 4210 if (bctl->flags & BTRFS_BALANCE_DATA) { 4211 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 4212 CHECK_APPEND_1ARG("-d%s ", tmp_buf); 4213 } 4214 4215 if (bctl->flags & BTRFS_BALANCE_METADATA) { 4216 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); 4217 CHECK_APPEND_1ARG("-m%s ", tmp_buf); 4218 } 4219 4220 if (bctl->flags & BTRFS_BALANCE_SYSTEM) { 4221 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); 4222 CHECK_APPEND_1ARG("-s%s ", tmp_buf); 4223 } 4224 4225 #undef CHECK_APPEND_1ARG 4226 4227 out_overflow: 4228 4229 if (size_bp < size_buf) 4230 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ 4231 btrfs_info(fs_info, "balance: %s %s", 4232 (bctl->flags & BTRFS_BALANCE_RESUME) ? 4233 "resume" : "start", buf); 4234 4235 kfree(buf); 4236 } 4237 4238 /* 4239 * Should be called with balance mutexe held 4240 */ 4241 int btrfs_balance(struct btrfs_fs_info *fs_info, 4242 struct btrfs_balance_control *bctl, 4243 struct btrfs_ioctl_balance_args *bargs) 4244 { 4245 u64 meta_target, data_target; 4246 u64 allowed; 4247 int mixed = 0; 4248 int ret; 4249 u64 num_devices; 4250 unsigned seq; 4251 bool reducing_redundancy; 4252 int i; 4253 4254 if (btrfs_fs_closing(fs_info) || 4255 atomic_read(&fs_info->balance_pause_req) || 4256 btrfs_should_cancel_balance(fs_info)) { 4257 ret = -EINVAL; 4258 goto out; 4259 } 4260 4261 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 4262 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 4263 mixed = 1; 4264 4265 /* 4266 * In case of mixed groups both data and meta should be picked, 4267 * and identical options should be given for both of them. 
4268 */ 4269 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 4270 if (mixed && (bctl->flags & allowed)) { 4271 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 4272 !(bctl->flags & BTRFS_BALANCE_METADATA) || 4273 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 4274 btrfs_err(fs_info, 4275 "balance: mixed groups data and metadata options must be the same"); 4276 ret = -EINVAL; 4277 goto out; 4278 } 4279 } 4280 4281 /* 4282 * rw_devices will not change at the moment, device add/delete/replace 4283 * are exclusive 4284 */ 4285 num_devices = fs_info->fs_devices->rw_devices; 4286 4287 /* 4288 * SINGLE profile on-disk has no profile bit, but in-memory we have a 4289 * special bit for it, to make it easier to distinguish. Thus we need 4290 * to set it manually, or balance would refuse the profile. 4291 */ 4292 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 4293 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4294 if (num_devices >= btrfs_raid_array[i].devs_min) 4295 allowed |= btrfs_raid_array[i].bg_flag; 4296 4297 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 4298 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 4299 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 4300 ret = -EINVAL; 4301 goto out; 4302 } 4303 4304 /* 4305 * Allow reducing metadata or system integrity only if force is set for 4306 * profiles with redundancy (copies, parity) 4307 */ 4308 allowed = 0; 4309 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 4310 if (btrfs_raid_array[i].ncopies >= 2 || 4311 btrfs_raid_array[i].tolerated_failures >= 1) 4312 allowed |= btrfs_raid_array[i].bg_flag; 4313 } 4314 do { 4315 seq = read_seqbegin(&fs_info->profiles_lock); 4316 4317 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4318 (fs_info->avail_system_alloc_bits & allowed) && 4319 !(bctl->sys.target & allowed)) || 4320 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4321 (fs_info->avail_metadata_alloc_bits & allowed) && 4322 !(bctl->meta.target & allowed))) 4323 reducing_redundancy = true; 4324 else 4325 reducing_redundancy = false; 4326 4327 /* if we're not converting, the target field is uninitialized */ 4328 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4329 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 4330 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
4331 bctl->data.target : fs_info->avail_data_alloc_bits; 4332 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4333 4334 if (reducing_redundancy) { 4335 if (bctl->flags & BTRFS_BALANCE_FORCE) { 4336 btrfs_info(fs_info, 4337 "balance: force reducing metadata redundancy"); 4338 } else { 4339 btrfs_err(fs_info, 4340 "balance: reduces metadata redundancy, use --force if you want this"); 4341 ret = -EINVAL; 4342 goto out; 4343 } 4344 } 4345 4346 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4347 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4348 btrfs_warn(fs_info, 4349 "balance: metadata profile %s has lower redundancy than data profile %s", 4350 btrfs_bg_type_to_raid_name(meta_target), 4351 btrfs_bg_type_to_raid_name(data_target)); 4352 } 4353 4354 ret = insert_balance_item(fs_info, bctl); 4355 if (ret && ret != -EEXIST) 4356 goto out; 4357 4358 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 4359 BUG_ON(ret == -EEXIST); 4360 BUG_ON(fs_info->balance_ctl); 4361 spin_lock(&fs_info->balance_lock); 4362 fs_info->balance_ctl = bctl; 4363 spin_unlock(&fs_info->balance_lock); 4364 } else { 4365 BUG_ON(ret != -EEXIST); 4366 spin_lock(&fs_info->balance_lock); 4367 update_balance_args(bctl); 4368 spin_unlock(&fs_info->balance_lock); 4369 } 4370 4371 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4372 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4373 describe_balance_start_or_resume(fs_info); 4374 mutex_unlock(&fs_info->balance_mutex); 4375 4376 ret = __btrfs_balance(fs_info); 4377 4378 mutex_lock(&fs_info->balance_mutex); 4379 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { 4380 btrfs_info(fs_info, "balance: paused"); 4381 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); 4382 } 4383 /* 4384 * Balance can be canceled by: 4385 * 4386 * - Regular cancel request 4387 * Then ret == -ECANCELED and balance_cancel_req > 0 4388 * 4389 * - Fatal signal to "btrfs" process 4390 * Either the signal caught by wait_reserve_ticket() and callers 4391 * got -EINTR, or caught by btrfs_should_cancel_balance() and 4392 * got -ECANCELED. 4393 * Either way, in this case balance_cancel_req = 0, and 4394 * ret == -EINTR or ret == -ECANCELED. 4395 * 4396 * So here we only check the return value to catch canceled balance. 
4397 */ 4398 else if (ret == -ECANCELED || ret == -EINTR) 4399 btrfs_info(fs_info, "balance: canceled"); 4400 else 4401 btrfs_info(fs_info, "balance: ended with status: %d", ret); 4402 4403 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4404 4405 if (bargs) { 4406 memset(bargs, 0, sizeof(*bargs)); 4407 btrfs_update_ioctl_balance_args(fs_info, bargs); 4408 } 4409 4410 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4411 balance_need_close(fs_info)) { 4412 reset_balance_state(fs_info); 4413 btrfs_exclop_finish(fs_info); 4414 } 4415 4416 wake_up(&fs_info->balance_wait_q); 4417 4418 return ret; 4419 out: 4420 if (bctl->flags & BTRFS_BALANCE_RESUME) 4421 reset_balance_state(fs_info); 4422 else 4423 kfree(bctl); 4424 btrfs_exclop_finish(fs_info); 4425 4426 return ret; 4427 } 4428 4429 static int balance_kthread(void *data) 4430 { 4431 struct btrfs_fs_info *fs_info = data; 4432 int ret = 0; 4433 4434 sb_start_write(fs_info->sb); 4435 mutex_lock(&fs_info->balance_mutex); 4436 if (fs_info->balance_ctl) 4437 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 4438 mutex_unlock(&fs_info->balance_mutex); 4439 sb_end_write(fs_info->sb); 4440 4441 return ret; 4442 } 4443 4444 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4445 { 4446 struct task_struct *tsk; 4447 4448 mutex_lock(&fs_info->balance_mutex); 4449 if (!fs_info->balance_ctl) { 4450 mutex_unlock(&fs_info->balance_mutex); 4451 return 0; 4452 } 4453 mutex_unlock(&fs_info->balance_mutex); 4454 4455 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4456 btrfs_info(fs_info, "balance: resume skipped"); 4457 return 0; 4458 } 4459 4460 spin_lock(&fs_info->super_lock); 4461 ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); 4462 fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; 4463 spin_unlock(&fs_info->super_lock); 4464 /* 4465 * A ro->rw remount sequence should continue with the paused balance 4466 * regardless of who pauses it, system or the user as of now, so set 4467 * the resume flag. 
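 * With BTRFS_BALANCE_RESUME set, btrfs_balance() takes the update path for the existing balance_ctl and reports the operation as a resume rather than a start.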
4468 */ 4469 spin_lock(&fs_info->balance_lock); 4470 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4471 spin_unlock(&fs_info->balance_lock); 4472 4473 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4474 return PTR_ERR_OR_ZERO(tsk); 4475 } 4476 4477 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4478 { 4479 struct btrfs_balance_control *bctl; 4480 struct btrfs_balance_item *item; 4481 struct btrfs_disk_balance_args disk_bargs; 4482 struct btrfs_path *path; 4483 struct extent_buffer *leaf; 4484 struct btrfs_key key; 4485 int ret; 4486 4487 path = btrfs_alloc_path(); 4488 if (!path) 4489 return -ENOMEM; 4490 4491 key.objectid = BTRFS_BALANCE_OBJECTID; 4492 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4493 key.offset = 0; 4494 4495 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4496 if (ret < 0) 4497 goto out; 4498 if (ret > 0) { /* ret = -ENOENT; */ 4499 ret = 0; 4500 goto out; 4501 } 4502 4503 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4504 if (!bctl) { 4505 ret = -ENOMEM; 4506 goto out; 4507 } 4508 4509 leaf = path->nodes[0]; 4510 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4511 4512 bctl->flags = btrfs_balance_flags(leaf, item); 4513 bctl->flags |= BTRFS_BALANCE_RESUME; 4514 4515 btrfs_balance_data(leaf, item, &disk_bargs); 4516 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4517 btrfs_balance_meta(leaf, item, &disk_bargs); 4518 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4519 btrfs_balance_sys(leaf, item, &disk_bargs); 4520 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4521 4522 /* 4523 * This should never happen, as the paused balance state is recovered 4524 * during mount without any chance of other exclusive ops to collide. 4525 * 4526 * This gives the exclusive op status to balance and keeps in paused 4527 * state until user intervention (cancel or umount). If the ownership 4528 * cannot be assigned, show a message but do not fail. The balance 4529 * is in a paused state and must have fs_info::balance_ctl properly 4530 * set up. 
4531 */ 4532 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) 4533 btrfs_warn(fs_info, 4534 "balance: cannot set exclusive op status, resume manually"); 4535 4536 btrfs_release_path(path); 4537 4538 mutex_lock(&fs_info->balance_mutex); 4539 BUG_ON(fs_info->balance_ctl); 4540 spin_lock(&fs_info->balance_lock); 4541 fs_info->balance_ctl = bctl; 4542 spin_unlock(&fs_info->balance_lock); 4543 mutex_unlock(&fs_info->balance_mutex); 4544 out: 4545 btrfs_free_path(path); 4546 return ret; 4547 } 4548 4549 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4550 { 4551 int ret = 0; 4552 4553 mutex_lock(&fs_info->balance_mutex); 4554 if (!fs_info->balance_ctl) { 4555 mutex_unlock(&fs_info->balance_mutex); 4556 return -ENOTCONN; 4557 } 4558 4559 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4560 atomic_inc(&fs_info->balance_pause_req); 4561 mutex_unlock(&fs_info->balance_mutex); 4562 4563 wait_event(fs_info->balance_wait_q, 4564 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4565 4566 mutex_lock(&fs_info->balance_mutex); 4567 /* we are good with balance_ctl ripped off from under us */ 4568 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4569 atomic_dec(&fs_info->balance_pause_req); 4570 } else { 4571 ret = -ENOTCONN; 4572 } 4573 4574 mutex_unlock(&fs_info->balance_mutex); 4575 return ret; 4576 } 4577 4578 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4579 { 4580 mutex_lock(&fs_info->balance_mutex); 4581 if (!fs_info->balance_ctl) { 4582 mutex_unlock(&fs_info->balance_mutex); 4583 return -ENOTCONN; 4584 } 4585 4586 /* 4587 * A paused balance with the item stored on disk can be resumed at 4588 * mount time if the mount is read-write. Otherwise it's still paused 4589 * and we must not allow cancelling as it deletes the item. 4590 */ 4591 if (sb_rdonly(fs_info->sb)) { 4592 mutex_unlock(&fs_info->balance_mutex); 4593 return -EROFS; 4594 } 4595 4596 atomic_inc(&fs_info->balance_cancel_req); 4597 /* 4598 * if we are running just wait and return, balance item is 4599 * deleted in btrfs_balance in this case 4600 */ 4601 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4602 mutex_unlock(&fs_info->balance_mutex); 4603 wait_event(fs_info->balance_wait_q, 4604 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4605 mutex_lock(&fs_info->balance_mutex); 4606 } else { 4607 mutex_unlock(&fs_info->balance_mutex); 4608 /* 4609 * Lock released to allow other waiters to continue, we'll 4610 * reexamine the status again. 
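 * If balance_ctl is still set after we retake the mutex, the balance was paused rather than running, so tear the state down here ourselves.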
4611 */ 4612 mutex_lock(&fs_info->balance_mutex); 4613 4614 if (fs_info->balance_ctl) { 4615 reset_balance_state(fs_info); 4616 btrfs_exclop_finish(fs_info); 4617 btrfs_info(fs_info, "balance: canceled"); 4618 } 4619 } 4620 4621 BUG_ON(fs_info->balance_ctl || 4622 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4623 atomic_dec(&fs_info->balance_cancel_req); 4624 mutex_unlock(&fs_info->balance_mutex); 4625 return 0; 4626 } 4627 4628 int btrfs_uuid_scan_kthread(void *data) 4629 { 4630 struct btrfs_fs_info *fs_info = data; 4631 struct btrfs_root *root = fs_info->tree_root; 4632 struct btrfs_key key; 4633 struct btrfs_path *path = NULL; 4634 int ret = 0; 4635 struct extent_buffer *eb; 4636 int slot; 4637 struct btrfs_root_item root_item; 4638 u32 item_size; 4639 struct btrfs_trans_handle *trans = NULL; 4640 bool closing = false; 4641 4642 path = btrfs_alloc_path(); 4643 if (!path) { 4644 ret = -ENOMEM; 4645 goto out; 4646 } 4647 4648 key.objectid = 0; 4649 key.type = BTRFS_ROOT_ITEM_KEY; 4650 key.offset = 0; 4651 4652 while (1) { 4653 if (btrfs_fs_closing(fs_info)) { 4654 closing = true; 4655 break; 4656 } 4657 ret = btrfs_search_forward(root, &key, path, 4658 BTRFS_OLDEST_GENERATION); 4659 if (ret) { 4660 if (ret > 0) 4661 ret = 0; 4662 break; 4663 } 4664 4665 if (key.type != BTRFS_ROOT_ITEM_KEY || 4666 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4667 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4668 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4669 goto skip; 4670 4671 eb = path->nodes[0]; 4672 slot = path->slots[0]; 4673 item_size = btrfs_item_size(eb, slot); 4674 if (item_size < sizeof(root_item)) 4675 goto skip; 4676 4677 read_extent_buffer(eb, &root_item, 4678 btrfs_item_ptr_offset(eb, slot), 4679 (int)sizeof(root_item)); 4680 if (btrfs_root_refs(&root_item) == 0) 4681 goto skip; 4682 4683 if (!btrfs_is_empty_uuid(root_item.uuid) || 4684 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4685 if (trans) 4686 goto update_tree; 4687 4688 btrfs_release_path(path); 4689 /* 4690 * 1 - subvol uuid item 4691 * 1 - received_subvol uuid item 4692 */ 4693 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4694 if (IS_ERR(trans)) { 4695 ret = PTR_ERR(trans); 4696 break; 4697 } 4698 continue; 4699 } else { 4700 goto skip; 4701 } 4702 update_tree: 4703 btrfs_release_path(path); 4704 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4705 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4706 BTRFS_UUID_KEY_SUBVOL, 4707 key.objectid); 4708 if (ret < 0) { 4709 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4710 ret); 4711 break; 4712 } 4713 } 4714 4715 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4716 ret = btrfs_uuid_tree_add(trans, 4717 root_item.received_uuid, 4718 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4719 key.objectid); 4720 if (ret < 0) { 4721 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4722 ret); 4723 break; 4724 } 4725 } 4726 4727 skip: 4728 btrfs_release_path(path); 4729 if (trans) { 4730 ret = btrfs_end_transaction(trans); 4731 trans = NULL; 4732 if (ret) 4733 break; 4734 } 4735 4736 if (key.offset < (u64)-1) { 4737 key.offset++; 4738 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4739 key.offset = 0; 4740 key.type = BTRFS_ROOT_ITEM_KEY; 4741 } else if (key.objectid < (u64)-1) { 4742 key.offset = 0; 4743 key.type = BTRFS_ROOT_ITEM_KEY; 4744 key.objectid++; 4745 } else { 4746 break; 4747 } 4748 cond_resched(); 4749 } 4750 4751 out: 4752 btrfs_free_path(path); 4753 if (trans && !IS_ERR(trans)) 4754 btrfs_end_transaction(trans); 4755 if (ret) 4756 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread 
failed %d", ret); 4757 else if (!closing) 4758 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4759 up(&fs_info->uuid_tree_rescan_sem); 4760 return 0; 4761 } 4762 4763 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4764 { 4765 struct btrfs_trans_handle *trans; 4766 struct btrfs_root *tree_root = fs_info->tree_root; 4767 struct btrfs_root *uuid_root; 4768 struct task_struct *task; 4769 int ret; 4770 4771 /* 4772 * 1 - root node 4773 * 1 - root item 4774 */ 4775 trans = btrfs_start_transaction(tree_root, 2); 4776 if (IS_ERR(trans)) 4777 return PTR_ERR(trans); 4778 4779 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); 4780 if (IS_ERR(uuid_root)) { 4781 ret = PTR_ERR(uuid_root); 4782 btrfs_abort_transaction(trans, ret); 4783 btrfs_end_transaction(trans); 4784 return ret; 4785 } 4786 4787 fs_info->uuid_root = uuid_root; 4788 4789 ret = btrfs_commit_transaction(trans); 4790 if (ret) 4791 return ret; 4792 4793 down(&fs_info->uuid_tree_rescan_sem); 4794 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4795 if (IS_ERR(task)) { 4796 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4797 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4798 up(&fs_info->uuid_tree_rescan_sem); 4799 return PTR_ERR(task); 4800 } 4801 4802 return 0; 4803 } 4804 4805 /* 4806 * shrinking a device means finding all of the device extents past 4807 * the new size, and then following the back refs to the chunks. 4808 * The chunk relocation code actually frees the device extent 4809 */ 4810 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4811 { 4812 struct btrfs_fs_info *fs_info = device->fs_info; 4813 struct btrfs_root *root = fs_info->dev_root; 4814 struct btrfs_trans_handle *trans; 4815 struct btrfs_dev_extent *dev_extent = NULL; 4816 struct btrfs_path *path; 4817 u64 length; 4818 u64 chunk_offset; 4819 int ret; 4820 int slot; 4821 int failed = 0; 4822 bool retried = false; 4823 struct extent_buffer *l; 4824 struct btrfs_key key; 4825 struct btrfs_super_block *super_copy = fs_info->super_copy; 4826 u64 old_total = btrfs_super_total_bytes(super_copy); 4827 u64 old_size = btrfs_device_get_total_bytes(device); 4828 u64 diff; 4829 u64 start; 4830 4831 new_size = round_down(new_size, fs_info->sectorsize); 4832 start = new_size; 4833 diff = round_down(old_size - new_size, fs_info->sectorsize); 4834 4835 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4836 return -EINVAL; 4837 4838 path = btrfs_alloc_path(); 4839 if (!path) 4840 return -ENOMEM; 4841 4842 path->reada = READA_BACK; 4843 4844 trans = btrfs_start_transaction(root, 0); 4845 if (IS_ERR(trans)) { 4846 btrfs_free_path(path); 4847 return PTR_ERR(trans); 4848 } 4849 4850 mutex_lock(&fs_info->chunk_mutex); 4851 4852 btrfs_device_set_total_bytes(device, new_size); 4853 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4854 device->fs_devices->total_rw_bytes -= diff; 4855 atomic64_sub(diff, &fs_info->free_chunk_space); 4856 } 4857 4858 /* 4859 * Once the device's size has been set to the new size, ensure all 4860 * in-memory chunks are synced to disk so that the loop below sees them 4861 * and relocates them accordingly. 
4862 */ 4863 if (contains_pending_extent(device, &start, diff)) { 4864 mutex_unlock(&fs_info->chunk_mutex); 4865 ret = btrfs_commit_transaction(trans); 4866 if (ret) 4867 goto done; 4868 } else { 4869 mutex_unlock(&fs_info->chunk_mutex); 4870 btrfs_end_transaction(trans); 4871 } 4872 4873 again: 4874 key.objectid = device->devid; 4875 key.offset = (u64)-1; 4876 key.type = BTRFS_DEV_EXTENT_KEY; 4877 4878 do { 4879 mutex_lock(&fs_info->reclaim_bgs_lock); 4880 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4881 if (ret < 0) { 4882 mutex_unlock(&fs_info->reclaim_bgs_lock); 4883 goto done; 4884 } 4885 4886 ret = btrfs_previous_item(root, path, 0, key.type); 4887 if (ret) { 4888 mutex_unlock(&fs_info->reclaim_bgs_lock); 4889 if (ret < 0) 4890 goto done; 4891 ret = 0; 4892 btrfs_release_path(path); 4893 break; 4894 } 4895 4896 l = path->nodes[0]; 4897 slot = path->slots[0]; 4898 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4899 4900 if (key.objectid != device->devid) { 4901 mutex_unlock(&fs_info->reclaim_bgs_lock); 4902 btrfs_release_path(path); 4903 break; 4904 } 4905 4906 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4907 length = btrfs_dev_extent_length(l, dev_extent); 4908 4909 if (key.offset + length <= new_size) { 4910 mutex_unlock(&fs_info->reclaim_bgs_lock); 4911 btrfs_release_path(path); 4912 break; 4913 } 4914 4915 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4916 btrfs_release_path(path); 4917 4918 /* 4919 * We may be relocating the only data chunk we have, 4920 * which could potentially end up losing the data 4921 * raid profile, so let's allocate an empty one in 4922 * advance. 4923 */ 4924 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4925 if (ret < 0) { 4926 mutex_unlock(&fs_info->reclaim_bgs_lock); 4927 goto done; 4928 } 4929 4930 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4931 mutex_unlock(&fs_info->reclaim_bgs_lock); 4932 if (ret == -ENOSPC) { 4933 failed++; 4934 } else if (ret) { 4935 if (ret == -ETXTBSY) { 4936 btrfs_warn(fs_info, 4937 "could not shrink block group %llu due to active swapfile", 4938 chunk_offset); 4939 } 4940 goto done; 4941 } 4942 } while (key.offset-- > 0); 4943 4944 if (failed && !retried) { 4945 failed = 0; 4946 retried = true; 4947 goto again; 4948 } else if (failed && retried) { 4949 ret = -ENOSPC; 4950 goto done; 4951 } 4952 4953 /* Shrinking succeeded, else we would be at "done". */ 4954 trans = btrfs_start_transaction(root, 0); 4955 if (IS_ERR(trans)) { 4956 ret = PTR_ERR(trans); 4957 goto done; 4958 } 4959 4960 mutex_lock(&fs_info->chunk_mutex); 4961 /* Clear all state bits beyond the shrunk device size */ 4962 clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 4963 CHUNK_STATE_MASK); 4964 4965 btrfs_device_set_disk_total_bytes(device, new_size); 4966 if (list_empty(&device->post_commit_list)) 4967 list_add_tail(&device->post_commit_list, 4968 &trans->transaction->dev_update_list); 4969 4970 WARN_ON(diff > old_total); 4971 btrfs_set_super_total_bytes(super_copy, 4972 round_down(old_total - diff, fs_info->sectorsize)); 4973 mutex_unlock(&fs_info->chunk_mutex); 4974 4975 btrfs_reserve_chunk_metadata(trans, false); 4976 /* Now btrfs_update_device() will change the on-disk size. 
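 * The in-memory totals were already updated above under chunk_mutex; this persists the shrunk size in the device item.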
*/ 4977 ret = btrfs_update_device(trans, device); 4978 btrfs_trans_release_chunk_metadata(trans); 4979 if (ret < 0) { 4980 btrfs_abort_transaction(trans, ret); 4981 btrfs_end_transaction(trans); 4982 } else { 4983 ret = btrfs_commit_transaction(trans); 4984 } 4985 done: 4986 btrfs_free_path(path); 4987 if (ret) { 4988 mutex_lock(&fs_info->chunk_mutex); 4989 btrfs_device_set_total_bytes(device, old_size); 4990 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 4991 device->fs_devices->total_rw_bytes += diff; 4992 atomic64_add(diff, &fs_info->free_chunk_space); 4993 mutex_unlock(&fs_info->chunk_mutex); 4994 } 4995 return ret; 4996 } 4997 4998 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4999 struct btrfs_key *key, 5000 struct btrfs_chunk *chunk, int item_size) 5001 { 5002 struct btrfs_super_block *super_copy = fs_info->super_copy; 5003 struct btrfs_disk_key disk_key; 5004 u32 array_size; 5005 u8 *ptr; 5006 5007 lockdep_assert_held(&fs_info->chunk_mutex); 5008 5009 array_size = btrfs_super_sys_array_size(super_copy); 5010 if (array_size + item_size + sizeof(disk_key) 5011 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 5012 return -EFBIG; 5013 5014 ptr = super_copy->sys_chunk_array + array_size; 5015 btrfs_cpu_key_to_disk(&disk_key, key); 5016 memcpy(ptr, &disk_key, sizeof(disk_key)); 5017 ptr += sizeof(disk_key); 5018 memcpy(ptr, chunk, item_size); 5019 item_size += sizeof(disk_key); 5020 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 5021 5022 return 0; 5023 } 5024 5025 /* 5026 * sort the devices in descending order by max_avail, total_avail 5027 */ 5028 static int btrfs_cmp_device_info(const void *a, const void *b) 5029 { 5030 const struct btrfs_device_info *di_a = a; 5031 const struct btrfs_device_info *di_b = b; 5032 5033 if (di_a->max_avail > di_b->max_avail) 5034 return -1; 5035 if (di_a->max_avail < di_b->max_avail) 5036 return 1; 5037 if (di_a->total_avail > di_b->total_avail) 5038 return -1; 5039 if (di_a->total_avail < di_b->total_avail) 5040 return 1; 5041 return 0; 5042 } 5043 5044 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 5045 { 5046 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 5047 return; 5048 5049 btrfs_set_fs_incompat(info, RAID56); 5050 } 5051 5052 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 5053 { 5054 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 5055 return; 5056 5057 btrfs_set_fs_incompat(info, RAID1C34); 5058 } 5059 5060 /* 5061 * Structure used internally for btrfs_create_chunk() function. 5062 * Wraps needed parameters. 
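 * It is filled in by init_alloc_chunk_ctl(), refined by gather_device_info() and decide_stripe_size(), and finally consumed by create_chunk().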
5063 */ 5064 struct alloc_chunk_ctl { 5065 u64 start; 5066 u64 type; 5067 /* Total number of stripes to allocate */ 5068 int num_stripes; 5069 /* sub_stripes info for map */ 5070 int sub_stripes; 5071 /* Stripes per device */ 5072 int dev_stripes; 5073 /* Maximum number of devices to use */ 5074 int devs_max; 5075 /* Minimum number of devices to use */ 5076 int devs_min; 5077 /* ndevs has to be a multiple of this */ 5078 int devs_increment; 5079 /* Number of copies */ 5080 int ncopies; 5081 /* Number of stripes worth of bytes to store parity information */ 5082 int nparity; 5083 u64 max_stripe_size; 5084 u64 max_chunk_size; 5085 u64 dev_extent_min; 5086 u64 stripe_size; 5087 u64 chunk_size; 5088 int ndevs; 5089 }; 5090 5091 static void init_alloc_chunk_ctl_policy_regular( 5092 struct btrfs_fs_devices *fs_devices, 5093 struct alloc_chunk_ctl *ctl) 5094 { 5095 struct btrfs_space_info *space_info; 5096 5097 space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); 5098 ASSERT(space_info); 5099 5100 ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); 5101 ctl->max_stripe_size = ctl->max_chunk_size; 5102 5103 if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) 5104 ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); 5105 5106 /* We don't want a chunk larger than 10% of writable space */ 5107 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 5108 ctl->max_chunk_size); 5109 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; 5110 } 5111 5112 static void init_alloc_chunk_ctl_policy_zoned( 5113 struct btrfs_fs_devices *fs_devices, 5114 struct alloc_chunk_ctl *ctl) 5115 { 5116 u64 zone_size = fs_devices->fs_info->zone_size; 5117 u64 limit; 5118 int min_num_stripes = ctl->devs_min * ctl->dev_stripes; 5119 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; 5120 u64 min_chunk_size = min_data_stripes * zone_size; 5121 u64 type = ctl->type; 5122 5123 ctl->max_stripe_size = zone_size; 5124 if (type & BTRFS_BLOCK_GROUP_DATA) { 5125 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, 5126 zone_size); 5127 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5128 ctl->max_chunk_size = ctl->max_stripe_size; 5129 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5130 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5131 ctl->devs_max = min_t(int, ctl->devs_max, 5132 BTRFS_MAX_DEVS_SYS_CHUNK); 5133 } else { 5134 BUG(); 5135 } 5136 5137 /* We don't want a chunk larger than 10% of writable space */ 5138 limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), 5139 zone_size), 5140 min_chunk_size); 5141 ctl->max_chunk_size = min(limit, ctl->max_chunk_size); 5142 ctl->dev_extent_min = zone_size * ctl->dev_stripes; 5143 } 5144 5145 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 5146 struct alloc_chunk_ctl *ctl) 5147 { 5148 int index = btrfs_bg_flags_to_raid_index(ctl->type); 5149 5150 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 5151 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 5152 ctl->devs_max = btrfs_raid_array[index].devs_max; 5153 if (!ctl->devs_max) 5154 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 5155 ctl->devs_min = btrfs_raid_array[index].devs_min; 5156 ctl->devs_increment = btrfs_raid_array[index].devs_increment; 5157 ctl->ncopies = btrfs_raid_array[index].ncopies; 5158 ctl->nparity = btrfs_raid_array[index].nparity; 5159 ctl->ndevs = 0; 5160 5161 switch (fs_devices->chunk_alloc_policy) { 5162 case BTRFS_CHUNK_ALLOC_REGULAR: 5163 init_alloc_chunk_ctl_policy_regular(fs_devices, 
ctl); 5164 break; 5165 case BTRFS_CHUNK_ALLOC_ZONED: 5166 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); 5167 break; 5168 default: 5169 BUG(); 5170 } 5171 } 5172 5173 static int gather_device_info(struct btrfs_fs_devices *fs_devices, 5174 struct alloc_chunk_ctl *ctl, 5175 struct btrfs_device_info *devices_info) 5176 { 5177 struct btrfs_fs_info *info = fs_devices->fs_info; 5178 struct btrfs_device *device; 5179 u64 total_avail; 5180 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 5181 int ret; 5182 int ndevs = 0; 5183 u64 max_avail; 5184 u64 dev_offset; 5185 5186 /* 5187 * in the first pass through the devices list, we gather information 5188 * about the available holes on each device. 5189 */ 5190 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 5191 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5192 WARN(1, KERN_ERR 5193 "BTRFS: read-only device in alloc_list\n"); 5194 continue; 5195 } 5196 5197 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 5198 &device->dev_state) || 5199 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 5200 continue; 5201 5202 if (device->total_bytes > device->bytes_used) 5203 total_avail = device->total_bytes - device->bytes_used; 5204 else 5205 total_avail = 0; 5206 5207 /* If there is no space on this device, skip it. */ 5208 if (total_avail < ctl->dev_extent_min) 5209 continue; 5210 5211 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 5212 &max_avail); 5213 if (ret && ret != -ENOSPC) 5214 return ret; 5215 5216 if (ret == 0) 5217 max_avail = dev_extent_want; 5218 5219 if (max_avail < ctl->dev_extent_min) { 5220 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5221 btrfs_debug(info, 5222 "%s: devid %llu has no free space, have=%llu want=%llu", 5223 __func__, device->devid, max_avail, 5224 ctl->dev_extent_min); 5225 continue; 5226 } 5227 5228 if (ndevs == fs_devices->rw_devices) { 5229 WARN(1, "%s: found more than %llu devices\n", 5230 __func__, fs_devices->rw_devices); 5231 break; 5232 } 5233 devices_info[ndevs].dev_offset = dev_offset; 5234 devices_info[ndevs].max_avail = max_avail; 5235 devices_info[ndevs].total_avail = total_avail; 5236 devices_info[ndevs].dev = device; 5237 ++ndevs; 5238 } 5239 ctl->ndevs = ndevs; 5240 5241 /* 5242 * now sort the devices by hole size / available space 5243 */ 5244 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 5245 btrfs_cmp_device_info, NULL); 5246 5247 return 0; 5248 } 5249 5250 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 5251 struct btrfs_device_info *devices_info) 5252 { 5253 /* Number of stripes that count for block group size */ 5254 int data_stripes; 5255 5256 /* 5257 * The primary goal is to maximize the number of stripes, so use as 5258 * many devices as possible, even if the stripes are not maximum sized. 5259 * 5260 * The DUP profile stores more than one stripe per device, the 5261 * max_avail is the total size so we have to adjust. 5262 */ 5263 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, 5264 ctl->dev_stripes); 5265 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5266 5267 /* This will have to be fixed for RAID1 and RAID10 over more drives */ 5268 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5269 5270 /* 5271 * Use the number of data stripes to figure out how big this chunk is 5272 * really going to be in terms of logical address space, and compare 5273 * that answer with the max chunk size. If it's higher, we try to 5274 * reduce stripe_size. 
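 * Example: a 10G max_chunk_size over 5 data stripes reduces stripe_size to 2G (already 16M aligned), unless it was smaller to begin with.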
5275 */ 5276 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5277 /* 5278 * Reduce stripe_size, round it up to a 16MB boundary again and 5279 * then use it, unless it ends up being even bigger than the 5280 * previous value we had already. 5281 */ 5282 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, 5283 data_stripes), SZ_16M), 5284 ctl->stripe_size); 5285 } 5286 5287 /* Stripe size should not go beyond 1G. */ 5288 ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G); 5289 5290 /* Align to BTRFS_STRIPE_LEN */ 5291 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); 5292 ctl->chunk_size = ctl->stripe_size * data_stripes; 5293 5294 return 0; 5295 } 5296 5297 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, 5298 struct btrfs_device_info *devices_info) 5299 { 5300 u64 zone_size = devices_info[0].dev->zone_info->zone_size; 5301 /* Number of stripes that count for block group size */ 5302 int data_stripes; 5303 5304 /* 5305 * It should hold because: 5306 * dev_extent_min == dev_extent_want == zone_size * dev_stripes 5307 */ 5308 ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); 5309 5310 ctl->stripe_size = zone_size; 5311 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5312 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5313 5314 /* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */ 5315 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5316 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, 5317 ctl->stripe_size) + ctl->nparity, 5318 ctl->dev_stripes); 5319 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5320 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5321 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); 5322 } 5323 5324 ctl->chunk_size = ctl->stripe_size * data_stripes; 5325 5326 return 0; 5327 } 5328 5329 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, 5330 struct alloc_chunk_ctl *ctl, 5331 struct btrfs_device_info *devices_info) 5332 { 5333 struct btrfs_fs_info *info = fs_devices->fs_info; 5334 5335 /* 5336 * Round down to number of usable stripes, devs_increment can be any 5337 * number so we can't use round_down() that requires power of 2, while 5338 * rounddown is safe. 
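 * E.g. RAID10 has devs_increment == 2, so 5 usable devices are rounded down to 4.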
5339 */ 5340 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 5341 5342 if (ctl->ndevs < ctl->devs_min) { 5343 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 5344 btrfs_debug(info, 5345 "%s: not enough devices with free space: have=%d minimum required=%d", 5346 __func__, ctl->ndevs, ctl->devs_min); 5347 } 5348 return -ENOSPC; 5349 } 5350 5351 ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 5352 5353 switch (fs_devices->chunk_alloc_policy) { 5354 case BTRFS_CHUNK_ALLOC_REGULAR: 5355 return decide_stripe_size_regular(ctl, devices_info); 5356 case BTRFS_CHUNK_ALLOC_ZONED: 5357 return decide_stripe_size_zoned(ctl, devices_info); 5358 default: 5359 BUG(); 5360 } 5361 } 5362 5363 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, 5364 struct alloc_chunk_ctl *ctl, 5365 struct btrfs_device_info *devices_info) 5366 { 5367 struct btrfs_fs_info *info = trans->fs_info; 5368 struct map_lookup *map = NULL; 5369 struct extent_map_tree *em_tree; 5370 struct btrfs_block_group *block_group; 5371 struct extent_map *em; 5372 u64 start = ctl->start; 5373 u64 type = ctl->type; 5374 int ret; 5375 int i; 5376 int j; 5377 5378 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 5379 if (!map) 5380 return ERR_PTR(-ENOMEM); 5381 map->num_stripes = ctl->num_stripes; 5382 5383 for (i = 0; i < ctl->ndevs; ++i) { 5384 for (j = 0; j < ctl->dev_stripes; ++j) { 5385 int s = i * ctl->dev_stripes + j; 5386 map->stripes[s].dev = devices_info[i].dev; 5387 map->stripes[s].physical = devices_info[i].dev_offset + 5388 j * ctl->stripe_size; 5389 } 5390 } 5391 map->stripe_len = BTRFS_STRIPE_LEN; 5392 map->io_align = BTRFS_STRIPE_LEN; 5393 map->io_width = BTRFS_STRIPE_LEN; 5394 map->type = type; 5395 map->sub_stripes = ctl->sub_stripes; 5396 5397 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5398 5399 em = alloc_extent_map(); 5400 if (!em) { 5401 kfree(map); 5402 return ERR_PTR(-ENOMEM); 5403 } 5404 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5405 em->map_lookup = map; 5406 em->start = start; 5407 em->len = ctl->chunk_size; 5408 em->block_start = 0; 5409 em->block_len = em->len; 5410 em->orig_block_len = ctl->stripe_size; 5411 5412 em_tree = &info->mapping_tree; 5413 write_lock(&em_tree->lock); 5414 ret = add_extent_mapping(em_tree, em, 0); 5415 if (ret) { 5416 write_unlock(&em_tree->lock); 5417 free_extent_map(em); 5418 return ERR_PTR(ret); 5419 } 5420 write_unlock(&em_tree->lock); 5421 5422 block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); 5423 if (IS_ERR(block_group)) 5424 goto error_del_extent; 5425 5426 for (i = 0; i < map->num_stripes; i++) { 5427 struct btrfs_device *dev = map->stripes[i].dev; 5428 5429 btrfs_device_set_bytes_used(dev, 5430 dev->bytes_used + ctl->stripe_size); 5431 if (list_empty(&dev->post_commit_list)) 5432 list_add_tail(&dev->post_commit_list, 5433 &trans->transaction->dev_update_list); 5434 } 5435 5436 atomic64_sub(ctl->stripe_size * map->num_stripes, 5437 &info->free_chunk_space); 5438 5439 free_extent_map(em); 5440 check_raid56_incompat_flag(info, type); 5441 check_raid1c34_incompat_flag(info, type); 5442 5443 return block_group; 5444 5445 error_del_extent: 5446 write_lock(&em_tree->lock); 5447 remove_extent_mapping(em_tree, em); 5448 write_unlock(&em_tree->lock); 5449 5450 /* One for our allocation */ 5451 free_extent_map(em); 5452 /* One for the tree reference */ 5453 free_extent_map(em); 5454 5455 return block_group; 5456 } 5457 5458 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, 5459 u64 
type) 5460 { 5461 struct btrfs_fs_info *info = trans->fs_info; 5462 struct btrfs_fs_devices *fs_devices = info->fs_devices; 5463 struct btrfs_device_info *devices_info = NULL; 5464 struct alloc_chunk_ctl ctl; 5465 struct btrfs_block_group *block_group; 5466 int ret; 5467 5468 lockdep_assert_held(&info->chunk_mutex); 5469 5470 if (!alloc_profile_is_valid(type, 0)) { 5471 ASSERT(0); 5472 return ERR_PTR(-EINVAL); 5473 } 5474 5475 if (list_empty(&fs_devices->alloc_list)) { 5476 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5477 btrfs_debug(info, "%s: no writable device", __func__); 5478 return ERR_PTR(-ENOSPC); 5479 } 5480 5481 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 5482 btrfs_err(info, "invalid chunk type 0x%llx requested", type); 5483 ASSERT(0); 5484 return ERR_PTR(-EINVAL); 5485 } 5486 5487 ctl.start = find_next_chunk(info); 5488 ctl.type = type; 5489 init_alloc_chunk_ctl(fs_devices, &ctl); 5490 5491 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 5492 GFP_NOFS); 5493 if (!devices_info) 5494 return ERR_PTR(-ENOMEM); 5495 5496 ret = gather_device_info(fs_devices, &ctl, devices_info); 5497 if (ret < 0) { 5498 block_group = ERR_PTR(ret); 5499 goto out; 5500 } 5501 5502 ret = decide_stripe_size(fs_devices, &ctl, devices_info); 5503 if (ret < 0) { 5504 block_group = ERR_PTR(ret); 5505 goto out; 5506 } 5507 5508 block_group = create_chunk(trans, &ctl, devices_info); 5509 5510 out: 5511 kfree(devices_info); 5512 return block_group; 5513 } 5514 5515 /* 5516 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to 5517 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system 5518 * chunks. 5519 * 5520 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 5521 * phases. 5522 */ 5523 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, 5524 struct btrfs_block_group *bg) 5525 { 5526 struct btrfs_fs_info *fs_info = trans->fs_info; 5527 struct btrfs_root *chunk_root = fs_info->chunk_root; 5528 struct btrfs_key key; 5529 struct btrfs_chunk *chunk; 5530 struct btrfs_stripe *stripe; 5531 struct extent_map *em; 5532 struct map_lookup *map; 5533 size_t item_size; 5534 int i; 5535 int ret; 5536 5537 /* 5538 * We take the chunk_mutex for 2 reasons: 5539 * 5540 * 1) Updates and insertions in the chunk btree must be done while holding 5541 * the chunk_mutex, as well as updating the system chunk array in the 5542 * superblock. See the comment on top of btrfs_chunk_alloc() for the 5543 * details; 5544 * 5545 * 2) To prevent races with the final phase of a device replace operation 5546 * that replaces the device object associated with the map's stripes, 5547 * because the device object's id can change at any time during that 5548 * final phase of the device replace operation 5549 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 5550 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 5551 * which would cause a failure when updating the device item, which does 5552 * not exist, or persisting a stripe of the chunk item with such ID. 5553 * Here we can't use the device_list_mutex because our caller already 5554 * has locked the chunk_mutex, and the final phase of device replace 5555 * acquires both mutexes - first the device_list_mutex and then the 5556 * chunk_mutex. Using any of those two mutexes protects us from a 5557 * concurrent device replace. 
5558 */ 5559 lockdep_assert_held(&fs_info->chunk_mutex); 5560 5561 em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); 5562 if (IS_ERR(em)) { 5563 ret = PTR_ERR(em); 5564 btrfs_abort_transaction(trans, ret); 5565 return ret; 5566 } 5567 5568 map = em->map_lookup; 5569 item_size = btrfs_chunk_item_size(map->num_stripes); 5570 5571 chunk = kzalloc(item_size, GFP_NOFS); 5572 if (!chunk) { 5573 ret = -ENOMEM; 5574 btrfs_abort_transaction(trans, ret); 5575 goto out; 5576 } 5577 5578 for (i = 0; i < map->num_stripes; i++) { 5579 struct btrfs_device *device = map->stripes[i].dev; 5580 5581 ret = btrfs_update_device(trans, device); 5582 if (ret) 5583 goto out; 5584 } 5585 5586 stripe = &chunk->stripe; 5587 for (i = 0; i < map->num_stripes; i++) { 5588 struct btrfs_device *device = map->stripes[i].dev; 5589 const u64 dev_offset = map->stripes[i].physical; 5590 5591 btrfs_set_stack_stripe_devid(stripe, device->devid); 5592 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5593 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5594 stripe++; 5595 } 5596 5597 btrfs_set_stack_chunk_length(chunk, bg->length); 5598 btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); 5599 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5600 btrfs_set_stack_chunk_type(chunk, map->type); 5601 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5602 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5603 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5604 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5605 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5606 5607 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5608 key.type = BTRFS_CHUNK_ITEM_KEY; 5609 key.offset = bg->start; 5610 5611 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5612 if (ret) 5613 goto out; 5614 5615 set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags); 5616 5617 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5618 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5619 if (ret) 5620 goto out; 5621 } 5622 5623 out: 5624 kfree(chunk); 5625 free_extent_map(em); 5626 return ret; 5627 } 5628 5629 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 5630 { 5631 struct btrfs_fs_info *fs_info = trans->fs_info; 5632 u64 alloc_profile; 5633 struct btrfs_block_group *meta_bg; 5634 struct btrfs_block_group *sys_bg; 5635 5636 /* 5637 * When adding a new device for sprouting, the seed device is read-only 5638 * so we must first allocate a metadata and a system chunk. But before 5639 * adding the block group items to the extent, device and chunk btrees, 5640 * we must first: 5641 * 5642 * 1) Create both chunks without doing any changes to the btrees, as 5643 * otherwise we would get -ENOSPC since the block groups from the 5644 * seed device are read-only; 5645 * 5646 * 2) Add the device item for the new sprout device - finishing the setup 5647 * of a new block group requires updating the device item in the chunk 5648 * btree, so it must exist when we attempt to do it. The previous step 5649 * ensures this does not fail with -ENOSPC. 5650 * 5651 * After that we can add the block group items to their btrees: 5652 * update existing device item in the chunk btree, add a new block group 5653 * item to the extent btree, add a new chunk item to the chunk btree and 5654 * finally add the new device extent items to the devices btree. 
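 * Note: the block groups created below are queued on the transaction's list of new block groups and their items are inserted at commit time (see btrfs_create_pending_block_groups()).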
5655 */ 5656 5657 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5658 meta_bg = btrfs_create_chunk(trans, alloc_profile); 5659 if (IS_ERR(meta_bg)) 5660 return PTR_ERR(meta_bg); 5661 5662 alloc_profile = btrfs_system_alloc_profile(fs_info); 5663 sys_bg = btrfs_create_chunk(trans, alloc_profile); 5664 if (IS_ERR(sys_bg)) 5665 return PTR_ERR(sys_bg); 5666 5667 return 0; 5668 } 5669 5670 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5671 { 5672 const int index = btrfs_bg_flags_to_raid_index(map->type); 5673 5674 return btrfs_raid_array[index].tolerated_failures; 5675 } 5676 5677 bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5678 { 5679 struct extent_map *em; 5680 struct map_lookup *map; 5681 int miss_ndevs = 0; 5682 int i; 5683 bool ret = true; 5684 5685 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5686 if (IS_ERR(em)) 5687 return false; 5688 5689 map = em->map_lookup; 5690 for (i = 0; i < map->num_stripes; i++) { 5691 if (test_bit(BTRFS_DEV_STATE_MISSING, 5692 &map->stripes[i].dev->dev_state)) { 5693 miss_ndevs++; 5694 continue; 5695 } 5696 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5697 &map->stripes[i].dev->dev_state)) { 5698 ret = false; 5699 goto end; 5700 } 5701 } 5702 5703 /* 5704 * If the number of missing devices is larger than max errors, we can 5705 * not write the data into that chunk successfully. 5706 */ 5707 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5708 ret = false; 5709 end: 5710 free_extent_map(em); 5711 return ret; 5712 } 5713 5714 void btrfs_mapping_tree_free(struct extent_map_tree *tree) 5715 { 5716 struct extent_map *em; 5717 5718 while (1) { 5719 write_lock(&tree->lock); 5720 em = lookup_extent_mapping(tree, 0, (u64)-1); 5721 if (em) 5722 remove_extent_mapping(tree, em); 5723 write_unlock(&tree->lock); 5724 if (!em) 5725 break; 5726 /* once for us */ 5727 free_extent_map(em); 5728 /* once for the tree */ 5729 free_extent_map(em); 5730 } 5731 } 5732 5733 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5734 { 5735 struct extent_map *em; 5736 struct map_lookup *map; 5737 enum btrfs_raid_types index; 5738 int ret = 1; 5739 5740 em = btrfs_get_chunk_map(fs_info, logical, len); 5741 if (IS_ERR(em)) 5742 /* 5743 * We could return errors for these cases, but that could get 5744 * ugly and we'd probably do the same thing which is just not do 5745 * anything else and exit, so return 1 so the callers don't try 5746 * to use other copies. 5747 */ 5748 return 1; 5749 5750 map = em->map_lookup; 5751 index = btrfs_bg_flags_to_raid_index(map->type); 5752 5753 /* Non-RAID56, use their ncopies from btrfs_raid_array. */ 5754 if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 5755 ret = btrfs_raid_array[index].ncopies; 5756 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5757 ret = 2; 5758 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5759 /* 5760 * There could be two corrupted data stripes, we need 5761 * to loop retry in order to rebuild the correct data. 5762 * 5763 * Fail a stripe at a time on every retry except the 5764 * stripe under reconstruction. 
5765 */ 5766 ret = map->num_stripes; 5767 free_extent_map(em); 5768 5769 down_read(&fs_info->dev_replace.rwsem); 5770 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5771 fs_info->dev_replace.tgtdev) 5772 ret++; 5773 up_read(&fs_info->dev_replace.rwsem); 5774 5775 return ret; 5776 } 5777 5778 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5779 u64 logical) 5780 { 5781 struct extent_map *em; 5782 struct map_lookup *map; 5783 unsigned long len = fs_info->sectorsize; 5784 5785 if (!btrfs_fs_incompat(fs_info, RAID56)) 5786 return len; 5787 5788 em = btrfs_get_chunk_map(fs_info, logical, len); 5789 5790 if (!WARN_ON(IS_ERR(em))) { 5791 map = em->map_lookup; 5792 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5793 len = map->stripe_len * nr_data_stripes(map); 5794 free_extent_map(em); 5795 } 5796 return len; 5797 } 5798 5799 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5800 { 5801 struct extent_map *em; 5802 struct map_lookup *map; 5803 int ret = 0; 5804 5805 if (!btrfs_fs_incompat(fs_info, RAID56)) 5806 return 0; 5807 5808 em = btrfs_get_chunk_map(fs_info, logical, len); 5809 5810 if (!WARN_ON(IS_ERR(em))) { 5811 map = em->map_lookup; 5812 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5813 ret = 1; 5814 free_extent_map(em); 5815 } 5816 return ret; 5817 } 5818 5819 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5820 struct map_lookup *map, int first, 5821 int dev_replace_is_ongoing) 5822 { 5823 int i; 5824 int num_stripes; 5825 int preferred_mirror; 5826 int tolerance; 5827 struct btrfs_device *srcdev; 5828 5829 ASSERT((map->type & 5830 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 5831 5832 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5833 num_stripes = map->sub_stripes; 5834 else 5835 num_stripes = map->num_stripes; 5836 5837 switch (fs_info->fs_devices->read_policy) { 5838 default: 5839 /* Shouldn't happen, just warn and use pid instead of failing */ 5840 btrfs_warn_rl(fs_info, 5841 "unknown read_policy type %u, reset to pid", 5842 fs_info->fs_devices->read_policy); 5843 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; 5844 fallthrough; 5845 case BTRFS_READ_POLICY_PID: 5846 preferred_mirror = first + (current->pid % num_stripes); 5847 break; 5848 } 5849 5850 if (dev_replace_is_ongoing && 5851 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5852 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5853 srcdev = fs_info->dev_replace.srcdev; 5854 else 5855 srcdev = NULL; 5856 5857 /* 5858 * try to avoid the drive that is the source drive for a 5859 * dev-replace procedure, only choose it if no other non-missing 5860 * mirror is available 5861 */ 5862 for (tolerance = 0; tolerance < 2; tolerance++) { 5863 if (map->stripes[preferred_mirror].dev->bdev && 5864 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5865 return preferred_mirror; 5866 for (i = first; i < first + num_stripes; i++) { 5867 if (map->stripes[i].dev->bdev && 5868 (tolerance || map->stripes[i].dev != srcdev)) 5869 return i; 5870 } 5871 } 5872 5873 /* we couldn't find one that doesn't fail.
Just return something 5874 * and the io error handling code will clean up eventually 5875 */ 5876 return preferred_mirror; 5877 } 5878 5879 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5880 static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) 5881 { 5882 int i; 5883 int again = 1; 5884 5885 while (again) { 5886 again = 0; 5887 for (i = 0; i < num_stripes - 1; i++) { 5888 /* Swap if parity is on a smaller index */ 5889 if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { 5890 swap(bioc->stripes[i], bioc->stripes[i + 1]); 5891 swap(bioc->raid_map[i], bioc->raid_map[i + 1]); 5892 again = 1; 5893 } 5894 } 5895 } 5896 } 5897 5898 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, 5899 int total_stripes, 5900 int real_stripes) 5901 { 5902 struct btrfs_io_context *bioc = kzalloc( 5903 /* The size of btrfs_io_context */ 5904 sizeof(struct btrfs_io_context) + 5905 /* Plus the variable array for the stripes */ 5906 sizeof(struct btrfs_io_stripe) * (total_stripes) + 5907 /* Plus the variable array for the tgt dev */ 5908 sizeof(int) * (real_stripes) + 5909 /* 5910 * Plus the raid_map, which includes both the tgt dev 5911 * and the stripes. 5912 */ 5913 sizeof(u64) * (total_stripes), 5914 GFP_NOFS|__GFP_NOFAIL); 5915 5916 refcount_set(&bioc->refs, 1); 5917 5918 bioc->fs_info = fs_info; 5919 bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); 5920 bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); 5921 5922 return bioc; 5923 } 5924 5925 void btrfs_get_bioc(struct btrfs_io_context *bioc) 5926 { 5927 WARN_ON(!refcount_read(&bioc->refs)); 5928 refcount_inc(&bioc->refs); 5929 } 5930 5931 void btrfs_put_bioc(struct btrfs_io_context *bioc) 5932 { 5933 if (!bioc) 5934 return; 5935 if (refcount_dec_and_test(&bioc->refs)) 5936 kfree(bioc); 5937 } 5938 5939 /* 5940 * Please note that, discard won't be sent to target device of device 5941 * replace. 
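 *
 * Maps a discard of at most @*length_ret bytes at @logical to per-device
 * ranges: returns an array of btrfs_discard_stripe (its size stored in
 * @num_stripes) describing the physical range to discard on each device,
 * and trims @*length_ret to the part of the range covered by the chunk.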
5942 */ 5943 struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, 5944 u64 logical, u64 *length_ret, 5945 u32 *num_stripes) 5946 { 5947 struct extent_map *em; 5948 struct map_lookup *map; 5949 struct btrfs_discard_stripe *stripes; 5950 u64 length = *length_ret; 5951 u64 offset; 5952 u64 stripe_nr; 5953 u64 stripe_nr_end; 5954 u64 stripe_end_offset; 5955 u64 stripe_cnt; 5956 u64 stripe_len; 5957 u64 stripe_offset; 5958 u32 stripe_index; 5959 u32 factor = 0; 5960 u32 sub_stripes = 0; 5961 u64 stripes_per_dev = 0; 5962 u32 remaining_stripes = 0; 5963 u32 last_stripe = 0; 5964 int ret; 5965 int i; 5966 5967 em = btrfs_get_chunk_map(fs_info, logical, length); 5968 if (IS_ERR(em)) 5969 return ERR_CAST(em); 5970 5971 map = em->map_lookup; 5972 5973 /* we don't discard raid56 yet */ 5974 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5975 ret = -EOPNOTSUPP; 5976 goto out_free_map; 5977 } 5978 5979 offset = logical - em->start; 5980 length = min_t(u64, em->start + em->len - logical, length); 5981 *length_ret = length; 5982 5983 stripe_len = map->stripe_len; 5984 /* 5985 * stripe_nr counts the total number of stripes we have to stride 5986 * to get to this block 5987 */ 5988 stripe_nr = div64_u64(offset, stripe_len); 5989 5990 /* stripe_offset is the offset of this block in its stripe */ 5991 stripe_offset = offset - stripe_nr * stripe_len; 5992 5993 stripe_nr_end = round_up(offset + length, map->stripe_len); 5994 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 5995 stripe_cnt = stripe_nr_end - stripe_nr; 5996 stripe_end_offset = stripe_nr_end * map->stripe_len - 5997 (offset + length); 5998 /* 5999 * after this, stripe_nr is the number of stripes on this 6000 * device we have to walk to find the data, and stripe_index is 6001 * the number of our device in the stripe array 6002 */ 6003 *num_stripes = 1; 6004 stripe_index = 0; 6005 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 6006 BTRFS_BLOCK_GROUP_RAID10)) { 6007 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 6008 sub_stripes = 1; 6009 else 6010 sub_stripes = map->sub_stripes; 6011 6012 factor = map->num_stripes / sub_stripes; 6013 *num_stripes = min_t(u64, map->num_stripes, 6014 sub_stripes * stripe_cnt); 6015 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6016 stripe_index *= sub_stripes; 6017 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 6018 &remaining_stripes); 6019 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 6020 last_stripe *= sub_stripes; 6021 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 6022 BTRFS_BLOCK_GROUP_DUP)) { 6023 *num_stripes = map->num_stripes; 6024 } else { 6025 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6026 &stripe_index); 6027 } 6028 6029 stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); 6030 if (!stripes) { 6031 ret = -ENOMEM; 6032 goto out_free_map; 6033 } 6034 6035 for (i = 0; i < *num_stripes; i++) { 6036 stripes[i].physical = 6037 map->stripes[stripe_index].physical + 6038 stripe_offset + stripe_nr * map->stripe_len; 6039 stripes[i].dev = map->stripes[stripe_index].dev; 6040 6041 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 6042 BTRFS_BLOCK_GROUP_RAID10)) { 6043 stripes[i].length = stripes_per_dev * map->stripe_len; 6044 6045 if (i / sub_stripes < remaining_stripes) 6046 stripes[i].length += map->stripe_len; 6047 6048 /* 6049 * Special for the first stripe and 6050 * the last stripe: 6051 * 6052 * |-------|...|-------| 6053 * |----------| 6054 * off end_off 6055 */ 6056 if (i < sub_stripes) 6057 stripes[i].length -= stripe_offset; 6058 
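/*
 * Likewise trim the stripes that cover the last stripe of the range, so
 * the discard does not run past the requested end (end_off above).
 */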
6059 if (stripe_index >= last_stripe && 6060 stripe_index <= (last_stripe + 6061 sub_stripes - 1)) 6062 stripes[i].length -= stripe_end_offset; 6063 6064 if (i == sub_stripes - 1) 6065 stripe_offset = 0; 6066 } else { 6067 stripes[i].length = length; 6068 } 6069 6070 stripe_index++; 6071 if (stripe_index == map->num_stripes) { 6072 stripe_index = 0; 6073 stripe_nr++; 6074 } 6075 } 6076 6077 free_extent_map(em); 6078 return stripes; 6079 out_free_map: 6080 free_extent_map(em); 6081 return ERR_PTR(ret); 6082 } 6083 6084 /* 6085 * In dev-replace case, for repair case (that's the only case where the mirror 6086 * is selected explicitly when calling btrfs_map_block), blocks left of the 6087 * left cursor can also be read from the target drive. 6088 * 6089 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 6090 * array of stripes. 6091 * For READ, it also needs to be supported using the same mirror number. 6092 * 6093 * If the requested block is not left of the left cursor, EIO is returned. This 6094 * can happen because btrfs_num_copies() returns one more in the dev-replace 6095 * case. 6096 */ 6097 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 6098 u64 logical, u64 length, 6099 u64 srcdev_devid, int *mirror_num, 6100 u64 *physical) 6101 { 6102 struct btrfs_io_context *bioc = NULL; 6103 int num_stripes; 6104 int index_srcdev = 0; 6105 int found = 0; 6106 u64 physical_of_found = 0; 6107 int i; 6108 int ret = 0; 6109 6110 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 6111 logical, &length, &bioc, NULL, NULL, 0); 6112 if (ret) { 6113 ASSERT(bioc == NULL); 6114 return ret; 6115 } 6116 6117 num_stripes = bioc->num_stripes; 6118 if (*mirror_num > num_stripes) { 6119 /* 6120 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 6121 * that means that the requested area is not left of the left 6122 * cursor 6123 */ 6124 btrfs_put_bioc(bioc); 6125 return -EIO; 6126 } 6127 6128 /* 6129 * process the rest of the function using the mirror_num of the source 6130 * drive. Therefore look it up first. At the end, patch the device 6131 * pointer to the one of the target drive. 
6132 */ 6133 for (i = 0; i < num_stripes; i++) { 6134 if (bioc->stripes[i].dev->devid != srcdev_devid) 6135 continue; 6136 6137 /* 6138 * In case of DUP, in order to keep it simple, only add the 6139 * mirror with the lowest physical address 6140 */ 6141 if (found && 6142 physical_of_found <= bioc->stripes[i].physical) 6143 continue; 6144 6145 index_srcdev = i; 6146 found = 1; 6147 physical_of_found = bioc->stripes[i].physical; 6148 } 6149 6150 btrfs_put_bioc(bioc); 6151 6152 ASSERT(found); 6153 if (!found) 6154 return -EIO; 6155 6156 *mirror_num = index_srcdev + 1; 6157 *physical = physical_of_found; 6158 return ret; 6159 } 6160 6161 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) 6162 { 6163 struct btrfs_block_group *cache; 6164 bool ret; 6165 6166 /* Non-zoned filesystems do not use the "to_copy" flag */ 6167 if (!btrfs_is_zoned(fs_info)) 6168 return false; 6169 6170 cache = btrfs_lookup_block_group(fs_info, logical); 6171 6172 ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); 6173 6174 btrfs_put_block_group(cache); 6175 return ret; 6176 } 6177 6178 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 6179 struct btrfs_io_context **bioc_ret, 6180 struct btrfs_dev_replace *dev_replace, 6181 u64 logical, 6182 int *num_stripes_ret, int *max_errors_ret) 6183 { 6184 struct btrfs_io_context *bioc = *bioc_ret; 6185 u64 srcdev_devid = dev_replace->srcdev->devid; 6186 int tgtdev_indexes = 0; 6187 int num_stripes = *num_stripes_ret; 6188 int max_errors = *max_errors_ret; 6189 int i; 6190 6191 if (op == BTRFS_MAP_WRITE) { 6192 int index_where_to_add; 6193 6194 /* 6195 * A block group which has "to_copy" set will eventually be 6196 * copied by the dev-replace process. We can avoid cloning the IO here. 6197 */ 6198 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) 6199 return; 6200 6201 /* 6202 * Duplicate the write operations while the dev replace 6203 * procedure is running. Since the copying of the old disk to 6204 * the new disk takes place at run time while the filesystem is 6205 * mounted writable, the regular write operations to the old 6206 * disk have to be duplicated to go to the new disk as well. 6207 * 6208 * Note that device->missing is handled by the caller, and that 6209 * the write to the old disk is already set up in the stripes 6210 * array. 6211 */ 6212 index_where_to_add = num_stripes; 6213 for (i = 0; i < num_stripes; i++) { 6214 if (bioc->stripes[i].dev->devid == srcdev_devid) { 6215 /* write to new disk, too */ 6216 struct btrfs_io_stripe *new = 6217 bioc->stripes + index_where_to_add; 6218 struct btrfs_io_stripe *old = 6219 bioc->stripes + i; 6220 6221 new->physical = old->physical; 6222 new->dev = dev_replace->tgtdev; 6223 bioc->tgtdev_map[i] = index_where_to_add; 6224 index_where_to_add++; 6225 max_errors++; 6226 tgtdev_indexes++; 6227 } 6228 } 6229 num_stripes = index_where_to_add; 6230 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 6231 int index_srcdev = 0; 6232 int found = 0; 6233 u64 physical_of_found = 0; 6234 6235 /* 6236 * During the dev-replace procedure, the target drive can also 6237 * be used to read data in case it is needed to repair a corrupt 6238 * block elsewhere. This is possible if the requested area is 6239 * left of the left cursor. In this area, the target drive is a 6240 * full copy of the source drive.
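 *
 * The extra stripe for the target drive is appended after the existing
 * ones, which is also why btrfs_num_copies() reports one more copy while
 * a replace is ongoing.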
6241 */ 6242 for (i = 0; i < num_stripes; i++) { 6243 if (bioc->stripes[i].dev->devid == srcdev_devid) { 6244 /* 6245 * In case of DUP, in order to keep it simple, 6246 * only add the mirror with the lowest physical 6247 * address 6248 */ 6249 if (found && 6250 physical_of_found <= bioc->stripes[i].physical) 6251 continue; 6252 index_srcdev = i; 6253 found = 1; 6254 physical_of_found = bioc->stripes[i].physical; 6255 } 6256 } 6257 if (found) { 6258 struct btrfs_io_stripe *tgtdev_stripe = 6259 bioc->stripes + num_stripes; 6260 6261 tgtdev_stripe->physical = physical_of_found; 6262 tgtdev_stripe->dev = dev_replace->tgtdev; 6263 bioc->tgtdev_map[index_srcdev] = num_stripes; 6264 6265 tgtdev_indexes++; 6266 num_stripes++; 6267 } 6268 } 6269 6270 *num_stripes_ret = num_stripes; 6271 *max_errors_ret = max_errors; 6272 bioc->num_tgtdevs = tgtdev_indexes; 6273 *bioc_ret = bioc; 6274 } 6275 6276 static bool need_full_stripe(enum btrfs_map_op op) 6277 { 6278 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 6279 } 6280 6281 /* 6282 * Calculate the geometry of a particular (address, len) tuple. This 6283 * information is used to calculate how big a particular bio can get before it 6284 * straddles a stripe. 6285 * 6286 * @fs_info: the filesystem 6287 * @em: mapping containing the logical extent 6288 * @op: type of operation - write or read 6289 * @logical: address that we want to figure out the geometry of 6290 * @io_geom: pointer used to return values 6291 * 6292 * Returns < 0 in case a chunk for the given logical address cannot be found, 6293 * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 6294 */ 6295 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, 6296 enum btrfs_map_op op, u64 logical, 6297 struct btrfs_io_geometry *io_geom) 6298 { 6299 struct map_lookup *map; 6300 u64 len; 6301 u64 offset; 6302 u64 stripe_offset; 6303 u64 stripe_nr; 6304 u32 stripe_len; 6305 u64 raid56_full_stripe_start = (u64)-1; 6306 int data_stripes; 6307 6308 ASSERT(op != BTRFS_MAP_DISCARD); 6309 6310 map = em->map_lookup; 6311 /* Offset of this logical address in the chunk */ 6312 offset = logical - em->start; 6313 /* Len of a stripe in a chunk */ 6314 stripe_len = map->stripe_len; 6315 /* 6316 * Stripe_nr is where this block falls in 6317 * stripe_offset is the offset of this block in its stripe. 6318 */ 6319 stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); 6320 ASSERT(stripe_offset < U32_MAX); 6321 6322 data_stripes = nr_data_stripes(map); 6323 6324 /* Only stripe based profiles needs to check against stripe length. */ 6325 if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { 6326 u64 max_len = stripe_len - stripe_offset; 6327 6328 /* 6329 * In case of raid56, we need to know the stripe aligned start 6330 */ 6331 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6332 unsigned long full_stripe_len = stripe_len * data_stripes; 6333 raid56_full_stripe_start = offset; 6334 6335 /* 6336 * Allow a write of a full stripe, but make sure we 6337 * don't allow straddling of stripes 6338 */ 6339 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 6340 full_stripe_len); 6341 raid56_full_stripe_start *= full_stripe_len; 6342 6343 /* 6344 * For writes to RAID[56], allow a full stripeset across 6345 * all disks. For other RAID types and for RAID[56] 6346 * reads, just allow a single stripe (on a single disk). 
6347 */ 6348 if (op == BTRFS_MAP_WRITE) { 6349 max_len = stripe_len * data_stripes - 6350 (offset - raid56_full_stripe_start); 6351 } 6352 } 6353 len = min_t(u64, em->len - offset, max_len); 6354 } else { 6355 len = em->len - offset; 6356 } 6357 6358 io_geom->len = len; 6359 io_geom->offset = offset; 6360 io_geom->stripe_len = stripe_len; 6361 io_geom->stripe_nr = stripe_nr; 6362 io_geom->stripe_offset = stripe_offset; 6363 io_geom->raid56_stripe_offset = raid56_full_stripe_start; 6364 6365 return 0; 6366 } 6367 6368 static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, 6369 u32 stripe_index, u64 stripe_offset, u64 stripe_nr) 6370 { 6371 dst->dev = map->stripes[stripe_index].dev; 6372 dst->physical = map->stripes[stripe_index].physical + 6373 stripe_offset + stripe_nr * map->stripe_len; 6374 } 6375 6376 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 6377 enum btrfs_map_op op, u64 logical, u64 *length, 6378 struct btrfs_io_context **bioc_ret, 6379 struct btrfs_io_stripe *smap, 6380 int *mirror_num_ret, int need_raid_map) 6381 { 6382 struct extent_map *em; 6383 struct map_lookup *map; 6384 u64 stripe_offset; 6385 u64 stripe_nr; 6386 u64 stripe_len; 6387 u32 stripe_index; 6388 int data_stripes; 6389 int i; 6390 int ret = 0; 6391 int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); 6392 int num_stripes; 6393 int max_errors = 0; 6394 int tgtdev_indexes = 0; 6395 struct btrfs_io_context *bioc = NULL; 6396 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6397 int dev_replace_is_ongoing = 0; 6398 int num_alloc_stripes; 6399 int patch_the_first_stripe_for_dev_replace = 0; 6400 u64 physical_to_patch_in_first_stripe = 0; 6401 u64 raid56_full_stripe_start = (u64)-1; 6402 struct btrfs_io_geometry geom; 6403 6404 ASSERT(bioc_ret); 6405 ASSERT(op != BTRFS_MAP_DISCARD); 6406 6407 em = btrfs_get_chunk_map(fs_info, logical, *length); 6408 ASSERT(!IS_ERR(em)); 6409 6410 ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); 6411 if (ret < 0) 6412 return ret; 6413 6414 map = em->map_lookup; 6415 6416 *length = geom.len; 6417 stripe_len = geom.stripe_len; 6418 stripe_nr = geom.stripe_nr; 6419 stripe_offset = geom.stripe_offset; 6420 raid56_full_stripe_start = geom.raid56_stripe_offset; 6421 data_stripes = nr_data_stripes(map); 6422 6423 down_read(&dev_replace->rwsem); 6424 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6425 /* 6426 * Hold the semaphore for read during the whole operation, write is 6427 * requested at commit time but must wait. 
6428 */ 6429 if (!dev_replace_is_ongoing) 6430 up_read(&dev_replace->rwsem); 6431 6432 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 6433 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 6434 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 6435 dev_replace->srcdev->devid, 6436 &mirror_num, 6437 &physical_to_patch_in_first_stripe); 6438 if (ret) 6439 goto out; 6440 else 6441 patch_the_first_stripe_for_dev_replace = 1; 6442 } else if (mirror_num > map->num_stripes) { 6443 mirror_num = 0; 6444 } 6445 6446 num_stripes = 1; 6447 stripe_index = 0; 6448 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6449 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6450 &stripe_index); 6451 if (!need_full_stripe(op)) 6452 mirror_num = 1; 6453 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6454 if (need_full_stripe(op)) 6455 num_stripes = map->num_stripes; 6456 else if (mirror_num) 6457 stripe_index = mirror_num - 1; 6458 else { 6459 stripe_index = find_live_mirror(fs_info, map, 0, 6460 dev_replace_is_ongoing); 6461 mirror_num = stripe_index + 1; 6462 } 6463 6464 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6465 if (need_full_stripe(op)) { 6466 num_stripes = map->num_stripes; 6467 } else if (mirror_num) { 6468 stripe_index = mirror_num - 1; 6469 } else { 6470 mirror_num = 1; 6471 } 6472 6473 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6474 u32 factor = map->num_stripes / map->sub_stripes; 6475 6476 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6477 stripe_index *= map->sub_stripes; 6478 6479 if (need_full_stripe(op)) 6480 num_stripes = map->sub_stripes; 6481 else if (mirror_num) 6482 stripe_index += mirror_num - 1; 6483 else { 6484 int old_stripe_index = stripe_index; 6485 stripe_index = find_live_mirror(fs_info, map, 6486 stripe_index, 6487 dev_replace_is_ongoing); 6488 mirror_num = stripe_index - old_stripe_index + 1; 6489 } 6490 6491 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6492 ASSERT(map->stripe_len == BTRFS_STRIPE_LEN); 6493 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6494 /* push stripe_nr back to the start of the full stripe */ 6495 stripe_nr = div64_u64(raid56_full_stripe_start, 6496 stripe_len * data_stripes); 6497 6498 /* RAID[56] write or recovery. Return all stripes */ 6499 num_stripes = map->num_stripes; 6500 max_errors = btrfs_chunk_max_errors(map); 6501 6502 /* Return the length to the full stripe end */ 6503 *length = min(logical + *length, 6504 raid56_full_stripe_start + em->start + 6505 data_stripes * stripe_len) - logical; 6506 stripe_index = 0; 6507 stripe_offset = 0; 6508 } else { 6509 /* 6510 * Mirror #0 or #1 means the original data block. 6511 * Mirror #2 is RAID5 parity block. 6512 * Mirror #3 is RAID6 Q block. 
6513 */ 6514 stripe_nr = div_u64_rem(stripe_nr, 6515 data_stripes, &stripe_index); 6516 if (mirror_num > 1) 6517 stripe_index = data_stripes + mirror_num - 2; 6518 6519 /* We distribute the parity blocks across stripes */ 6520 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 6521 &stripe_index); 6522 if (!need_full_stripe(op) && mirror_num <= 1) 6523 mirror_num = 1; 6524 } 6525 } else { 6526 /* 6527 * after this, stripe_nr is the number of stripes on this 6528 * device we have to walk to find the data, and stripe_index is 6529 * the number of our device in the stripe array 6530 */ 6531 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6532 &stripe_index); 6533 mirror_num = stripe_index + 1; 6534 } 6535 if (stripe_index >= map->num_stripes) { 6536 btrfs_crit(fs_info, 6537 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6538 stripe_index, map->num_stripes); 6539 ret = -EINVAL; 6540 goto out; 6541 } 6542 6543 num_alloc_stripes = num_stripes; 6544 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 6545 if (op == BTRFS_MAP_WRITE) 6546 num_alloc_stripes <<= 1; 6547 if (op == BTRFS_MAP_GET_READ_MIRRORS) 6548 num_alloc_stripes++; 6549 tgtdev_indexes = num_stripes; 6550 } 6551 6552 /* 6553 * If this I/O maps to a single device, try to return the device and 6554 * physical block information on the stack instead of allocating an 6555 * I/O context structure. 6556 */ 6557 if (smap && num_alloc_stripes == 1 && 6558 !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && 6559 (!need_full_stripe(op) || !dev_replace_is_ongoing || 6560 !dev_replace->tgtdev)) { 6561 if (patch_the_first_stripe_for_dev_replace) { 6562 smap->dev = dev_replace->tgtdev; 6563 smap->physical = physical_to_patch_in_first_stripe; 6564 *mirror_num_ret = map->num_stripes + 1; 6565 } else { 6566 set_io_stripe(smap, map, stripe_index, stripe_offset, 6567 stripe_nr); 6568 *mirror_num_ret = mirror_num; 6569 } 6570 *bioc_ret = NULL; 6571 ret = 0; 6572 goto out; 6573 } 6574 6575 bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); 6576 if (!bioc) { 6577 ret = -ENOMEM; 6578 goto out; 6579 } 6580 6581 for (i = 0; i < num_stripes; i++) { 6582 set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset, 6583 stripe_nr); 6584 stripe_index++; 6585 } 6586 6587 /* Build raid_map */ 6588 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 6589 (need_full_stripe(op) || mirror_num > 1)) { 6590 u64 tmp; 6591 unsigned rot; 6592 6593 /* Work out the disk rotation on this stripe-set */ 6594 div_u64_rem(stripe_nr, num_stripes, &rot); 6595 6596 /* Fill in the logical address of each stripe */ 6597 tmp = stripe_nr * data_stripes; 6598 for (i = 0; i < data_stripes; i++) 6599 bioc->raid_map[(i + rot) % num_stripes] = 6600 em->start + (tmp + i) * map->stripe_len; 6601 6602 bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; 6603 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 6604 bioc->raid_map[(i + rot + 1) % num_stripes] = 6605 RAID6_Q_STRIPE; 6606 6607 sort_parity_stripes(bioc, num_stripes); 6608 } 6609 6610 if (need_full_stripe(op)) 6611 max_errors = btrfs_chunk_max_errors(map); 6612 6613 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6614 need_full_stripe(op)) { 6615 handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, 6616 &num_stripes, &max_errors); 6617 } 6618 6619 *bioc_ret = bioc; 6620 bioc->map_type = map->type; 6621 bioc->num_stripes = num_stripes; 6622 bioc->max_errors = max_errors; 6623 bioc->mirror_num = mirror_num; 
6624 6625 /* 6626 * this is the case that REQ_READ && dev_replace_is_ongoing && 6627 * mirror_num == num_stripes + 1 && dev_replace target drive is 6628 * available as a mirror 6629 */ 6630 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6631 WARN_ON(num_stripes > 1); 6632 bioc->stripes[0].dev = dev_replace->tgtdev; 6633 bioc->stripes[0].physical = physical_to_patch_in_first_stripe; 6634 bioc->mirror_num = map->num_stripes + 1; 6635 } 6636 out: 6637 if (dev_replace_is_ongoing) { 6638 lockdep_assert_held(&dev_replace->rwsem); 6639 /* Unlock and let waiting writers proceed */ 6640 up_read(&dev_replace->rwsem); 6641 } 6642 free_extent_map(em); 6643 return ret; 6644 } 6645 6646 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6647 u64 logical, u64 *length, 6648 struct btrfs_io_context **bioc_ret, int mirror_num) 6649 { 6650 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 6651 NULL, &mirror_num, 0); 6652 } 6653 6654 /* For Scrub/replace */ 6655 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6656 u64 logical, u64 *length, 6657 struct btrfs_io_context **bioc_ret) 6658 { 6659 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 6660 NULL, NULL, 1); 6661 } 6662 6663 /* 6664 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it 6665 * is already initialized by the block layer. 6666 */ 6667 static inline void btrfs_bio_init(struct btrfs_bio *bbio, 6668 btrfs_bio_end_io_t end_io, void *private) 6669 { 6670 memset(bbio, 0, offsetof(struct btrfs_bio, bio)); 6671 bbio->end_io = end_io; 6672 bbio->private = private; 6673 } 6674 6675 /* 6676 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for 6677 * btrfs, and is used for all I/O submitted through btrfs_submit_bio. 6678 * 6679 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by 6680 * a mempool. 
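 *
 * Callers can therefore rely on a non-NULL return value and do not need
 * to handle allocation failure.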
6681 */ 6682 struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, 6683 btrfs_bio_end_io_t end_io, void *private) 6684 { 6685 struct bio *bio; 6686 6687 bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); 6688 btrfs_bio_init(btrfs_bio(bio), end_io, private); 6689 return bio; 6690 } 6691 6692 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, 6693 btrfs_bio_end_io_t end_io, void *private) 6694 { 6695 struct bio *bio; 6696 struct btrfs_bio *bbio; 6697 6698 ASSERT(offset <= UINT_MAX && size <= UINT_MAX); 6699 6700 bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); 6701 bbio = btrfs_bio(bio); 6702 btrfs_bio_init(bbio, end_io, private); 6703 6704 bio_trim(bio, offset >> 9, size >> 9); 6705 bbio->iter = bio->bi_iter; 6706 return bio; 6707 } 6708 6709 static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) 6710 { 6711 if (!dev || !dev->bdev) 6712 return; 6713 if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) 6714 return; 6715 6716 if (btrfs_op(bio) == BTRFS_MAP_WRITE) 6717 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 6718 else if (!(bio->bi_opf & REQ_RAHEAD)) 6719 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 6720 if (bio->bi_opf & REQ_PREFLUSH) 6721 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); 6722 } 6723 6724 static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, 6725 struct bio *bio) 6726 { 6727 if (bio->bi_opf & REQ_META) 6728 return fs_info->endio_meta_workers; 6729 return fs_info->endio_workers; 6730 } 6731 6732 static void btrfs_end_bio_work(struct work_struct *work) 6733 { 6734 struct btrfs_bio *bbio = 6735 container_of(work, struct btrfs_bio, end_io_work); 6736 6737 bbio->end_io(bbio); 6738 } 6739 6740 static void btrfs_simple_end_io(struct bio *bio) 6741 { 6742 struct btrfs_fs_info *fs_info = bio->bi_private; 6743 struct btrfs_bio *bbio = btrfs_bio(bio); 6744 6745 btrfs_bio_counter_dec(fs_info); 6746 6747 if (bio->bi_status) 6748 btrfs_log_dev_io_error(bio, bbio->device); 6749 6750 if (bio_op(bio) == REQ_OP_READ) { 6751 INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); 6752 queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); 6753 } else { 6754 bbio->end_io(bbio); 6755 } 6756 } 6757 6758 static void btrfs_raid56_end_io(struct bio *bio) 6759 { 6760 struct btrfs_io_context *bioc = bio->bi_private; 6761 struct btrfs_bio *bbio = btrfs_bio(bio); 6762 6763 btrfs_bio_counter_dec(bioc->fs_info); 6764 bbio->mirror_num = bioc->mirror_num; 6765 bbio->end_io(bbio); 6766 6767 btrfs_put_bioc(bioc); 6768 } 6769 6770 static void btrfs_orig_write_end_io(struct bio *bio) 6771 { 6772 struct btrfs_io_stripe *stripe = bio->bi_private; 6773 struct btrfs_io_context *bioc = stripe->bioc; 6774 struct btrfs_bio *bbio = btrfs_bio(bio); 6775 6776 btrfs_bio_counter_dec(bioc->fs_info); 6777 6778 if (bio->bi_status) { 6779 atomic_inc(&bioc->error); 6780 btrfs_log_dev_io_error(bio, stripe->dev); 6781 } 6782 6783 /* 6784 * Only send an error to the higher layers if it is beyond the tolerance 6785 * threshold.
6786 */ 6787 if (atomic_read(&bioc->error) > bioc->max_errors) 6788 bio->bi_status = BLK_STS_IOERR; 6789 else 6790 bio->bi_status = BLK_STS_OK; 6791 6792 bbio->end_io(bbio); 6793 btrfs_put_bioc(bioc); 6794 } 6795 6796 static void btrfs_clone_write_end_io(struct bio *bio) 6797 { 6798 struct btrfs_io_stripe *stripe = bio->bi_private; 6799 6800 if (bio->bi_status) { 6801 atomic_inc(&stripe->bioc->error); 6802 btrfs_log_dev_io_error(bio, stripe->dev); 6803 } 6804 6805 /* Pass on control to the original bio this one was cloned from */ 6806 bio_endio(stripe->bioc->orig_bio); 6807 bio_put(bio); 6808 } 6809 6810 static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) 6811 { 6812 if (!dev || !dev->bdev || 6813 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 6814 (btrfs_op(bio) == BTRFS_MAP_WRITE && 6815 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6816 bio_io_error(bio); 6817 return; 6818 } 6819 6820 bio_set_dev(bio, dev->bdev); 6821 6822 /* 6823 * For zone append writing, bi_sector must point the beginning of the 6824 * zone 6825 */ 6826 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 6827 u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; 6828 6829 if (btrfs_dev_is_sequential(dev, physical)) { 6830 u64 zone_start = round_down(physical, 6831 dev->fs_info->zone_size); 6832 6833 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; 6834 } else { 6835 bio->bi_opf &= ~REQ_OP_ZONE_APPEND; 6836 bio->bi_opf |= REQ_OP_WRITE; 6837 } 6838 } 6839 btrfs_debug_in_rcu(dev->fs_info, 6840 "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6841 __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, 6842 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), 6843 dev->devid, bio->bi_iter.bi_size); 6844 6845 btrfsic_check_bio(bio); 6846 submit_bio(bio); 6847 } 6848 6849 static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) 6850 { 6851 struct bio *orig_bio = bioc->orig_bio, *bio; 6852 6853 ASSERT(bio_op(orig_bio) != REQ_OP_READ); 6854 6855 /* Reuse the bio embedded into the btrfs_bio for the last mirror */ 6856 if (dev_nr == bioc->num_stripes - 1) { 6857 bio = orig_bio; 6858 bio->bi_end_io = btrfs_orig_write_end_io; 6859 } else { 6860 bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); 6861 bio_inc_remaining(orig_bio); 6862 bio->bi_end_io = btrfs_clone_write_end_io; 6863 } 6864 6865 bio->bi_private = &bioc->stripes[dev_nr]; 6866 bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; 6867 bioc->stripes[dev_nr].bioc = bioc; 6868 btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); 6869 } 6870 6871 void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) 6872 { 6873 u64 logical = bio->bi_iter.bi_sector << 9; 6874 u64 length = bio->bi_iter.bi_size; 6875 u64 map_length = length; 6876 struct btrfs_io_context *bioc = NULL; 6877 struct btrfs_io_stripe smap; 6878 int ret; 6879 6880 btrfs_bio_counter_inc_blocked(fs_info); 6881 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, 6882 &bioc, &smap, &mirror_num, 1); 6883 if (ret) { 6884 btrfs_bio_counter_dec(fs_info); 6885 btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); 6886 return; 6887 } 6888 6889 if (map_length < length) { 6890 btrfs_crit(fs_info, 6891 "mapping failed logical %llu bio len %llu len %llu", 6892 logical, length, map_length); 6893 BUG(); 6894 } 6895 6896 if (!bioc) { 6897 /* Single mirror read/write fast path */ 6898 btrfs_bio(bio)->mirror_num = mirror_num; 6899 btrfs_bio(bio)->device = smap.dev; 
6900 bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; 6901 bio->bi_private = fs_info; 6902 bio->bi_end_io = btrfs_simple_end_io; 6903 btrfs_submit_dev_bio(smap.dev, bio); 6904 } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6905 /* Parity RAID write or read recovery */ 6906 bio->bi_private = bioc; 6907 bio->bi_end_io = btrfs_raid56_end_io; 6908 if (bio_op(bio) == REQ_OP_READ) 6909 raid56_parity_recover(bio, bioc, mirror_num); 6910 else 6911 raid56_parity_write(bio, bioc); 6912 } else { 6913 /* Write to multiple mirrors */ 6914 int total_devs = bioc->num_stripes; 6915 int dev_nr; 6916 6917 bioc->orig_bio = bio; 6918 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) 6919 btrfs_submit_mirrored_bio(bioc, dev_nr); 6920 } 6921 } 6922 6923 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, 6924 const struct btrfs_fs_devices *fs_devices) 6925 { 6926 if (args->fsid == NULL) 6927 return true; 6928 if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) 6929 return true; 6930 return false; 6931 } 6932 6933 static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, 6934 const struct btrfs_device *device) 6935 { 6936 if (args->missing) { 6937 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && 6938 !device->bdev) 6939 return true; 6940 return false; 6941 } 6942 6943 if (device->devid != args->devid) 6944 return false; 6945 if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) 6946 return false; 6947 return true; 6948 } 6949 6950 /* 6951 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 6952 * return NULL. 6953 * 6954 * If devid and uuid are both specified, the match must be exact, otherwise 6955 * only devid is used. 6956 */ 6957 struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, 6958 const struct btrfs_dev_lookup_args *args) 6959 { 6960 struct btrfs_device *device; 6961 struct btrfs_fs_devices *seed_devs; 6962 6963 if (dev_args_match_fs_devices(args, fs_devices)) { 6964 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6965 if (dev_args_match_device(args, device)) 6966 return device; 6967 } 6968 } 6969 6970 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 6971 if (!dev_args_match_fs_devices(args, seed_devs)) 6972 continue; 6973 list_for_each_entry(device, &seed_devs->devices, dev_list) { 6974 if (dev_args_match_device(args, device)) 6975 return device; 6976 } 6977 } 6978 6979 return NULL; 6980 } 6981 6982 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6983 u64 devid, u8 *dev_uuid) 6984 { 6985 struct btrfs_device *device; 6986 unsigned int nofs_flag; 6987 6988 /* 6989 * We call this under the chunk_mutex, so we want to use NOFS for this 6990 * allocation, however we don't want to change btrfs_alloc_device() to 6991 * always do NOFS because we use it in a lot of other GFP_KERNEL safe 6992 * places. 
6993 */ 6994 nofs_flag = memalloc_nofs_save(); 6995 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6996 memalloc_nofs_restore(nofs_flag); 6997 if (IS_ERR(device)) 6998 return device; 6999 7000 list_add(&device->dev_list, &fs_devices->devices); 7001 device->fs_devices = fs_devices; 7002 fs_devices->num_devices++; 7003 7004 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 7005 fs_devices->missing_devices++; 7006 7007 return device; 7008 } 7009 7010 /** 7011 * btrfs_alloc_device - allocate struct btrfs_device 7012 * @fs_info: used only for generating a new devid, can be NULL if 7013 * devid is provided (i.e. @devid != NULL). 7014 * @devid: a pointer to devid for this device. If NULL a new devid 7015 * is generated. 7016 * @uuid: a pointer to UUID for this device. If NULL a new UUID 7017 * is generated. 7018 * 7019 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 7020 * on error. Returned struct is not linked onto any lists and must be 7021 * destroyed with btrfs_free_device. 7022 */ 7023 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 7024 const u64 *devid, 7025 const u8 *uuid) 7026 { 7027 struct btrfs_device *dev; 7028 u64 tmp; 7029 7030 if (WARN_ON(!devid && !fs_info)) 7031 return ERR_PTR(-EINVAL); 7032 7033 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 7034 if (!dev) 7035 return ERR_PTR(-ENOMEM); 7036 7037 INIT_LIST_HEAD(&dev->dev_list); 7038 INIT_LIST_HEAD(&dev->dev_alloc_list); 7039 INIT_LIST_HEAD(&dev->post_commit_list); 7040 7041 atomic_set(&dev->dev_stats_ccnt, 0); 7042 btrfs_device_data_ordered_init(dev); 7043 extent_io_tree_init(fs_info, &dev->alloc_state, 7044 IO_TREE_DEVICE_ALLOC_STATE, NULL); 7045 7046 if (devid) 7047 tmp = *devid; 7048 else { 7049 int ret; 7050 7051 ret = find_next_devid(fs_info, &tmp); 7052 if (ret) { 7053 btrfs_free_device(dev); 7054 return ERR_PTR(ret); 7055 } 7056 } 7057 dev->devid = tmp; 7058 7059 if (uuid) 7060 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 7061 else 7062 generate_random_uuid(dev->uuid); 7063 7064 return dev; 7065 } 7066 7067 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 7068 u64 devid, u8 *uuid, bool error) 7069 { 7070 if (error) 7071 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 7072 devid, uuid); 7073 else 7074 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 7075 devid, uuid); 7076 } 7077 7078 u64 btrfs_calc_stripe_length(const struct extent_map *em) 7079 { 7080 const struct map_lookup *map = em->map_lookup; 7081 const int data_stripes = calc_data_stripes(map->type, map->num_stripes); 7082 7083 return div_u64(em->len, data_stripes); 7084 } 7085 7086 #if BITS_PER_LONG == 32 7087 /* 7088 * Due to the page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE 7089 * can't be accessed on 32bit systems. 7090 * 7091 * This function does a mount-time check to reject the fs if it already has 7092 * a metadata chunk beyond that limit. 7093 */ 7094 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 7095 u64 logical, u64 length, u64 type) 7096 { 7097 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 7098 return 0; 7099 7100 if (logical + length < MAX_LFS_FILESIZE) 7101 return 0; 7102 7103 btrfs_err_32bit_limit(fs_info); 7104 return -EOVERFLOW; 7105 } 7106 7107 /* 7108 * This is to give early warning for any metadata chunk reaching 7109 * BTRFS_32BIT_EARLY_WARN_THRESHOLD. 7110 * Although we can still access the metadata, it's not going to be possible 7111 * once the limit is reached.
7112 */ 7113 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 7114 u64 logical, u64 length, u64 type) 7115 { 7116 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 7117 return; 7118 7119 if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) 7120 return; 7121 7122 btrfs_warn_32bit_limit(fs_info); 7123 } 7124 #endif 7125 7126 static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, 7127 u64 devid, u8 *uuid) 7128 { 7129 struct btrfs_device *dev; 7130 7131 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7132 btrfs_report_missing_device(fs_info, devid, uuid, true); 7133 return ERR_PTR(-ENOENT); 7134 } 7135 7136 dev = add_missing_dev(fs_info->fs_devices, devid, uuid); 7137 if (IS_ERR(dev)) { 7138 btrfs_err(fs_info, "failed to init missing device %llu: %ld", 7139 devid, PTR_ERR(dev)); 7140 return dev; 7141 } 7142 btrfs_report_missing_device(fs_info, devid, uuid, false); 7143 7144 return dev; 7145 } 7146 7147 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 7148 struct btrfs_chunk *chunk) 7149 { 7150 BTRFS_DEV_LOOKUP_ARGS(args); 7151 struct btrfs_fs_info *fs_info = leaf->fs_info; 7152 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7153 struct map_lookup *map; 7154 struct extent_map *em; 7155 u64 logical; 7156 u64 length; 7157 u64 devid; 7158 u64 type; 7159 u8 uuid[BTRFS_UUID_SIZE]; 7160 int index; 7161 int num_stripes; 7162 int ret; 7163 int i; 7164 7165 logical = key->offset; 7166 length = btrfs_chunk_length(leaf, chunk); 7167 type = btrfs_chunk_type(leaf, chunk); 7168 index = btrfs_bg_flags_to_raid_index(type); 7169 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 7170 7171 #if BITS_PER_LONG == 32 7172 ret = check_32bit_meta_chunk(fs_info, logical, length, type); 7173 if (ret < 0) 7174 return ret; 7175 warn_32bit_meta_chunk(fs_info, logical, length, type); 7176 #endif 7177 7178 /* 7179 * Only need to verify chunk item if we're reading from sys chunk array, 7180 * as chunk item in tree block is already verified by tree-checker. 7181 */ 7182 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 7183 ret = btrfs_check_chunk_valid(leaf, chunk, logical); 7184 if (ret) 7185 return ret; 7186 } 7187 7188 read_lock(&map_tree->lock); 7189 em = lookup_extent_mapping(map_tree, logical, 1); 7190 read_unlock(&map_tree->lock); 7191 7192 /* already mapped? */ 7193 if (em && em->start <= logical && em->start + em->len > logical) { 7194 free_extent_map(em); 7195 return 0; 7196 } else if (em) { 7197 free_extent_map(em); 7198 } 7199 7200 em = alloc_extent_map(); 7201 if (!em) 7202 return -ENOMEM; 7203 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 7204 if (!map) { 7205 free_extent_map(em); 7206 return -ENOMEM; 7207 } 7208 7209 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 7210 em->map_lookup = map; 7211 em->start = logical; 7212 em->len = length; 7213 em->orig_start = 0; 7214 em->block_start = 0; 7215 em->block_len = em->len; 7216 7217 map->num_stripes = num_stripes; 7218 map->io_width = btrfs_chunk_io_width(leaf, chunk); 7219 map->io_align = btrfs_chunk_io_align(leaf, chunk); 7220 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 7221 map->type = type; 7222 /* 7223 * We can't use the sub_stripes value, as for profiles other than 7224 * RAID10, they may have 0 as sub_stripes for filesystems created by 7225 * older mkfs (<v5.4). 7226 * In that case, it can cause divide-by-zero errors later. 7227 * Since currently sub_stripes is fixed for each profile, let's 7228 * use the trusted value instead. 
7229 */ 7230 map->sub_stripes = btrfs_raid_array[index].sub_stripes; 7231 map->verified_stripes = 0; 7232 em->orig_block_len = btrfs_calc_stripe_length(em); 7233 for (i = 0; i < num_stripes; i++) { 7234 map->stripes[i].physical = 7235 btrfs_stripe_offset_nr(leaf, chunk, i); 7236 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 7237 args.devid = devid; 7238 read_extent_buffer(leaf, uuid, (unsigned long) 7239 btrfs_stripe_dev_uuid_nr(chunk, i), 7240 BTRFS_UUID_SIZE); 7241 args.uuid = uuid; 7242 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); 7243 if (!map->stripes[i].dev) { 7244 map->stripes[i].dev = handle_missing_device(fs_info, 7245 devid, uuid); 7246 if (IS_ERR(map->stripes[i].dev)) { 7247 free_extent_map(em); 7248 return PTR_ERR(map->stripes[i].dev); 7249 } 7250 } 7251 7252 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 7253 &(map->stripes[i].dev->dev_state)); 7254 } 7255 7256 write_lock(&map_tree->lock); 7257 ret = add_extent_mapping(map_tree, em, 0); 7258 write_unlock(&map_tree->lock); 7259 if (ret < 0) { 7260 btrfs_err(fs_info, 7261 "failed to add chunk map, start=%llu len=%llu: %d", 7262 em->start, em->len, ret); 7263 } 7264 free_extent_map(em); 7265 7266 return ret; 7267 } 7268 7269 static void fill_device_from_item(struct extent_buffer *leaf, 7270 struct btrfs_dev_item *dev_item, 7271 struct btrfs_device *device) 7272 { 7273 unsigned long ptr; 7274 7275 device->devid = btrfs_device_id(leaf, dev_item); 7276 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 7277 device->total_bytes = device->disk_total_bytes; 7278 device->commit_total_bytes = device->disk_total_bytes; 7279 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 7280 device->commit_bytes_used = device->bytes_used; 7281 device->type = btrfs_device_type(leaf, dev_item); 7282 device->io_align = btrfs_device_io_align(leaf, dev_item); 7283 device->io_width = btrfs_device_io_width(leaf, dev_item); 7284 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 7285 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 7286 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 7287 7288 ptr = btrfs_device_uuid(dev_item); 7289 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 7290 } 7291 7292 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 7293 u8 *fsid) 7294 { 7295 struct btrfs_fs_devices *fs_devices; 7296 int ret; 7297 7298 lockdep_assert_held(&uuid_mutex); 7299 ASSERT(fsid); 7300 7301 /* This will match only for multi-device seed fs */ 7302 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) 7303 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 7304 return fs_devices; 7305 7306 7307 fs_devices = find_fsid(fsid, NULL); 7308 if (!fs_devices) { 7309 if (!btrfs_test_opt(fs_info, DEGRADED)) 7310 return ERR_PTR(-ENOENT); 7311 7312 fs_devices = alloc_fs_devices(fsid, NULL); 7313 if (IS_ERR(fs_devices)) 7314 return fs_devices; 7315 7316 fs_devices->seeding = true; 7317 fs_devices->opened = 1; 7318 return fs_devices; 7319 } 7320 7321 /* 7322 * Upon first call for a seed fs fsid, just create a private copy of the 7323 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list 7324 */ 7325 fs_devices = clone_fs_devices(fs_devices); 7326 if (IS_ERR(fs_devices)) 7327 return fs_devices; 7328 7329 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 7330 if (ret) { 7331 free_fs_devices(fs_devices); 7332 return ERR_PTR(ret); 7333 } 7334 7335 if (!fs_devices->seeding) { 7336 
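/*
 * The fsid recorded for the seed resolves to a filesystem that is not
 * actually a seed device; the on-disk metadata is inconsistent, refuse it.
 */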
close_fs_devices(fs_devices); 7337 free_fs_devices(fs_devices); 7338 return ERR_PTR(-EINVAL); 7339 } 7340 7341 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); 7342 7343 return fs_devices; 7344 } 7345 7346 static int read_one_dev(struct extent_buffer *leaf, 7347 struct btrfs_dev_item *dev_item) 7348 { 7349 BTRFS_DEV_LOOKUP_ARGS(args); 7350 struct btrfs_fs_info *fs_info = leaf->fs_info; 7351 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7352 struct btrfs_device *device; 7353 u64 devid; 7354 int ret; 7355 u8 fs_uuid[BTRFS_FSID_SIZE]; 7356 u8 dev_uuid[BTRFS_UUID_SIZE]; 7357 7358 devid = btrfs_device_id(leaf, dev_item); 7359 args.devid = devid; 7360 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 7361 BTRFS_UUID_SIZE); 7362 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 7363 BTRFS_FSID_SIZE); 7364 args.uuid = dev_uuid; 7365 args.fsid = fs_uuid; 7366 7367 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 7368 fs_devices = open_seed_devices(fs_info, fs_uuid); 7369 if (IS_ERR(fs_devices)) 7370 return PTR_ERR(fs_devices); 7371 } 7372 7373 device = btrfs_find_device(fs_info->fs_devices, &args); 7374 if (!device) { 7375 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7376 btrfs_report_missing_device(fs_info, devid, 7377 dev_uuid, true); 7378 return -ENOENT; 7379 } 7380 7381 device = add_missing_dev(fs_devices, devid, dev_uuid); 7382 if (IS_ERR(device)) { 7383 btrfs_err(fs_info, 7384 "failed to add missing dev %llu: %ld", 7385 devid, PTR_ERR(device)); 7386 return PTR_ERR(device); 7387 } 7388 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 7389 } else { 7390 if (!device->bdev) { 7391 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7392 btrfs_report_missing_device(fs_info, 7393 devid, dev_uuid, true); 7394 return -ENOENT; 7395 } 7396 btrfs_report_missing_device(fs_info, devid, 7397 dev_uuid, false); 7398 } 7399 7400 if (!device->bdev && 7401 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 7402 /* 7403 * this happens when a device that was properly setup 7404 * in the device info lists suddenly goes bad. 
7405 * device->bdev is NULL, and so we have to set 7406 * device->missing to one here 7407 */ 7408 device->fs_devices->missing_devices++; 7409 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 7410 } 7411 7412 /* Move the device to its own fs_devices */ 7413 if (device->fs_devices != fs_devices) { 7414 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 7415 &device->dev_state)); 7416 7417 list_move(&device->dev_list, &fs_devices->devices); 7418 device->fs_devices->num_devices--; 7419 fs_devices->num_devices++; 7420 7421 device->fs_devices->missing_devices--; 7422 fs_devices->missing_devices++; 7423 7424 device->fs_devices = fs_devices; 7425 } 7426 } 7427 7428 if (device->fs_devices != fs_info->fs_devices) { 7429 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 7430 if (device->generation != 7431 btrfs_device_generation(leaf, dev_item)) 7432 return -EINVAL; 7433 } 7434 7435 fill_device_from_item(leaf, dev_item, device); 7436 if (device->bdev) { 7437 u64 max_total_bytes = bdev_nr_bytes(device->bdev); 7438 7439 if (device->total_bytes > max_total_bytes) { 7440 btrfs_err(fs_info, 7441 "device total_bytes should be at most %llu but found %llu", 7442 max_total_bytes, device->total_bytes); 7443 return -EINVAL; 7444 } 7445 } 7446 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 7447 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 7448 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 7449 device->fs_devices->total_rw_bytes += device->total_bytes; 7450 atomic64_add(device->total_bytes - device->bytes_used, 7451 &fs_info->free_chunk_space); 7452 } 7453 ret = 0; 7454 return ret; 7455 } 7456 7457 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 7458 { 7459 struct btrfs_super_block *super_copy = fs_info->super_copy; 7460 struct extent_buffer *sb; 7461 struct btrfs_disk_key *disk_key; 7462 struct btrfs_chunk *chunk; 7463 u8 *array_ptr; 7464 unsigned long sb_array_offset; 7465 int ret = 0; 7466 u32 num_stripes; 7467 u32 array_size; 7468 u32 len = 0; 7469 u32 cur_offset; 7470 u64 type; 7471 struct btrfs_key key; 7472 7473 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 7474 7475 /* 7476 * We allocated a dummy extent, just to use extent buffer accessors. 7477 * There will be unused space after BTRFS_SUPER_INFO_SIZE, but 7478 * that's fine, we will not go beyond system chunk array anyway. 
7479 */ 7480 sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET); 7481 if (!sb) 7482 return -ENOMEM; 7483 set_extent_buffer_uptodate(sb); 7484 7485 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 7486 array_size = btrfs_super_sys_array_size(super_copy); 7487 7488 array_ptr = super_copy->sys_chunk_array; 7489 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 7490 cur_offset = 0; 7491 7492 while (cur_offset < array_size) { 7493 disk_key = (struct btrfs_disk_key *)array_ptr; 7494 len = sizeof(*disk_key); 7495 if (cur_offset + len > array_size) 7496 goto out_short_read; 7497 7498 btrfs_disk_key_to_cpu(&key, disk_key); 7499 7500 array_ptr += len; 7501 sb_array_offset += len; 7502 cur_offset += len; 7503 7504 if (key.type != BTRFS_CHUNK_ITEM_KEY) { 7505 btrfs_err(fs_info, 7506 "unexpected item type %u in sys_array at offset %u", 7507 (u32)key.type, cur_offset); 7508 ret = -EIO; 7509 break; 7510 } 7511 7512 chunk = (struct btrfs_chunk *)sb_array_offset; 7513 /* 7514 * At least one btrfs_chunk with one stripe must be present, 7515 * exact stripe count check comes afterwards 7516 */ 7517 len = btrfs_chunk_item_size(1); 7518 if (cur_offset + len > array_size) 7519 goto out_short_read; 7520 7521 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 7522 if (!num_stripes) { 7523 btrfs_err(fs_info, 7524 "invalid number of stripes %u in sys_array at offset %u", 7525 num_stripes, cur_offset); 7526 ret = -EIO; 7527 break; 7528 } 7529 7530 type = btrfs_chunk_type(sb, chunk); 7531 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 7532 btrfs_err(fs_info, 7533 "invalid chunk type %llu in sys_array at offset %u", 7534 type, cur_offset); 7535 ret = -EIO; 7536 break; 7537 } 7538 7539 len = btrfs_chunk_item_size(num_stripes); 7540 if (cur_offset + len > array_size) 7541 goto out_short_read; 7542 7543 ret = read_one_chunk(&key, sb, chunk); 7544 if (ret) 7545 break; 7546 7547 array_ptr += len; 7548 sb_array_offset += len; 7549 cur_offset += len; 7550 } 7551 clear_extent_buffer_uptodate(sb); 7552 free_extent_buffer_stale(sb); 7553 return ret; 7554 7555 out_short_read: 7556 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 7557 len, cur_offset); 7558 clear_extent_buffer_uptodate(sb); 7559 free_extent_buffer_stale(sb); 7560 return -EIO; 7561 } 7562 7563 /* 7564 * Check if all chunks in the fs are OK for read-write degraded mount 7565 * 7566 * If the @failing_dev is specified, it's accounted as missing. 7567 * 7568 * Return true if all chunks meet the minimal RW mount requirements. 7569 * Return false if any chunk doesn't meet the minimal RW mount requirements. 7570 */ 7571 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7572 struct btrfs_device *failing_dev) 7573 { 7574 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7575 struct extent_map *em; 7576 u64 next_start = 0; 7577 bool ret = true; 7578 7579 read_lock(&map_tree->lock); 7580 em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7581 read_unlock(&map_tree->lock); 7582 /* No chunk at all? 
Return false anyway */ 7583 if (!em) { 7584 ret = false; 7585 goto out; 7586 } 7587 while (em) { 7588 struct map_lookup *map; 7589 int missing = 0; 7590 int max_tolerated; 7591 int i; 7592 7593 map = em->map_lookup; 7594 max_tolerated = 7595 btrfs_get_num_tolerated_disk_barrier_failures( 7596 map->type); 7597 for (i = 0; i < map->num_stripes; i++) { 7598 struct btrfs_device *dev = map->stripes[i].dev; 7599 7600 if (!dev || !dev->bdev || 7601 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 7602 dev->last_flush_error) 7603 missing++; 7604 else if (failing_dev && failing_dev == dev) 7605 missing++; 7606 } 7607 if (missing > max_tolerated) { 7608 if (!failing_dev) 7609 btrfs_warn(fs_info, 7610 "chunk %llu missing %d devices, max tolerance is %d for writable mount", 7611 em->start, missing, max_tolerated); 7612 free_extent_map(em); 7613 ret = false; 7614 goto out; 7615 } 7616 next_start = extent_map_end(em); 7617 free_extent_map(em); 7618 7619 read_lock(&map_tree->lock); 7620 em = lookup_extent_mapping(map_tree, next_start, 7621 (u64)(-1) - next_start); 7622 read_unlock(&map_tree->lock); 7623 } 7624 out: 7625 return ret; 7626 } 7627 7628 static void readahead_tree_node_children(struct extent_buffer *node) 7629 { 7630 int i; 7631 const int nr_items = btrfs_header_nritems(node); 7632 7633 for (i = 0; i < nr_items; i++) 7634 btrfs_readahead_node_child(node, i); 7635 } 7636 7637 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 7638 { 7639 struct btrfs_root *root = fs_info->chunk_root; 7640 struct btrfs_path *path; 7641 struct extent_buffer *leaf; 7642 struct btrfs_key key; 7643 struct btrfs_key found_key; 7644 int ret; 7645 int slot; 7646 int iter_ret = 0; 7647 u64 total_dev = 0; 7648 u64 last_ra_node = 0; 7649 7650 path = btrfs_alloc_path(); 7651 if (!path) 7652 return -ENOMEM; 7653 7654 /* 7655 * uuid_mutex is needed only if we are mounting a sprout FS, 7656 * otherwise we don't need it. 7657 */ 7658 mutex_lock(&uuid_mutex); 7659 7660 /* 7661 * It is possible for mount and umount to race in such a way that 7662 * we execute this code path, but open_fs_devices failed to clear 7663 * total_rw_bytes. We certainly want it cleared before reading the 7664 * device items, so clear it here. 7665 */ 7666 fs_info->fs_devices->total_rw_bytes = 0; 7667 7668 /* 7669 * Lockdep complains about possible circular locking dependency between 7670 * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores 7671 * used for freeze protection of a fs (struct super_block.s_writers), 7672 * which we take when starting a transaction, and extent buffers of the 7673 * chunk tree if we call read_one_dev() while holding a lock on an 7674 * extent buffer of the chunk tree. Since we are mounting the filesystem 7675 * and at this point there can't be any concurrent task modifying the 7676 * chunk tree, to keep it simple, just skip locking on the chunk tree. 7677 */ 7678 ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); 7679 path->skip_locking = 1; 7680 7681 /* 7682 * Read all device items, and then all the chunk items. All 7683 * device items are found before any chunk item (their object id 7684 * is smaller than the lowest possible object id for a chunk 7685 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
static void readahead_tree_node_children(struct extent_buffer *node)
{
	int i;
	const int nr_items = btrfs_header_nritems(node);

	for (i = 0; i < nr_items; i++)
		btrfs_readahead_node_child(node, i);
}

int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
	int iter_ret = 0;
	u64 total_dev = 0;
	u64 last_ra_node = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * uuid_mutex is needed only if we are mounting a sprout FS,
	 * otherwise we don't need it.
	 */
	mutex_lock(&uuid_mutex);

	/*
	 * It is possible for mount and umount to race in such a way that
	 * we execute this code path, but open_fs_devices failed to clear
	 * total_rw_bytes. We certainly want it cleared before reading the
	 * device items, so clear it here.
	 */
	fs_info->fs_devices->total_rw_bytes = 0;

	/*
	 * Lockdep complains about a possible circular locking dependency
	 * between a disk's open_mutex (struct gendisk.open_mutex), the rw
	 * semaphores used for freeze protection of a fs (struct
	 * super_block.s_writers), which we take when starting a transaction,
	 * and extent buffers of the chunk tree if we call read_one_dev()
	 * while holding a lock on an extent buffer of the chunk tree. Since
	 * we are mounting the filesystem and at this point there can't be any
	 * concurrent task modifying the chunk tree, to keep it simple, just
	 * skip locking on the chunk tree.
	 */
	ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
	path->skip_locking = 1;

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		struct extent_buffer *node = path->nodes[1];

		leaf = path->nodes[0];
		slot = path->slots[0];

		if (node) {
			if (last_ra_node != node->start) {
				readahead_tree_node_children(node);
				last_ra_node = node->start;
			}
		}
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;

			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(leaf, dev_item);
			if (ret)
				goto error;
			total_dev++;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;

			/*
			 * We are only called at mount time, so no need to take
			 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
			 * we always lock first fs_info->chunk_mutex before
			 * acquiring any locks on the chunk tree. This is a
			 * requirement for chunk allocation, see the comment on
			 * top of btrfs_chunk_alloc() for details.
			 */
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(&found_key, leaf, chunk);
			if (ret)
				goto error;
		}
	}
	/* Catch error found during iteration */
	if (iter_ret < 0) {
		ret = iter_ret;
		goto error;
	}

	/*
	 * After loading the chunk tree, we've got all device information,
	 * do another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_warn(fs_info,
"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
			   btrfs_super_num_devices(fs_info->super_copy),
			   total_dev);
		fs_info->fs_devices->total_devices = total_dev;
		btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}

int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;
	int ret = 0;

	fs_devices->fs_info = fs_info;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list)
		device->fs_info = fs_info;

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
			device->fs_info = fs_info;
			ret = btrfs_get_dev_zone_info(device, false);
			if (ret)
				break;
		}

		seed_devs->fs_info = fs_info;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}
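/*
 * The two accessors below read and write a single counter of a
 * btrfs_dev_stats_item. On disk the item is effectively just an array of
 * little-endian 64-bit counters, something like (a sketch matching the
 * size checks below; see the on-disk format headers for the authoritative
 * definition):
 *
 *	struct btrfs_dev_stats_item {
 *		__le64 values[BTRFS_DEV_STAT_VALUES_MAX];
 *	};
 *
 * Items written by older kernels may be shorter, which is why
 * btrfs_device_init_dev_stats() compares item_size against
 * (1 + i) * sizeof(__le64) before trusting each slot.
 */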
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
				 const struct btrfs_dev_stats_item *ptr,
				 int index)
{
	u64 val;

	read_extent_buffer(eb, &val,
			   offsetof(struct btrfs_dev_stats_item, values) +
			    ((unsigned long)ptr) + (index * sizeof(u64)),
			   sizeof(val));
	return val;
}

static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
				      struct btrfs_dev_stats_item *ptr,
				      int index, u64 val)
{
	write_extent_buffer(eb, &val,
			    offsetof(struct btrfs_dev_stats_item, values) +
			     ((unsigned long)ptr) + (index * sizeof(u64)),
			    sizeof(val));
}

static int btrfs_device_init_dev_stats(struct btrfs_device *device,
				       struct btrfs_path *path)
{
	struct btrfs_dev_stats_item *ptr;
	struct extent_buffer *eb;
	struct btrfs_key key;
	int item_size;
	int i, ret, slot;

	if (!device->fs_info->dev_root)
		return 0;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;
	ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
	if (ret) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			btrfs_dev_stat_set(device, i, 0);
		device->dev_stats_valid = 1;
		btrfs_release_path(path);
		return ret < 0 ? ret : 0;
	}
	slot = path->slots[0];
	eb = path->nodes[0];
	item_size = btrfs_item_size(eb, slot);

	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
		if (item_size >= (1 + i) * sizeof(__le64))
			btrfs_dev_stat_set(device, i,
					   btrfs_dev_stats_value(eb, ptr, i));
		else
			btrfs_dev_stat_set(device, i, 0);
	}

	device->dev_stats_valid = 1;
	btrfs_dev_stat_print_on_load(device);
	btrfs_release_path(path);

	return 0;
}

int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		ret = btrfs_device_init_dev_stats(device, path);
		if (ret)
			goto out;
	}
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
			ret = btrfs_device_init_dev_stats(device, path);
			if (ret)
				goto out;
		}
	}
out:
	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_free_path(path);
	return ret;
}
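/*
 * update_dev_stat_item() below relies on the btrfs_search_slot() return
 * convention: 0 means the exact key was found, 1 means it was not found
 * (with the path pointing at the slot where it would be inserted), and a
 * negative value is an error. That yields the three cases handled below:
 * update in place, replace an existing but too small item, or insert a
 * brand new one.
 */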
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			      ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				      rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset.
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);

	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
		   rcu_str_deref(dev->name),
		   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
		   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
		   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
		   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
		   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
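/*
 * btrfs_get_dev_stats() below backs the BTRFS_IOC_GET_DEV_STATS ioctl
 * (what "btrfs device stats" ultimately uses). A minimal userspace sketch,
 * assuming fd is open on the mounted filesystem (illustrative only, no
 * error handling):
 *
 *	struct btrfs_ioctl_get_dev_stats stats = {
 *		.devid = 1,
 *		.nr_items = BTRFS_DEV_STAT_VALUES_MAX,
 *	};
 *
 *	ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &stats);
 *
 * Setting BTRFS_DEV_STATS_RESET in .flags selects the read-and-reset path
 * below (a privileged operation).
 */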
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	args.devid = stats->devid;
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_set(dev, i, 0);
		}
		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
			   current->comm, task_pid_nr(current));
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

/*
 * Update the size and bytes used for each device where it changed. This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
	struct btrfs_device *curr, *next;

	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

	if (list_empty(&trans->dev_update_list))
		return;

	/*
	 * We don't need the device_list_mutex here. This list is owned by the
	 * transaction and the transaction must complete before the device is
	 * released.
	 */
	mutex_lock(&trans->fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&curr->post_commit_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
		curr->commit_bytes_used = curr->bytes_used;
	}
	mutex_unlock(&trans->fs_info->chunk_mutex);
}
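/*
 * A concrete instance of the above (assuming the usual callers, e.g. the
 * device grow/shrink paths, which put the device on trans->dev_update_list):
 * the in-memory disk_total_bytes may change in the middle of a transaction,
 * but the superblock write-out reads the commit_* copies, so they must only
 * be updated here, once the commit is underway and the new sizes are
 * consistent with the metadata being committed.
 */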
/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].ncopies;
}
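/*
 * A worked example: for BTRFS_BLOCK_GROUP_RAID1 the factor is 2 (two
 * copies), so allocating a 1GiB chunk of that profile consumes 2GiB of raw
 * device space; for RAID0 and single it is 1. Parity profiles are not
 * covered by a constant factor, as their overhead depends on the number of
 * stripes.
 */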
static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

	map = em->map_lookup;
	stripe_len = btrfs_calc_stripe_length(em);
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
			  physical_offset, devid, em->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	/*
	 * Very old mkfs.btrfs (before v4.1) will not respect the reserved
	 * space. Although the kernel can handle it without problems, it's
	 * better to warn the user.
	 */
	if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
		btrfs_warn(fs_info,
		"devid %llu physical %llu len %llu inside the reserved space",
			   devid, physical_offset, physical_len);

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
				"too many dev extents for chunk %llu found",
					  em->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			physical_offset, devid);
		ret = -EUCLEAN;
	}

	/* Make sure no dev extent is beyond device boundary */
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	free_extent_map(em);
	return ret;
}

static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct rb_node *node;
	int ret = 0;

	read_lock(&em_tree->lock);
	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
		em = rb_entry(node, struct extent_map, rb_node);
		if (em->map_lookup->num_stripes !=
		    em->map_lookup->verified_stripes) {
			btrfs_err(fs_info,
			"chunk %llu has missing dev extent, have %d expect %d",
				  em->start, em->map_lookup->verified_stripes,
				  em->map_lookup->num_stripes);
			ret = -EUCLEAN;
			goto out;
		}
	}
out:
	read_unlock(&em_tree->lock);
	return ret;
}
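/*
 * Together the two helpers above check the invariant both ways: every
 * DEV_EXTENT item must match exactly one stripe of an existing chunk (same
 * devid and physical offset, length equal to the chunk's stripe length),
 * and every stripe of every chunk must have been claimed by exactly one
 * dev extent (verified_stripes == num_stripes). E.g. a RAID1 chunk with
 * two stripes must be backed by exactly two dev extents, one per device.
 */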
/*
 * Ensure that all dev extents are mapped to the correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be
 * about the same size as the chunk tree. This slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However if the dev root is fine, but the tree itself is corrupted
	 * we'd still fail to mount. This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			break;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return node != NULL;
}

static int relocating_repair_kthread(void *data)
{
	struct btrfs_block_group *cache = data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	target = cache->start;
	btrfs_put_block_group(cache);

	sb_start_write(fs_info->sb);
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
		sb_end_write(fs_info->sb);
		return -EBUSY;
	}

	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

	if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
		goto out;

	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);
	sb_end_write(fs_info->sb);

	return ret;
}
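/*
 * On a zoned filesystem a failed write cannot be repaired in place, since
 * sequential zones only allow appending. btrfs_repair_one_zone() below
 * therefore flags the affected block group and hands it to
 * relocating_repair_kthread() above, which moves the data away via chunk
 * relocation instead of rewriting it.
 */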
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;

	if (!btrfs_is_zoned(fs_info))
		return false;

	/* Do not attempt to repair in degraded state */
	if (btrfs_test_opt(fs_info, DEGRADED))
		return true;

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return true;

	if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
		btrfs_put_block_group(cache);
		return true;
	}

	kthread_run(relocating_repair_kthread, cache,
		    "btrfs-relocating-repair");

	return true;
}

int __init btrfs_bioset_init(void)
{
	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;
	return 0;
}

void __cold btrfs_bioset_exit(void)
{
	bioset_exit(&btrfs_bioset);
}
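/*
 * A usage sketch for the bioset set up above (illustrative): since the
 * front pad is offsetof(struct btrfs_bio, bio), every bio allocated from
 * btrfs_bioset lives embedded at the tail of a struct btrfs_bio, so the
 * containing structure can be recovered with container_of():
 *
 *	struct bio *bio = bio_alloc_bioset(bdev, nr_vecs, opf, GFP_NOFS,
 *					   &btrfs_bioset);
 *	struct btrfs_bio *bbio = container_of(bio, struct btrfs_bio, bio);
 */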