1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/sched/mm.h> 8 #include <linux/bio.h> 9 #include <linux/slab.h> 10 #include <linux/blkdev.h> 11 #include <linux/ratelimit.h> 12 #include <linux/kthread.h> 13 #include <linux/raid/pq.h> 14 #include <linux/semaphore.h> 15 #include <linux/uuid.h> 16 #include <linux/list_sort.h> 17 #include <linux/namei.h> 18 #include "misc.h" 19 #include "ctree.h" 20 #include "extent_map.h" 21 #include "disk-io.h" 22 #include "transaction.h" 23 #include "print-tree.h" 24 #include "volumes.h" 25 #include "raid56.h" 26 #include "async-thread.h" 27 #include "check-integrity.h" 28 #include "rcu-string.h" 29 #include "dev-replace.h" 30 #include "sysfs.h" 31 #include "tree-checker.h" 32 #include "space-info.h" 33 #include "block-group.h" 34 #include "discard.h" 35 #include "zoned.h" 36 #include "fs.h" 37 #include "accessors.h" 38 #include "uuid-tree.h" 39 #include "ioctl.h" 40 #include "relocation.h" 41 #include "scrub.h" 42 #include "super.h" 43 44 static struct bio_set btrfs_bioset; 45 46 #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ 47 BTRFS_BLOCK_GROUP_RAID10 | \ 48 BTRFS_BLOCK_GROUP_RAID56_MASK) 49 50 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 51 [BTRFS_RAID_RAID10] = { 52 .sub_stripes = 2, 53 .dev_stripes = 1, 54 .devs_max = 0, /* 0 == as many as possible */ 55 .devs_min = 2, 56 .tolerated_failures = 1, 57 .devs_increment = 2, 58 .ncopies = 2, 59 .nparity = 0, 60 .raid_name = "raid10", 61 .bg_flag = BTRFS_BLOCK_GROUP_RAID10, 62 .mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, 63 }, 64 [BTRFS_RAID_RAID1] = { 65 .sub_stripes = 1, 66 .dev_stripes = 1, 67 .devs_max = 2, 68 .devs_min = 2, 69 .tolerated_failures = 1, 70 .devs_increment = 2, 71 .ncopies = 2, 72 .nparity = 0, 73 .raid_name = "raid1", 74 .bg_flag = BTRFS_BLOCK_GROUP_RAID1, 75 .mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, 76 }, 77 [BTRFS_RAID_RAID1C3] = { 78 .sub_stripes = 1, 79 .dev_stripes = 1, 80 .devs_max = 3, 81 .devs_min = 3, 82 .tolerated_failures = 2, 83 .devs_increment = 3, 84 .ncopies = 3, 85 .nparity = 0, 86 .raid_name = "raid1c3", 87 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, 88 .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, 89 }, 90 [BTRFS_RAID_RAID1C4] = { 91 .sub_stripes = 1, 92 .dev_stripes = 1, 93 .devs_max = 4, 94 .devs_min = 4, 95 .tolerated_failures = 3, 96 .devs_increment = 4, 97 .ncopies = 4, 98 .nparity = 0, 99 .raid_name = "raid1c4", 100 .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, 101 .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, 102 }, 103 [BTRFS_RAID_DUP] = { 104 .sub_stripes = 1, 105 .dev_stripes = 2, 106 .devs_max = 1, 107 .devs_min = 1, 108 .tolerated_failures = 0, 109 .devs_increment = 1, 110 .ncopies = 2, 111 .nparity = 0, 112 .raid_name = "dup", 113 .bg_flag = BTRFS_BLOCK_GROUP_DUP, 114 .mindev_error = 0, 115 }, 116 [BTRFS_RAID_RAID0] = { 117 .sub_stripes = 1, 118 .dev_stripes = 1, 119 .devs_max = 0, 120 .devs_min = 1, 121 .tolerated_failures = 0, 122 .devs_increment = 1, 123 .ncopies = 1, 124 .nparity = 0, 125 .raid_name = "raid0", 126 .bg_flag = BTRFS_BLOCK_GROUP_RAID0, 127 .mindev_error = 0, 128 }, 129 [BTRFS_RAID_SINGLE] = { 130 .sub_stripes = 1, 131 .dev_stripes = 1, 132 .devs_max = 1, 133 .devs_min = 1, 134 .tolerated_failures = 0, 135 .devs_increment = 1, 136 .ncopies = 1, 137 .nparity = 0, 138 .raid_name = "single", 139 .bg_flag = 0, 140 .mindev_error = 0, 141 }, 142 [BTRFS_RAID_RAID5] = { 143 .sub_stripes = 1, 144 .dev_stripes = 1, 145 .devs_max = 0, 146 .devs_min = 2, 147 .tolerated_failures = 1, 148 .devs_increment = 1, 149 .ncopies = 1, 150 .nparity = 1, 151 .raid_name = "raid5", 152 .bg_flag = BTRFS_BLOCK_GROUP_RAID5, 153 .mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, 154 }, 155 [BTRFS_RAID_RAID6] = { 156 .sub_stripes = 1, 157 .dev_stripes = 1, 158 .devs_max = 0, 159 .devs_min = 3, 160 .tolerated_failures = 2, 161 .devs_increment = 1, 162 .ncopies = 1, 163 .nparity = 2, 164 .raid_name = "raid6", 165 .bg_flag = BTRFS_BLOCK_GROUP_RAID6, 166 .mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, 167 }, 168 }; 169 170 /* 171 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which 172 * can be used as index to access btrfs_raid_array[]. 173 */ 174 enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) 175 { 176 const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); 177 178 if (!profile) 179 return BTRFS_RAID_SINGLE; 180 181 return BTRFS_BG_FLAG_TO_INDEX(profile); 182 } 183 184 const char *btrfs_bg_type_to_raid_name(u64 flags) 185 { 186 const int index = btrfs_bg_flags_to_raid_index(flags); 187 188 if (index >= BTRFS_NR_RAID_TYPES) 189 return NULL; 190 191 return btrfs_raid_array[index].raid_name; 192 } 193 194 int btrfs_nr_parity_stripes(u64 type) 195 { 196 enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); 197 198 return btrfs_raid_array[index].nparity; 199 } 200 201 /* 202 * Fill @buf with textual description of @bg_flags, no more than @size_buf 203 * bytes including terminating null byte. 204 */ 205 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf) 206 { 207 int i; 208 int ret; 209 char *bp = buf; 210 u64 flags = bg_flags; 211 u32 size_bp = size_buf; 212 213 if (!flags) { 214 strcpy(bp, "NONE"); 215 return; 216 } 217 218 #define DESCRIBE_FLAG(flag, desc) \ 219 do { \ 220 if (flags & (flag)) { \ 221 ret = snprintf(bp, size_bp, "%s|", (desc)); \ 222 if (ret < 0 || ret >= size_bp) \ 223 goto out_overflow; \ 224 size_bp -= ret; \ 225 bp += ret; \ 226 flags &= ~(flag); \ 227 } \ 228 } while (0) 229 230 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); 231 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); 232 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); 233 234 DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); 235 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) 236 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag, 237 btrfs_raid_array[i].raid_name); 238 #undef DESCRIBE_FLAG 239 240 if (flags) { 241 ret = snprintf(bp, size_bp, "0x%llx|", flags); 242 size_bp -= ret; 243 } 244 245 if (size_bp < size_buf) 246 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */ 247 248 /* 249 * The text is trimmed, it's up to the caller to provide sufficiently 250 * large buffer 251 */ 252 out_overflow:; 253 } 254 255 static int init_first_rw_device(struct btrfs_trans_handle *trans); 256 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); 257 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 258 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 259 enum btrfs_map_op op, u64 logical, u64 *length, 260 struct btrfs_io_context **bioc_ret, 261 struct btrfs_io_stripe *smap, 262 int *mirror_num_ret, int need_raid_map); 263 264 /* 265 * Device locking 266 * ============== 267 * 268 * There are several mutexes that protect manipulation of devices and low-level 269 * structures like chunks but not block groups, extents or files 270 * 271 * uuid_mutex (global lock) 272 * ------------------------ 273 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from 274 * the SCAN_DEV ioctl registration or from mount either implicitly (the first 275 * device) or requested by the device= mount option 276 * 277 * the mutex can be very coarse and can cover long-running operations 278 * 279 * protects: updates to fs_devices counters like missing devices, rw devices, 280 * seeding, structure cloning, opening/closing devices at mount/umount time 281 * 282 * global::fs_devs - add, remove, updates to the global list 283 * 284 * does not protect: manipulation of the fs_devices::devices list in general 285 * but in mount context it could be used to exclude list modifications by eg. 286 * scan ioctl 287 * 288 * btrfs_device::name - renames (write side), read is RCU 289 * 290 * fs_devices::device_list_mutex (per-fs, with RCU) 291 * ------------------------------------------------ 292 * protects updates to fs_devices::devices, ie. adding and deleting 293 * 294 * simple list traversal with read-only actions can be done with RCU protection 295 * 296 * may be used to exclude some operations from running concurrently without any 297 * modifications to the list (see write_all_supers) 298 * 299 * Is not required at mount and close times, because our device list is 300 * protected by the uuid_mutex at that point. 301 * 302 * balance_mutex 303 * ------------- 304 * protects balance structures (status, state) and context accessed from 305 * several places (internally, ioctl) 306 * 307 * chunk_mutex 308 * ----------- 309 * protects chunks, adding or removing during allocation, trim or when a new 310 * device is added/removed. Additionally it also protects post_commit_list of 311 * individual devices, since they can be added to the transaction's 312 * post_commit_list only with chunk_mutex held. 313 * 314 * cleaner_mutex 315 * ------------- 316 * a big lock that is held by the cleaner thread and prevents running subvolume 317 * cleaning together with relocation or delayed iputs 318 * 319 * 320 * Lock nesting 321 * ============ 322 * 323 * uuid_mutex 324 * device_list_mutex 325 * chunk_mutex 326 * balance_mutex 327 * 328 * 329 * Exclusive operations 330 * ==================== 331 * 332 * Maintains the exclusivity of the following operations that apply to the 333 * whole filesystem and cannot run in parallel. 334 * 335 * - Balance (*) 336 * - Device add 337 * - Device remove 338 * - Device replace (*) 339 * - Resize 340 * 341 * The device operations (as above) can be in one of the following states: 342 * 343 * - Running state 344 * - Paused state 345 * - Completed state 346 * 347 * Only device operations marked with (*) can go into the Paused state for the 348 * following reasons: 349 * 350 * - ioctl (only Balance can be Paused through ioctl) 351 * - filesystem remounted as read-only 352 * - filesystem unmounted and mounted as read-only 353 * - system power-cycle and filesystem mounted as read-only 354 * - filesystem or device errors leading to forced read-only 355 * 356 * The status of exclusive operation is set and cleared atomically. 357 * During the course of Paused state, fs_info::exclusive_operation remains set. 358 * A device operation in Paused or Running state can be canceled or resumed 359 * either by ioctl (Balance only) or when remounted as read-write. 360 * The exclusive status is cleared when the device operation is canceled or 361 * completed. 362 */ 363 364 DEFINE_MUTEX(uuid_mutex); 365 static LIST_HEAD(fs_uuids); 366 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) 367 { 368 return &fs_uuids; 369 } 370 371 /* 372 * alloc_fs_devices - allocate struct btrfs_fs_devices 373 * @fsid: if not NULL, copy the UUID to fs_devices::fsid 374 * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid 375 * 376 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). 377 * The returned struct is not linked onto any lists and can be destroyed with 378 * kfree() right away. 379 */ 380 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, 381 const u8 *metadata_fsid) 382 { 383 struct btrfs_fs_devices *fs_devs; 384 385 fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); 386 if (!fs_devs) 387 return ERR_PTR(-ENOMEM); 388 389 mutex_init(&fs_devs->device_list_mutex); 390 391 INIT_LIST_HEAD(&fs_devs->devices); 392 INIT_LIST_HEAD(&fs_devs->alloc_list); 393 INIT_LIST_HEAD(&fs_devs->fs_list); 394 INIT_LIST_HEAD(&fs_devs->seed_list); 395 if (fsid) 396 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); 397 398 if (metadata_fsid) 399 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE); 400 else if (fsid) 401 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); 402 403 return fs_devs; 404 } 405 406 void btrfs_free_device(struct btrfs_device *device) 407 { 408 WARN_ON(!list_empty(&device->post_commit_list)); 409 rcu_string_free(device->name); 410 extent_io_tree_release(&device->alloc_state); 411 btrfs_destroy_dev_zone_info(device); 412 kfree(device); 413 } 414 415 static void free_fs_devices(struct btrfs_fs_devices *fs_devices) 416 { 417 struct btrfs_device *device; 418 WARN_ON(fs_devices->opened); 419 while (!list_empty(&fs_devices->devices)) { 420 device = list_entry(fs_devices->devices.next, 421 struct btrfs_device, dev_list); 422 list_del(&device->dev_list); 423 btrfs_free_device(device); 424 } 425 kfree(fs_devices); 426 } 427 428 void __exit btrfs_cleanup_fs_uuids(void) 429 { 430 struct btrfs_fs_devices *fs_devices; 431 432 while (!list_empty(&fs_uuids)) { 433 fs_devices = list_entry(fs_uuids.next, 434 struct btrfs_fs_devices, fs_list); 435 list_del(&fs_devices->fs_list); 436 free_fs_devices(fs_devices); 437 } 438 } 439 440 static noinline struct btrfs_fs_devices *find_fsid( 441 const u8 *fsid, const u8 *metadata_fsid) 442 { 443 struct btrfs_fs_devices *fs_devices; 444 445 ASSERT(fsid); 446 447 /* Handle non-split brain cases */ 448 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 449 if (metadata_fsid) { 450 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0 451 && memcmp(metadata_fsid, fs_devices->metadata_uuid, 452 BTRFS_FSID_SIZE) == 0) 453 return fs_devices; 454 } else { 455 if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) 456 return fs_devices; 457 } 458 } 459 return NULL; 460 } 461 462 static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( 463 struct btrfs_super_block *disk_super) 464 { 465 466 struct btrfs_fs_devices *fs_devices; 467 468 /* 469 * Handle scanned device having completed its fsid change but 470 * belonging to a fs_devices that was created by first scanning 471 * a device which didn't have its fsid/metadata_uuid changed 472 * at all and the CHANGING_FSID_V2 flag set. 473 */ 474 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 475 if (fs_devices->fsid_change && 476 memcmp(disk_super->metadata_uuid, fs_devices->fsid, 477 BTRFS_FSID_SIZE) == 0 && 478 memcmp(fs_devices->fsid, fs_devices->metadata_uuid, 479 BTRFS_FSID_SIZE) == 0) { 480 return fs_devices; 481 } 482 } 483 /* 484 * Handle scanned device having completed its fsid change but 485 * belonging to a fs_devices that was created by a device that 486 * has an outdated pair of fsid/metadata_uuid and 487 * CHANGING_FSID_V2 flag set. 488 */ 489 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 490 if (fs_devices->fsid_change && 491 memcmp(fs_devices->metadata_uuid, 492 fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && 493 memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid, 494 BTRFS_FSID_SIZE) == 0) { 495 return fs_devices; 496 } 497 } 498 499 return find_fsid(disk_super->fsid, disk_super->metadata_uuid); 500 } 501 502 503 static int 504 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, 505 int flush, struct block_device **bdev, 506 struct btrfs_super_block **disk_super) 507 { 508 int ret; 509 510 *bdev = blkdev_get_by_path(device_path, flags, holder); 511 512 if (IS_ERR(*bdev)) { 513 ret = PTR_ERR(*bdev); 514 goto error; 515 } 516 517 if (flush) 518 sync_blockdev(*bdev); 519 ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); 520 if (ret) { 521 blkdev_put(*bdev, flags); 522 goto error; 523 } 524 invalidate_bdev(*bdev); 525 *disk_super = btrfs_read_dev_super(*bdev); 526 if (IS_ERR(*disk_super)) { 527 ret = PTR_ERR(*disk_super); 528 blkdev_put(*bdev, flags); 529 goto error; 530 } 531 532 return 0; 533 534 error: 535 *bdev = NULL; 536 return ret; 537 } 538 539 /* 540 * Search and remove all stale devices (which are not mounted). When both 541 * inputs are NULL, it will search and release all stale devices. 542 * 543 * @devt: Optional. When provided will it release all unmounted devices 544 * matching this devt only. 545 * @skip_device: Optional. Will skip this device when searching for the stale 546 * devices. 547 * 548 * Return: 0 for success or if @devt is 0. 549 * -EBUSY if @devt is a mounted device. 550 * -ENOENT if @devt does not match any device in the list. 551 */ 552 static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device) 553 { 554 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; 555 struct btrfs_device *device, *tmp_device; 556 int ret = 0; 557 558 lockdep_assert_held(&uuid_mutex); 559 560 if (devt) 561 ret = -ENOENT; 562 563 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { 564 565 mutex_lock(&fs_devices->device_list_mutex); 566 list_for_each_entry_safe(device, tmp_device, 567 &fs_devices->devices, dev_list) { 568 if (skip_device && skip_device == device) 569 continue; 570 if (devt && devt != device->devt) 571 continue; 572 if (fs_devices->opened) { 573 /* for an already deleted device return 0 */ 574 if (devt && ret != 0) 575 ret = -EBUSY; 576 break; 577 } 578 579 /* delete the stale device */ 580 fs_devices->num_devices--; 581 list_del(&device->dev_list); 582 btrfs_free_device(device); 583 584 ret = 0; 585 } 586 mutex_unlock(&fs_devices->device_list_mutex); 587 588 if (fs_devices->num_devices == 0) { 589 btrfs_sysfs_remove_fsid(fs_devices); 590 list_del(&fs_devices->fs_list); 591 free_fs_devices(fs_devices); 592 } 593 } 594 595 return ret; 596 } 597 598 /* 599 * This is only used on mount, and we are protected from competing things 600 * messing with our fs_devices by the uuid_mutex, thus we do not need the 601 * fs_devices->device_list_mutex here. 602 */ 603 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, 604 struct btrfs_device *device, fmode_t flags, 605 void *holder) 606 { 607 struct block_device *bdev; 608 struct btrfs_super_block *disk_super; 609 u64 devid; 610 int ret; 611 612 if (device->bdev) 613 return -EINVAL; 614 if (!device->name) 615 return -EINVAL; 616 617 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 618 &bdev, &disk_super); 619 if (ret) 620 return ret; 621 622 devid = btrfs_stack_device_id(&disk_super->dev_item); 623 if (devid != device->devid) 624 goto error_free_page; 625 626 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) 627 goto error_free_page; 628 629 device->generation = btrfs_super_generation(disk_super); 630 631 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 632 if (btrfs_super_incompat_flags(disk_super) & 633 BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { 634 pr_err( 635 "BTRFS: Invalid seeding and uuid-changed device detected\n"); 636 goto error_free_page; 637 } 638 639 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 640 fs_devices->seeding = true; 641 } else { 642 if (bdev_read_only(bdev)) 643 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 644 else 645 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 646 } 647 648 if (!bdev_nonrot(bdev)) 649 fs_devices->rotating = true; 650 651 if (bdev_max_discard_sectors(bdev)) 652 fs_devices->discardable = true; 653 654 device->bdev = bdev; 655 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 656 device->mode = flags; 657 658 fs_devices->open_devices++; 659 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 660 device->devid != BTRFS_DEV_REPLACE_DEVID) { 661 fs_devices->rw_devices++; 662 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); 663 } 664 btrfs_release_disk_super(disk_super); 665 666 return 0; 667 668 error_free_page: 669 btrfs_release_disk_super(disk_super); 670 blkdev_put(bdev, flags); 671 672 return -EINVAL; 673 } 674 675 /* 676 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices 677 * being created with a disk that has already completed its fsid change. Such 678 * disk can belong to an fs which has its FSID changed or to one which doesn't. 679 * Handle both cases here. 680 */ 681 static struct btrfs_fs_devices *find_fsid_inprogress( 682 struct btrfs_super_block *disk_super) 683 { 684 struct btrfs_fs_devices *fs_devices; 685 686 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 687 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, 688 BTRFS_FSID_SIZE) != 0 && 689 memcmp(fs_devices->metadata_uuid, disk_super->fsid, 690 BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { 691 return fs_devices; 692 } 693 } 694 695 return find_fsid(disk_super->fsid, NULL); 696 } 697 698 699 static struct btrfs_fs_devices *find_fsid_changed( 700 struct btrfs_super_block *disk_super) 701 { 702 struct btrfs_fs_devices *fs_devices; 703 704 /* 705 * Handles the case where scanned device is part of an fs that had 706 * multiple successful changes of FSID but currently device didn't 707 * observe it. Meaning our fsid will be different than theirs. We need 708 * to handle two subcases : 709 * 1 - The fs still continues to have different METADATA/FSID uuids. 710 * 2 - The fs is switched back to its original FSID (METADATA/FSID 711 * are equal). 712 */ 713 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 714 /* Changed UUIDs */ 715 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, 716 BTRFS_FSID_SIZE) != 0 && 717 memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, 718 BTRFS_FSID_SIZE) == 0 && 719 memcmp(fs_devices->fsid, disk_super->fsid, 720 BTRFS_FSID_SIZE) != 0) 721 return fs_devices; 722 723 /* Unchanged UUIDs */ 724 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, 725 BTRFS_FSID_SIZE) == 0 && 726 memcmp(fs_devices->fsid, disk_super->metadata_uuid, 727 BTRFS_FSID_SIZE) == 0) 728 return fs_devices; 729 } 730 731 return NULL; 732 } 733 734 static struct btrfs_fs_devices *find_fsid_reverted_metadata( 735 struct btrfs_super_block *disk_super) 736 { 737 struct btrfs_fs_devices *fs_devices; 738 739 /* 740 * Handle the case where the scanned device is part of an fs whose last 741 * metadata UUID change reverted it to the original FSID. At the same 742 * time * fs_devices was first created by another constitutent device 743 * which didn't fully observe the operation. This results in an 744 * btrfs_fs_devices created with metadata/fsid different AND 745 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the 746 * fs_devices equal to the FSID of the disk. 747 */ 748 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 749 if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, 750 BTRFS_FSID_SIZE) != 0 && 751 memcmp(fs_devices->metadata_uuid, disk_super->fsid, 752 BTRFS_FSID_SIZE) == 0 && 753 fs_devices->fsid_change) 754 return fs_devices; 755 } 756 757 return NULL; 758 } 759 /* 760 * Add new device to list of registered devices 761 * 762 * Returns: 763 * device pointer which was just added or updated when successful 764 * error pointer when failed 765 */ 766 static noinline struct btrfs_device *device_list_add(const char *path, 767 struct btrfs_super_block *disk_super, 768 bool *new_device_added) 769 { 770 struct btrfs_device *device; 771 struct btrfs_fs_devices *fs_devices = NULL; 772 struct rcu_string *name; 773 u64 found_transid = btrfs_super_generation(disk_super); 774 u64 devid = btrfs_stack_device_id(&disk_super->dev_item); 775 dev_t path_devt; 776 int error; 777 bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & 778 BTRFS_FEATURE_INCOMPAT_METADATA_UUID); 779 bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & 780 BTRFS_SUPER_FLAG_CHANGING_FSID_V2); 781 782 error = lookup_bdev(path, &path_devt); 783 if (error) 784 return ERR_PTR(error); 785 786 if (fsid_change_in_progress) { 787 if (!has_metadata_uuid) 788 fs_devices = find_fsid_inprogress(disk_super); 789 else 790 fs_devices = find_fsid_changed(disk_super); 791 } else if (has_metadata_uuid) { 792 fs_devices = find_fsid_with_metadata_uuid(disk_super); 793 } else { 794 fs_devices = find_fsid_reverted_metadata(disk_super); 795 if (!fs_devices) 796 fs_devices = find_fsid(disk_super->fsid, NULL); 797 } 798 799 800 if (!fs_devices) { 801 if (has_metadata_uuid) 802 fs_devices = alloc_fs_devices(disk_super->fsid, 803 disk_super->metadata_uuid); 804 else 805 fs_devices = alloc_fs_devices(disk_super->fsid, NULL); 806 807 if (IS_ERR(fs_devices)) 808 return ERR_CAST(fs_devices); 809 810 fs_devices->fsid_change = fsid_change_in_progress; 811 812 mutex_lock(&fs_devices->device_list_mutex); 813 list_add(&fs_devices->fs_list, &fs_uuids); 814 815 device = NULL; 816 } else { 817 struct btrfs_dev_lookup_args args = { 818 .devid = devid, 819 .uuid = disk_super->dev_item.uuid, 820 }; 821 822 mutex_lock(&fs_devices->device_list_mutex); 823 device = btrfs_find_device(fs_devices, &args); 824 825 /* 826 * If this disk has been pulled into an fs devices created by 827 * a device which had the CHANGING_FSID_V2 flag then replace the 828 * metadata_uuid/fsid values of the fs_devices. 829 */ 830 if (fs_devices->fsid_change && 831 found_transid > fs_devices->latest_generation) { 832 memcpy(fs_devices->fsid, disk_super->fsid, 833 BTRFS_FSID_SIZE); 834 835 if (has_metadata_uuid) 836 memcpy(fs_devices->metadata_uuid, 837 disk_super->metadata_uuid, 838 BTRFS_FSID_SIZE); 839 else 840 memcpy(fs_devices->metadata_uuid, 841 disk_super->fsid, BTRFS_FSID_SIZE); 842 843 fs_devices->fsid_change = false; 844 } 845 } 846 847 if (!device) { 848 unsigned int nofs_flag; 849 850 if (fs_devices->opened) { 851 mutex_unlock(&fs_devices->device_list_mutex); 852 return ERR_PTR(-EBUSY); 853 } 854 855 nofs_flag = memalloc_nofs_save(); 856 device = btrfs_alloc_device(NULL, &devid, 857 disk_super->dev_item.uuid, path); 858 memalloc_nofs_restore(nofs_flag); 859 if (IS_ERR(device)) { 860 mutex_unlock(&fs_devices->device_list_mutex); 861 /* we can safely leave the fs_devices entry around */ 862 return device; 863 } 864 865 device->devt = path_devt; 866 867 list_add_rcu(&device->dev_list, &fs_devices->devices); 868 fs_devices->num_devices++; 869 870 device->fs_devices = fs_devices; 871 *new_device_added = true; 872 873 if (disk_super->label[0]) 874 pr_info( 875 "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", 876 disk_super->label, devid, found_transid, path, 877 current->comm, task_pid_nr(current)); 878 else 879 pr_info( 880 "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", 881 disk_super->fsid, devid, found_transid, path, 882 current->comm, task_pid_nr(current)); 883 884 } else if (!device->name || strcmp(device->name->str, path)) { 885 /* 886 * When FS is already mounted. 887 * 1. If you are here and if the device->name is NULL that 888 * means this device was missing at time of FS mount. 889 * 2. If you are here and if the device->name is different 890 * from 'path' that means either 891 * a. The same device disappeared and reappeared with 892 * different name. or 893 * b. The missing-disk-which-was-replaced, has 894 * reappeared now. 895 * 896 * We must allow 1 and 2a above. But 2b would be a spurious 897 * and unintentional. 898 * 899 * Further in case of 1 and 2a above, the disk at 'path' 900 * would have missed some transaction when it was away and 901 * in case of 2a the stale bdev has to be updated as well. 902 * 2b must not be allowed at all time. 903 */ 904 905 /* 906 * For now, we do allow update to btrfs_fs_device through the 907 * btrfs dev scan cli after FS has been mounted. We're still 908 * tracking a problem where systems fail mount by subvolume id 909 * when we reject replacement on a mounted FS. 910 */ 911 if (!fs_devices->opened && found_transid < device->generation) { 912 /* 913 * That is if the FS is _not_ mounted and if you 914 * are here, that means there is more than one 915 * disk with same uuid and devid.We keep the one 916 * with larger generation number or the last-in if 917 * generation are equal. 918 */ 919 mutex_unlock(&fs_devices->device_list_mutex); 920 return ERR_PTR(-EEXIST); 921 } 922 923 /* 924 * We are going to replace the device path for a given devid, 925 * make sure it's the same device if the device is mounted 926 * 927 * NOTE: the device->fs_info may not be reliable here so pass 928 * in a NULL to message helpers instead. This avoids a possible 929 * use-after-free when the fs_info and fs_info->sb are already 930 * torn down. 931 */ 932 if (device->bdev) { 933 if (device->devt != path_devt) { 934 mutex_unlock(&fs_devices->device_list_mutex); 935 btrfs_warn_in_rcu(NULL, 936 "duplicate device %s devid %llu generation %llu scanned by %s (%d)", 937 path, devid, found_transid, 938 current->comm, 939 task_pid_nr(current)); 940 return ERR_PTR(-EEXIST); 941 } 942 btrfs_info_in_rcu(NULL, 943 "devid %llu device path %s changed to %s scanned by %s (%d)", 944 devid, btrfs_dev_name(device), 945 path, current->comm, 946 task_pid_nr(current)); 947 } 948 949 name = rcu_string_strdup(path, GFP_NOFS); 950 if (!name) { 951 mutex_unlock(&fs_devices->device_list_mutex); 952 return ERR_PTR(-ENOMEM); 953 } 954 rcu_string_free(device->name); 955 rcu_assign_pointer(device->name, name); 956 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 957 fs_devices->missing_devices--; 958 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 959 } 960 device->devt = path_devt; 961 } 962 963 /* 964 * Unmount does not free the btrfs_device struct but would zero 965 * generation along with most of the other members. So just update 966 * it back. We need it to pick the disk with largest generation 967 * (as above). 968 */ 969 if (!fs_devices->opened) { 970 device->generation = found_transid; 971 fs_devices->latest_generation = max_t(u64, found_transid, 972 fs_devices->latest_generation); 973 } 974 975 fs_devices->total_devices = btrfs_super_num_devices(disk_super); 976 977 mutex_unlock(&fs_devices->device_list_mutex); 978 return device; 979 } 980 981 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 982 { 983 struct btrfs_fs_devices *fs_devices; 984 struct btrfs_device *device; 985 struct btrfs_device *orig_dev; 986 int ret = 0; 987 988 lockdep_assert_held(&uuid_mutex); 989 990 fs_devices = alloc_fs_devices(orig->fsid, NULL); 991 if (IS_ERR(fs_devices)) 992 return fs_devices; 993 994 fs_devices->total_devices = orig->total_devices; 995 996 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 997 const char *dev_path = NULL; 998 999 /* 1000 * This is ok to do without RCU read locked because we hold the 1001 * uuid mutex so nothing we touch in here is going to disappear. 1002 */ 1003 if (orig_dev->name) 1004 dev_path = orig_dev->name->str; 1005 1006 device = btrfs_alloc_device(NULL, &orig_dev->devid, 1007 orig_dev->uuid, dev_path); 1008 if (IS_ERR(device)) { 1009 ret = PTR_ERR(device); 1010 goto error; 1011 } 1012 1013 if (orig_dev->zone_info) { 1014 struct btrfs_zoned_device_info *zone_info; 1015 1016 zone_info = btrfs_clone_dev_zone_info(orig_dev); 1017 if (!zone_info) { 1018 btrfs_free_device(device); 1019 ret = -ENOMEM; 1020 goto error; 1021 } 1022 device->zone_info = zone_info; 1023 } 1024 1025 list_add(&device->dev_list, &fs_devices->devices); 1026 device->fs_devices = fs_devices; 1027 fs_devices->num_devices++; 1028 } 1029 return fs_devices; 1030 error: 1031 free_fs_devices(fs_devices); 1032 return ERR_PTR(ret); 1033 } 1034 1035 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, 1036 struct btrfs_device **latest_dev) 1037 { 1038 struct btrfs_device *device, *next; 1039 1040 /* This is the initialized path, it is safe to release the devices. */ 1041 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 1042 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { 1043 if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, 1044 &device->dev_state) && 1045 !test_bit(BTRFS_DEV_STATE_MISSING, 1046 &device->dev_state) && 1047 (!*latest_dev || 1048 device->generation > (*latest_dev)->generation)) { 1049 *latest_dev = device; 1050 } 1051 continue; 1052 } 1053 1054 /* 1055 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID, 1056 * in btrfs_init_dev_replace() so just continue. 1057 */ 1058 if (device->devid == BTRFS_DEV_REPLACE_DEVID) 1059 continue; 1060 1061 if (device->bdev) { 1062 blkdev_put(device->bdev, device->mode); 1063 device->bdev = NULL; 1064 fs_devices->open_devices--; 1065 } 1066 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1067 list_del_init(&device->dev_alloc_list); 1068 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 1069 fs_devices->rw_devices--; 1070 } 1071 list_del_init(&device->dev_list); 1072 fs_devices->num_devices--; 1073 btrfs_free_device(device); 1074 } 1075 1076 } 1077 1078 /* 1079 * After we have read the system tree and know devids belonging to this 1080 * filesystem, remove the device which does not belong there. 1081 */ 1082 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) 1083 { 1084 struct btrfs_device *latest_dev = NULL; 1085 struct btrfs_fs_devices *seed_dev; 1086 1087 mutex_lock(&uuid_mutex); 1088 __btrfs_free_extra_devids(fs_devices, &latest_dev); 1089 1090 list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) 1091 __btrfs_free_extra_devids(seed_dev, &latest_dev); 1092 1093 fs_devices->latest_dev = latest_dev; 1094 1095 mutex_unlock(&uuid_mutex); 1096 } 1097 1098 static void btrfs_close_bdev(struct btrfs_device *device) 1099 { 1100 if (!device->bdev) 1101 return; 1102 1103 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 1104 sync_blockdev(device->bdev); 1105 invalidate_bdev(device->bdev); 1106 } 1107 1108 blkdev_put(device->bdev, device->mode); 1109 } 1110 1111 static void btrfs_close_one_device(struct btrfs_device *device) 1112 { 1113 struct btrfs_fs_devices *fs_devices = device->fs_devices; 1114 1115 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 1116 device->devid != BTRFS_DEV_REPLACE_DEVID) { 1117 list_del_init(&device->dev_alloc_list); 1118 fs_devices->rw_devices--; 1119 } 1120 1121 if (device->devid == BTRFS_DEV_REPLACE_DEVID) 1122 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 1123 1124 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 1125 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 1126 fs_devices->missing_devices--; 1127 } 1128 1129 btrfs_close_bdev(device); 1130 if (device->bdev) { 1131 fs_devices->open_devices--; 1132 device->bdev = NULL; 1133 } 1134 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 1135 btrfs_destroy_dev_zone_info(device); 1136 1137 device->fs_info = NULL; 1138 atomic_set(&device->dev_stats_ccnt, 0); 1139 extent_io_tree_release(&device->alloc_state); 1140 1141 /* 1142 * Reset the flush error record. We might have a transient flush error 1143 * in this mount, and if so we aborted the current transaction and set 1144 * the fs to an error state, guaranteeing no super blocks can be further 1145 * committed. However that error might be transient and if we unmount the 1146 * filesystem and mount it again, we should allow the mount to succeed 1147 * (btrfs_check_rw_degradable() should not fail) - if after mounting the 1148 * filesystem again we still get flush errors, then we will again abort 1149 * any transaction and set the error state, guaranteeing no commits of 1150 * unsafe super blocks. 1151 */ 1152 device->last_flush_error = 0; 1153 1154 /* Verify the device is back in a pristine state */ 1155 ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); 1156 ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); 1157 ASSERT(list_empty(&device->dev_alloc_list)); 1158 ASSERT(list_empty(&device->post_commit_list)); 1159 } 1160 1161 static void close_fs_devices(struct btrfs_fs_devices *fs_devices) 1162 { 1163 struct btrfs_device *device, *tmp; 1164 1165 lockdep_assert_held(&uuid_mutex); 1166 1167 if (--fs_devices->opened > 0) 1168 return; 1169 1170 list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) 1171 btrfs_close_one_device(device); 1172 1173 WARN_ON(fs_devices->open_devices); 1174 WARN_ON(fs_devices->rw_devices); 1175 fs_devices->opened = 0; 1176 fs_devices->seeding = false; 1177 fs_devices->fs_info = NULL; 1178 } 1179 1180 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 1181 { 1182 LIST_HEAD(list); 1183 struct btrfs_fs_devices *tmp; 1184 1185 mutex_lock(&uuid_mutex); 1186 close_fs_devices(fs_devices); 1187 if (!fs_devices->opened) 1188 list_splice_init(&fs_devices->seed_list, &list); 1189 1190 list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { 1191 close_fs_devices(fs_devices); 1192 list_del(&fs_devices->seed_list); 1193 free_fs_devices(fs_devices); 1194 } 1195 mutex_unlock(&uuid_mutex); 1196 } 1197 1198 static int open_fs_devices(struct btrfs_fs_devices *fs_devices, 1199 fmode_t flags, void *holder) 1200 { 1201 struct btrfs_device *device; 1202 struct btrfs_device *latest_dev = NULL; 1203 struct btrfs_device *tmp_device; 1204 1205 flags |= FMODE_EXCL; 1206 1207 list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, 1208 dev_list) { 1209 int ret; 1210 1211 ret = btrfs_open_one_device(fs_devices, device, flags, holder); 1212 if (ret == 0 && 1213 (!latest_dev || device->generation > latest_dev->generation)) { 1214 latest_dev = device; 1215 } else if (ret == -ENODATA) { 1216 fs_devices->num_devices--; 1217 list_del(&device->dev_list); 1218 btrfs_free_device(device); 1219 } 1220 } 1221 if (fs_devices->open_devices == 0) 1222 return -EINVAL; 1223 1224 fs_devices->opened = 1; 1225 fs_devices->latest_dev = latest_dev; 1226 fs_devices->total_rw_bytes = 0; 1227 fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; 1228 fs_devices->read_policy = BTRFS_READ_POLICY_PID; 1229 1230 return 0; 1231 } 1232 1233 static int devid_cmp(void *priv, const struct list_head *a, 1234 const struct list_head *b) 1235 { 1236 const struct btrfs_device *dev1, *dev2; 1237 1238 dev1 = list_entry(a, struct btrfs_device, dev_list); 1239 dev2 = list_entry(b, struct btrfs_device, dev_list); 1240 1241 if (dev1->devid < dev2->devid) 1242 return -1; 1243 else if (dev1->devid > dev2->devid) 1244 return 1; 1245 return 0; 1246 } 1247 1248 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 1249 fmode_t flags, void *holder) 1250 { 1251 int ret; 1252 1253 lockdep_assert_held(&uuid_mutex); 1254 /* 1255 * The device_list_mutex cannot be taken here in case opening the 1256 * underlying device takes further locks like open_mutex. 1257 * 1258 * We also don't need the lock here as this is called during mount and 1259 * exclusion is provided by uuid_mutex 1260 */ 1261 1262 if (fs_devices->opened) { 1263 fs_devices->opened++; 1264 ret = 0; 1265 } else { 1266 list_sort(NULL, &fs_devices->devices, devid_cmp); 1267 ret = open_fs_devices(fs_devices, flags, holder); 1268 } 1269 1270 return ret; 1271 } 1272 1273 void btrfs_release_disk_super(struct btrfs_super_block *super) 1274 { 1275 struct page *page = virt_to_page(super); 1276 1277 put_page(page); 1278 } 1279 1280 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, 1281 u64 bytenr, u64 bytenr_orig) 1282 { 1283 struct btrfs_super_block *disk_super; 1284 struct page *page; 1285 void *p; 1286 pgoff_t index; 1287 1288 /* make sure our super fits in the device */ 1289 if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) 1290 return ERR_PTR(-EINVAL); 1291 1292 /* make sure our super fits in the page */ 1293 if (sizeof(*disk_super) > PAGE_SIZE) 1294 return ERR_PTR(-EINVAL); 1295 1296 /* make sure our super doesn't straddle pages on disk */ 1297 index = bytenr >> PAGE_SHIFT; 1298 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) 1299 return ERR_PTR(-EINVAL); 1300 1301 /* pull in the page with our super */ 1302 page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL); 1303 1304 if (IS_ERR(page)) 1305 return ERR_CAST(page); 1306 1307 p = page_address(page); 1308 1309 /* align our pointer to the offset of the super block */ 1310 disk_super = p + offset_in_page(bytenr); 1311 1312 if (btrfs_super_bytenr(disk_super) != bytenr_orig || 1313 btrfs_super_magic(disk_super) != BTRFS_MAGIC) { 1314 btrfs_release_disk_super(p); 1315 return ERR_PTR(-EINVAL); 1316 } 1317 1318 if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) 1319 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; 1320 1321 return disk_super; 1322 } 1323 1324 int btrfs_forget_devices(dev_t devt) 1325 { 1326 int ret; 1327 1328 mutex_lock(&uuid_mutex); 1329 ret = btrfs_free_stale_devices(devt, NULL); 1330 mutex_unlock(&uuid_mutex); 1331 1332 return ret; 1333 } 1334 1335 /* 1336 * Look for a btrfs signature on a device. This may be called out of the mount path 1337 * and we are not allowed to call set_blocksize during the scan. The superblock 1338 * is read via pagecache 1339 */ 1340 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, 1341 void *holder) 1342 { 1343 struct btrfs_super_block *disk_super; 1344 bool new_device_added = false; 1345 struct btrfs_device *device = NULL; 1346 struct block_device *bdev; 1347 u64 bytenr, bytenr_orig; 1348 int ret; 1349 1350 lockdep_assert_held(&uuid_mutex); 1351 1352 /* 1353 * we would like to check all the supers, but that would make 1354 * a btrfs mount succeed after a mkfs from a different FS. 1355 * So, we need to add a special mount option to scan for 1356 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 1357 */ 1358 flags |= FMODE_EXCL; 1359 1360 bdev = blkdev_get_by_path(path, flags, holder); 1361 if (IS_ERR(bdev)) 1362 return ERR_CAST(bdev); 1363 1364 bytenr_orig = btrfs_sb_offset(0); 1365 ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); 1366 if (ret) { 1367 device = ERR_PTR(ret); 1368 goto error_bdev_put; 1369 } 1370 1371 disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); 1372 if (IS_ERR(disk_super)) { 1373 device = ERR_CAST(disk_super); 1374 goto error_bdev_put; 1375 } 1376 1377 device = device_list_add(path, disk_super, &new_device_added); 1378 if (!IS_ERR(device) && new_device_added) 1379 btrfs_free_stale_devices(device->devt, device); 1380 1381 btrfs_release_disk_super(disk_super); 1382 1383 error_bdev_put: 1384 blkdev_put(bdev, flags); 1385 1386 return device; 1387 } 1388 1389 /* 1390 * Try to find a chunk that intersects [start, start + len] range and when one 1391 * such is found, record the end of it in *start 1392 */ 1393 static bool contains_pending_extent(struct btrfs_device *device, u64 *start, 1394 u64 len) 1395 { 1396 u64 physical_start, physical_end; 1397 1398 lockdep_assert_held(&device->fs_info->chunk_mutex); 1399 1400 if (!find_first_extent_bit(&device->alloc_state, *start, 1401 &physical_start, &physical_end, 1402 CHUNK_ALLOCATED, NULL)) { 1403 1404 if (in_range(physical_start, *start, len) || 1405 in_range(*start, physical_start, 1406 physical_end - physical_start)) { 1407 *start = physical_end + 1; 1408 return true; 1409 } 1410 } 1411 return false; 1412 } 1413 1414 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) 1415 { 1416 switch (device->fs_devices->chunk_alloc_policy) { 1417 case BTRFS_CHUNK_ALLOC_REGULAR: 1418 return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); 1419 case BTRFS_CHUNK_ALLOC_ZONED: 1420 /* 1421 * We don't care about the starting region like regular 1422 * allocator, because we anyway use/reserve the first two zones 1423 * for superblock logging. 1424 */ 1425 return ALIGN(start, device->zone_info->zone_size); 1426 default: 1427 BUG(); 1428 } 1429 } 1430 1431 static bool dev_extent_hole_check_zoned(struct btrfs_device *device, 1432 u64 *hole_start, u64 *hole_size, 1433 u64 num_bytes) 1434 { 1435 u64 zone_size = device->zone_info->zone_size; 1436 u64 pos; 1437 int ret; 1438 bool changed = false; 1439 1440 ASSERT(IS_ALIGNED(*hole_start, zone_size)); 1441 1442 while (*hole_size > 0) { 1443 pos = btrfs_find_allocatable_zones(device, *hole_start, 1444 *hole_start + *hole_size, 1445 num_bytes); 1446 if (pos != *hole_start) { 1447 *hole_size = *hole_start + *hole_size - pos; 1448 *hole_start = pos; 1449 changed = true; 1450 if (*hole_size < num_bytes) 1451 break; 1452 } 1453 1454 ret = btrfs_ensure_empty_zones(device, pos, num_bytes); 1455 1456 /* Range is ensured to be empty */ 1457 if (!ret) 1458 return changed; 1459 1460 /* Given hole range was invalid (outside of device) */ 1461 if (ret == -ERANGE) { 1462 *hole_start += *hole_size; 1463 *hole_size = 0; 1464 return true; 1465 } 1466 1467 *hole_start += zone_size; 1468 *hole_size -= zone_size; 1469 changed = true; 1470 } 1471 1472 return changed; 1473 } 1474 1475 /* 1476 * Check if specified hole is suitable for allocation. 1477 * 1478 * @device: the device which we have the hole 1479 * @hole_start: starting position of the hole 1480 * @hole_size: the size of the hole 1481 * @num_bytes: the size of the free space that we need 1482 * 1483 * This function may modify @hole_start and @hole_size to reflect the suitable 1484 * position for allocation. Returns 1 if hole position is updated, 0 otherwise. 1485 */ 1486 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, 1487 u64 *hole_size, u64 num_bytes) 1488 { 1489 bool changed = false; 1490 u64 hole_end = *hole_start + *hole_size; 1491 1492 for (;;) { 1493 /* 1494 * Check before we set max_hole_start, otherwise we could end up 1495 * sending back this offset anyway. 1496 */ 1497 if (contains_pending_extent(device, hole_start, *hole_size)) { 1498 if (hole_end >= *hole_start) 1499 *hole_size = hole_end - *hole_start; 1500 else 1501 *hole_size = 0; 1502 changed = true; 1503 } 1504 1505 switch (device->fs_devices->chunk_alloc_policy) { 1506 case BTRFS_CHUNK_ALLOC_REGULAR: 1507 /* No extra check */ 1508 break; 1509 case BTRFS_CHUNK_ALLOC_ZONED: 1510 if (dev_extent_hole_check_zoned(device, hole_start, 1511 hole_size, num_bytes)) { 1512 changed = true; 1513 /* 1514 * The changed hole can contain pending extent. 1515 * Loop again to check that. 1516 */ 1517 continue; 1518 } 1519 break; 1520 default: 1521 BUG(); 1522 } 1523 1524 break; 1525 } 1526 1527 return changed; 1528 } 1529 1530 /* 1531 * Find free space in the specified device. 1532 * 1533 * @device: the device which we search the free space in 1534 * @num_bytes: the size of the free space that we need 1535 * @search_start: the position from which to begin the search 1536 * @start: store the start of the free space. 1537 * @len: the size of the free space. that we find, or the size 1538 * of the max free space if we don't find suitable free space 1539 * 1540 * This does a pretty simple search, the expectation is that it is called very 1541 * infrequently and that a given device has a small number of extents. 1542 * 1543 * @start is used to store the start of the free space if we find. But if we 1544 * don't find suitable free space, it will be used to store the start position 1545 * of the max free space. 1546 * 1547 * @len is used to store the size of the free space that we find. 1548 * But if we don't find suitable free space, it is used to store the size of 1549 * the max free space. 1550 * 1551 * NOTE: This function will search *commit* root of device tree, and does extra 1552 * check to ensure dev extents are not double allocated. 1553 * This makes the function safe to allocate dev extents but may not report 1554 * correct usable device space, as device extent freed in current transaction 1555 * is not reported as available. 1556 */ 1557 static int find_free_dev_extent_start(struct btrfs_device *device, 1558 u64 num_bytes, u64 search_start, u64 *start, 1559 u64 *len) 1560 { 1561 struct btrfs_fs_info *fs_info = device->fs_info; 1562 struct btrfs_root *root = fs_info->dev_root; 1563 struct btrfs_key key; 1564 struct btrfs_dev_extent *dev_extent; 1565 struct btrfs_path *path; 1566 u64 hole_size; 1567 u64 max_hole_start; 1568 u64 max_hole_size; 1569 u64 extent_end; 1570 u64 search_end = device->total_bytes; 1571 int ret; 1572 int slot; 1573 struct extent_buffer *l; 1574 1575 search_start = dev_extent_search_start(device, search_start); 1576 1577 WARN_ON(device->zone_info && 1578 !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); 1579 1580 path = btrfs_alloc_path(); 1581 if (!path) 1582 return -ENOMEM; 1583 1584 max_hole_start = search_start; 1585 max_hole_size = 0; 1586 1587 again: 1588 if (search_start >= search_end || 1589 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1590 ret = -ENOSPC; 1591 goto out; 1592 } 1593 1594 path->reada = READA_FORWARD; 1595 path->search_commit_root = 1; 1596 path->skip_locking = 1; 1597 1598 key.objectid = device->devid; 1599 key.offset = search_start; 1600 key.type = BTRFS_DEV_EXTENT_KEY; 1601 1602 ret = btrfs_search_backwards(root, &key, path); 1603 if (ret < 0) 1604 goto out; 1605 1606 while (1) { 1607 l = path->nodes[0]; 1608 slot = path->slots[0]; 1609 if (slot >= btrfs_header_nritems(l)) { 1610 ret = btrfs_next_leaf(root, path); 1611 if (ret == 0) 1612 continue; 1613 if (ret < 0) 1614 goto out; 1615 1616 break; 1617 } 1618 btrfs_item_key_to_cpu(l, &key, slot); 1619 1620 if (key.objectid < device->devid) 1621 goto next; 1622 1623 if (key.objectid > device->devid) 1624 break; 1625 1626 if (key.type != BTRFS_DEV_EXTENT_KEY) 1627 goto next; 1628 1629 if (key.offset > search_start) { 1630 hole_size = key.offset - search_start; 1631 dev_extent_hole_check(device, &search_start, &hole_size, 1632 num_bytes); 1633 1634 if (hole_size > max_hole_size) { 1635 max_hole_start = search_start; 1636 max_hole_size = hole_size; 1637 } 1638 1639 /* 1640 * If this free space is greater than which we need, 1641 * it must be the max free space that we have found 1642 * until now, so max_hole_start must point to the start 1643 * of this free space and the length of this free space 1644 * is stored in max_hole_size. Thus, we return 1645 * max_hole_start and max_hole_size and go back to the 1646 * caller. 1647 */ 1648 if (hole_size >= num_bytes) { 1649 ret = 0; 1650 goto out; 1651 } 1652 } 1653 1654 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1655 extent_end = key.offset + btrfs_dev_extent_length(l, 1656 dev_extent); 1657 if (extent_end > search_start) 1658 search_start = extent_end; 1659 next: 1660 path->slots[0]++; 1661 cond_resched(); 1662 } 1663 1664 /* 1665 * At this point, search_start should be the end of 1666 * allocated dev extents, and when shrinking the device, 1667 * search_end may be smaller than search_start. 1668 */ 1669 if (search_end > search_start) { 1670 hole_size = search_end - search_start; 1671 if (dev_extent_hole_check(device, &search_start, &hole_size, 1672 num_bytes)) { 1673 btrfs_release_path(path); 1674 goto again; 1675 } 1676 1677 if (hole_size > max_hole_size) { 1678 max_hole_start = search_start; 1679 max_hole_size = hole_size; 1680 } 1681 } 1682 1683 /* See above. */ 1684 if (max_hole_size < num_bytes) 1685 ret = -ENOSPC; 1686 else 1687 ret = 0; 1688 1689 out: 1690 btrfs_free_path(path); 1691 *start = max_hole_start; 1692 if (len) 1693 *len = max_hole_size; 1694 return ret; 1695 } 1696 1697 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1698 u64 *start, u64 *len) 1699 { 1700 /* FIXME use last free of some kind */ 1701 return find_free_dev_extent_start(device, num_bytes, 0, start, len); 1702 } 1703 1704 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1705 struct btrfs_device *device, 1706 u64 start, u64 *dev_extent_len) 1707 { 1708 struct btrfs_fs_info *fs_info = device->fs_info; 1709 struct btrfs_root *root = fs_info->dev_root; 1710 int ret; 1711 struct btrfs_path *path; 1712 struct btrfs_key key; 1713 struct btrfs_key found_key; 1714 struct extent_buffer *leaf = NULL; 1715 struct btrfs_dev_extent *extent = NULL; 1716 1717 path = btrfs_alloc_path(); 1718 if (!path) 1719 return -ENOMEM; 1720 1721 key.objectid = device->devid; 1722 key.offset = start; 1723 key.type = BTRFS_DEV_EXTENT_KEY; 1724 again: 1725 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1726 if (ret > 0) { 1727 ret = btrfs_previous_item(root, path, key.objectid, 1728 BTRFS_DEV_EXTENT_KEY); 1729 if (ret) 1730 goto out; 1731 leaf = path->nodes[0]; 1732 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1733 extent = btrfs_item_ptr(leaf, path->slots[0], 1734 struct btrfs_dev_extent); 1735 BUG_ON(found_key.offset > start || found_key.offset + 1736 btrfs_dev_extent_length(leaf, extent) < start); 1737 key = found_key; 1738 btrfs_release_path(path); 1739 goto again; 1740 } else if (ret == 0) { 1741 leaf = path->nodes[0]; 1742 extent = btrfs_item_ptr(leaf, path->slots[0], 1743 struct btrfs_dev_extent); 1744 } else { 1745 goto out; 1746 } 1747 1748 *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 1749 1750 ret = btrfs_del_item(trans, root, path); 1751 if (ret == 0) 1752 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 1753 out: 1754 btrfs_free_path(path); 1755 return ret; 1756 } 1757 1758 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1759 { 1760 struct extent_map_tree *em_tree; 1761 struct extent_map *em; 1762 struct rb_node *n; 1763 u64 ret = 0; 1764 1765 em_tree = &fs_info->mapping_tree; 1766 read_lock(&em_tree->lock); 1767 n = rb_last(&em_tree->map.rb_root); 1768 if (n) { 1769 em = rb_entry(n, struct extent_map, rb_node); 1770 ret = em->start + em->len; 1771 } 1772 read_unlock(&em_tree->lock); 1773 1774 return ret; 1775 } 1776 1777 static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 1778 u64 *devid_ret) 1779 { 1780 int ret; 1781 struct btrfs_key key; 1782 struct btrfs_key found_key; 1783 struct btrfs_path *path; 1784 1785 path = btrfs_alloc_path(); 1786 if (!path) 1787 return -ENOMEM; 1788 1789 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1790 key.type = BTRFS_DEV_ITEM_KEY; 1791 key.offset = (u64)-1; 1792 1793 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1794 if (ret < 0) 1795 goto error; 1796 1797 if (ret == 0) { 1798 /* Corruption */ 1799 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched"); 1800 ret = -EUCLEAN; 1801 goto error; 1802 } 1803 1804 ret = btrfs_previous_item(fs_info->chunk_root, path, 1805 BTRFS_DEV_ITEMS_OBJECTID, 1806 BTRFS_DEV_ITEM_KEY); 1807 if (ret) { 1808 *devid_ret = 1; 1809 } else { 1810 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1811 path->slots[0]); 1812 *devid_ret = found_key.offset + 1; 1813 } 1814 ret = 0; 1815 error: 1816 btrfs_free_path(path); 1817 return ret; 1818 } 1819 1820 /* 1821 * the device information is stored in the chunk root 1822 * the btrfs_device struct should be fully filled in 1823 */ 1824 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 1825 struct btrfs_device *device) 1826 { 1827 int ret; 1828 struct btrfs_path *path; 1829 struct btrfs_dev_item *dev_item; 1830 struct extent_buffer *leaf; 1831 struct btrfs_key key; 1832 unsigned long ptr; 1833 1834 path = btrfs_alloc_path(); 1835 if (!path) 1836 return -ENOMEM; 1837 1838 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1839 key.type = BTRFS_DEV_ITEM_KEY; 1840 key.offset = device->devid; 1841 1842 btrfs_reserve_chunk_metadata(trans, true); 1843 ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, 1844 &key, sizeof(*dev_item)); 1845 btrfs_trans_release_chunk_metadata(trans); 1846 if (ret) 1847 goto out; 1848 1849 leaf = path->nodes[0]; 1850 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1851 1852 btrfs_set_device_id(leaf, dev_item, device->devid); 1853 btrfs_set_device_generation(leaf, dev_item, 0); 1854 btrfs_set_device_type(leaf, dev_item, device->type); 1855 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1856 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1857 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1858 btrfs_set_device_total_bytes(leaf, dev_item, 1859 btrfs_device_get_disk_total_bytes(device)); 1860 btrfs_set_device_bytes_used(leaf, dev_item, 1861 btrfs_device_get_bytes_used(device)); 1862 btrfs_set_device_group(leaf, dev_item, 0); 1863 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1864 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1865 btrfs_set_device_start_offset(leaf, dev_item, 0); 1866 1867 ptr = btrfs_device_uuid(dev_item); 1868 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1869 ptr = btrfs_device_fsid(dev_item); 1870 write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, 1871 ptr, BTRFS_FSID_SIZE); 1872 btrfs_mark_buffer_dirty(leaf); 1873 1874 ret = 0; 1875 out: 1876 btrfs_free_path(path); 1877 return ret; 1878 } 1879 1880 /* 1881 * Function to update ctime/mtime for a given device path. 1882 * Mainly used for ctime/mtime based probe like libblkid. 1883 * 1884 * We don't care about errors here, this is just to be kind to userspace. 1885 */ 1886 static void update_dev_time(const char *device_path) 1887 { 1888 struct path path; 1889 struct timespec64 now; 1890 int ret; 1891 1892 ret = kern_path(device_path, LOOKUP_FOLLOW, &path); 1893 if (ret) 1894 return; 1895 1896 now = current_time(d_inode(path.dentry)); 1897 inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); 1898 path_put(&path); 1899 } 1900 1901 static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans, 1902 struct btrfs_device *device) 1903 { 1904 struct btrfs_root *root = device->fs_info->chunk_root; 1905 int ret; 1906 struct btrfs_path *path; 1907 struct btrfs_key key; 1908 1909 path = btrfs_alloc_path(); 1910 if (!path) 1911 return -ENOMEM; 1912 1913 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1914 key.type = BTRFS_DEV_ITEM_KEY; 1915 key.offset = device->devid; 1916 1917 btrfs_reserve_chunk_metadata(trans, false); 1918 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1919 btrfs_trans_release_chunk_metadata(trans); 1920 if (ret) { 1921 if (ret > 0) 1922 ret = -ENOENT; 1923 goto out; 1924 } 1925 1926 ret = btrfs_del_item(trans, root, path); 1927 out: 1928 btrfs_free_path(path); 1929 return ret; 1930 } 1931 1932 /* 1933 * Verify that @num_devices satisfies the RAID profile constraints in the whole 1934 * filesystem. It's up to the caller to adjust that number regarding eg. device 1935 * replace. 1936 */ 1937 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 1938 u64 num_devices) 1939 { 1940 u64 all_avail; 1941 unsigned seq; 1942 int i; 1943 1944 do { 1945 seq = read_seqbegin(&fs_info->profiles_lock); 1946 1947 all_avail = fs_info->avail_data_alloc_bits | 1948 fs_info->avail_system_alloc_bits | 1949 fs_info->avail_metadata_alloc_bits; 1950 } while (read_seqretry(&fs_info->profiles_lock, seq)); 1951 1952 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 1953 if (!(all_avail & btrfs_raid_array[i].bg_flag)) 1954 continue; 1955 1956 if (num_devices < btrfs_raid_array[i].devs_min) 1957 return btrfs_raid_array[i].mindev_error; 1958 } 1959 1960 return 0; 1961 } 1962 1963 static struct btrfs_device * btrfs_find_next_active_device( 1964 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 1965 { 1966 struct btrfs_device *next_device; 1967 1968 list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 1969 if (next_device != device && 1970 !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) 1971 && next_device->bdev) 1972 return next_device; 1973 } 1974 1975 return NULL; 1976 } 1977 1978 /* 1979 * Helper function to check if the given device is part of s_bdev / latest_dev 1980 * and replace it with the provided or the next active device, in the context 1981 * where this function called, there should be always be another device (or 1982 * this_dev) which is active. 1983 */ 1984 void __cold btrfs_assign_next_active_device(struct btrfs_device *device, 1985 struct btrfs_device *next_device) 1986 { 1987 struct btrfs_fs_info *fs_info = device->fs_info; 1988 1989 if (!next_device) 1990 next_device = btrfs_find_next_active_device(fs_info->fs_devices, 1991 device); 1992 ASSERT(next_device); 1993 1994 if (fs_info->sb->s_bdev && 1995 (fs_info->sb->s_bdev == device->bdev)) 1996 fs_info->sb->s_bdev = next_device->bdev; 1997 1998 if (fs_info->fs_devices->latest_dev->bdev == device->bdev) 1999 fs_info->fs_devices->latest_dev = next_device; 2000 } 2001 2002 /* 2003 * Return btrfs_fs_devices::num_devices excluding the device that's being 2004 * currently replaced. 2005 */ 2006 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 2007 { 2008 u64 num_devices = fs_info->fs_devices->num_devices; 2009 2010 down_read(&fs_info->dev_replace.rwsem); 2011 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 2012 ASSERT(num_devices > 1); 2013 num_devices--; 2014 } 2015 up_read(&fs_info->dev_replace.rwsem); 2016 2017 return num_devices; 2018 } 2019 2020 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, 2021 struct block_device *bdev, 2022 const char *device_path) 2023 { 2024 struct btrfs_super_block *disk_super; 2025 int copy_num; 2026 2027 if (!bdev) 2028 return; 2029 2030 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { 2031 struct page *page; 2032 int ret; 2033 2034 disk_super = btrfs_read_dev_one_super(bdev, copy_num, false); 2035 if (IS_ERR(disk_super)) 2036 continue; 2037 2038 if (bdev_is_zoned(bdev)) { 2039 btrfs_reset_sb_log_zones(bdev, copy_num); 2040 continue; 2041 } 2042 2043 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 2044 2045 page = virt_to_page(disk_super); 2046 set_page_dirty(page); 2047 lock_page(page); 2048 /* write_on_page() unlocks the page */ 2049 ret = write_one_page(page); 2050 if (ret) 2051 btrfs_warn(fs_info, 2052 "error clearing superblock number %d (%d)", 2053 copy_num, ret); 2054 btrfs_release_disk_super(disk_super); 2055 2056 } 2057 2058 /* Notify udev that device has changed */ 2059 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 2060 2061 /* Update ctime/mtime for device path for libblkid */ 2062 update_dev_time(device_path); 2063 } 2064 2065 int btrfs_rm_device(struct btrfs_fs_info *fs_info, 2066 struct btrfs_dev_lookup_args *args, 2067 struct block_device **bdev, fmode_t *mode) 2068 { 2069 struct btrfs_trans_handle *trans; 2070 struct btrfs_device *device; 2071 struct btrfs_fs_devices *cur_devices; 2072 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2073 u64 num_devices; 2074 int ret = 0; 2075 2076 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 2077 btrfs_err(fs_info, "device remove not supported on extent tree v2 yet"); 2078 return -EINVAL; 2079 } 2080 2081 /* 2082 * The device list in fs_devices is accessed without locks (neither 2083 * uuid_mutex nor device_list_mutex) as it won't change on a mounted 2084 * filesystem and another device rm cannot run. 2085 */ 2086 num_devices = btrfs_num_devices(fs_info); 2087 2088 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2089 if (ret) 2090 return ret; 2091 2092 device = btrfs_find_device(fs_info->fs_devices, args); 2093 if (!device) { 2094 if (args->missing) 2095 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2096 else 2097 ret = -ENOENT; 2098 return ret; 2099 } 2100 2101 if (btrfs_pinned_by_swapfile(fs_info, device)) { 2102 btrfs_warn_in_rcu(fs_info, 2103 "cannot remove device %s (devid %llu) due to active swapfile", 2104 btrfs_dev_name(device), device->devid); 2105 return -ETXTBSY; 2106 } 2107 2108 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 2109 return BTRFS_ERROR_DEV_TGT_REPLACE; 2110 2111 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2112 fs_info->fs_devices->rw_devices == 1) 2113 return BTRFS_ERROR_DEV_ONLY_WRITABLE; 2114 2115 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2116 mutex_lock(&fs_info->chunk_mutex); 2117 list_del_init(&device->dev_alloc_list); 2118 device->fs_devices->rw_devices--; 2119 mutex_unlock(&fs_info->chunk_mutex); 2120 } 2121 2122 ret = btrfs_shrink_device(device, 0); 2123 if (ret) 2124 goto error_undo; 2125 2126 trans = btrfs_start_transaction(fs_info->chunk_root, 0); 2127 if (IS_ERR(trans)) { 2128 ret = PTR_ERR(trans); 2129 goto error_undo; 2130 } 2131 2132 ret = btrfs_rm_dev_item(trans, device); 2133 if (ret) { 2134 /* Any error in dev item removal is critical */ 2135 btrfs_crit(fs_info, 2136 "failed to remove device item for devid %llu: %d", 2137 device->devid, ret); 2138 btrfs_abort_transaction(trans, ret); 2139 btrfs_end_transaction(trans); 2140 return ret; 2141 } 2142 2143 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2144 btrfs_scrub_cancel_dev(device); 2145 2146 /* 2147 * the device list mutex makes sure that we don't change 2148 * the device list while someone else is writing out all 2149 * the device supers. Whoever is writing all supers, should 2150 * lock the device list mutex before getting the number of 2151 * devices in the super block (super_copy). Conversely, 2152 * whoever updates the number of devices in the super block 2153 * (super_copy) should hold the device list mutex. 2154 */ 2155 2156 /* 2157 * In normal cases the cur_devices == fs_devices. But in case 2158 * of deleting a seed device, the cur_devices should point to 2159 * its own fs_devices listed under the fs_devices->seed_list. 2160 */ 2161 cur_devices = device->fs_devices; 2162 mutex_lock(&fs_devices->device_list_mutex); 2163 list_del_rcu(&device->dev_list); 2164 2165 cur_devices->num_devices--; 2166 cur_devices->total_devices--; 2167 /* Update total_devices of the parent fs_devices if it's seed */ 2168 if (cur_devices != fs_devices) 2169 fs_devices->total_devices--; 2170 2171 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 2172 cur_devices->missing_devices--; 2173 2174 btrfs_assign_next_active_device(device, NULL); 2175 2176 if (device->bdev) { 2177 cur_devices->open_devices--; 2178 /* remove sysfs entry */ 2179 btrfs_sysfs_remove_device(device); 2180 } 2181 2182 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 2183 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2184 mutex_unlock(&fs_devices->device_list_mutex); 2185 2186 /* 2187 * At this point, the device is zero sized and detached from the 2188 * devices list. All that's left is to zero out the old supers and 2189 * free the device. 2190 * 2191 * We cannot call btrfs_close_bdev() here because we're holding the sb 2192 * write lock, and blkdev_put() will pull in the ->open_mutex on the 2193 * block device and it's dependencies. Instead just flush the device 2194 * and let the caller do the final blkdev_put. 2195 */ 2196 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2197 btrfs_scratch_superblocks(fs_info, device->bdev, 2198 device->name->str); 2199 if (device->bdev) { 2200 sync_blockdev(device->bdev); 2201 invalidate_bdev(device->bdev); 2202 } 2203 } 2204 2205 *bdev = device->bdev; 2206 *mode = device->mode; 2207 synchronize_rcu(); 2208 btrfs_free_device(device); 2209 2210 /* 2211 * This can happen if cur_devices is the private seed devices list. We 2212 * cannot call close_fs_devices() here because it expects the uuid_mutex 2213 * to be held, but in fact we don't need that for the private 2214 * seed_devices, we can simply decrement cur_devices->opened and then 2215 * remove it from our list and free the fs_devices. 2216 */ 2217 if (cur_devices->num_devices == 0) { 2218 list_del_init(&cur_devices->seed_list); 2219 ASSERT(cur_devices->opened == 1); 2220 cur_devices->opened--; 2221 free_fs_devices(cur_devices); 2222 } 2223 2224 ret = btrfs_commit_transaction(trans); 2225 2226 return ret; 2227 2228 error_undo: 2229 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2230 mutex_lock(&fs_info->chunk_mutex); 2231 list_add(&device->dev_alloc_list, 2232 &fs_devices->alloc_list); 2233 device->fs_devices->rw_devices++; 2234 mutex_unlock(&fs_info->chunk_mutex); 2235 } 2236 return ret; 2237 } 2238 2239 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) 2240 { 2241 struct btrfs_fs_devices *fs_devices; 2242 2243 lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex); 2244 2245 /* 2246 * in case of fs with no seed, srcdev->fs_devices will point 2247 * to fs_devices of fs_info. However when the dev being replaced is 2248 * a seed dev it will point to the seed's local fs_devices. In short 2249 * srcdev will have its correct fs_devices in both the cases. 2250 */ 2251 fs_devices = srcdev->fs_devices; 2252 2253 list_del_rcu(&srcdev->dev_list); 2254 list_del(&srcdev->dev_alloc_list); 2255 fs_devices->num_devices--; 2256 if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state)) 2257 fs_devices->missing_devices--; 2258 2259 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) 2260 fs_devices->rw_devices--; 2261 2262 if (srcdev->bdev) 2263 fs_devices->open_devices--; 2264 } 2265 2266 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) 2267 { 2268 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 2269 2270 mutex_lock(&uuid_mutex); 2271 2272 btrfs_close_bdev(srcdev); 2273 synchronize_rcu(); 2274 btrfs_free_device(srcdev); 2275 2276 /* if this is no devs we rather delete the fs_devices */ 2277 if (!fs_devices->num_devices) { 2278 /* 2279 * On a mounted FS, num_devices can't be zero unless it's a 2280 * seed. In case of a seed device being replaced, the replace 2281 * target added to the sprout FS, so there will be no more 2282 * device left under the seed FS. 2283 */ 2284 ASSERT(fs_devices->seeding); 2285 2286 list_del_init(&fs_devices->seed_list); 2287 close_fs_devices(fs_devices); 2288 free_fs_devices(fs_devices); 2289 } 2290 mutex_unlock(&uuid_mutex); 2291 } 2292 2293 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2294 { 2295 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2296 2297 mutex_lock(&fs_devices->device_list_mutex); 2298 2299 btrfs_sysfs_remove_device(tgtdev); 2300 2301 if (tgtdev->bdev) 2302 fs_devices->open_devices--; 2303 2304 fs_devices->num_devices--; 2305 2306 btrfs_assign_next_active_device(tgtdev, NULL); 2307 2308 list_del_rcu(&tgtdev->dev_list); 2309 2310 mutex_unlock(&fs_devices->device_list_mutex); 2311 2312 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, 2313 tgtdev->name->str); 2314 2315 btrfs_close_bdev(tgtdev); 2316 synchronize_rcu(); 2317 btrfs_free_device(tgtdev); 2318 } 2319 2320 /* 2321 * Populate args from device at path. 2322 * 2323 * @fs_info: the filesystem 2324 * @args: the args to populate 2325 * @path: the path to the device 2326 * 2327 * This will read the super block of the device at @path and populate @args with 2328 * the devid, fsid, and uuid. This is meant to be used for ioctls that need to 2329 * lookup a device to operate on, but need to do it before we take any locks. 2330 * This properly handles the special case of "missing" that a user may pass in, 2331 * and does some basic sanity checks. The caller must make sure that @path is 2332 * properly NUL terminated before calling in, and must call 2333 * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and 2334 * uuid buffers. 2335 * 2336 * Return: 0 for success, -errno for failure 2337 */ 2338 int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, 2339 struct btrfs_dev_lookup_args *args, 2340 const char *path) 2341 { 2342 struct btrfs_super_block *disk_super; 2343 struct block_device *bdev; 2344 int ret; 2345 2346 if (!path || !path[0]) 2347 return -EINVAL; 2348 if (!strcmp(path, "missing")) { 2349 args->missing = true; 2350 return 0; 2351 } 2352 2353 args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); 2354 args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); 2355 if (!args->uuid || !args->fsid) { 2356 btrfs_put_dev_args_from_path(args); 2357 return -ENOMEM; 2358 } 2359 2360 ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, 2361 &bdev, &disk_super); 2362 if (ret) { 2363 btrfs_put_dev_args_from_path(args); 2364 return ret; 2365 } 2366 2367 args->devid = btrfs_stack_device_id(&disk_super->dev_item); 2368 memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); 2369 if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2370 memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); 2371 else 2372 memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 2373 btrfs_release_disk_super(disk_super); 2374 blkdev_put(bdev, FMODE_READ); 2375 return 0; 2376 } 2377 2378 /* 2379 * Only use this jointly with btrfs_get_dev_args_from_path() because we will 2380 * allocate our ->uuid and ->fsid pointers, everybody else uses local variables 2381 * that don't need to be freed. 2382 */ 2383 void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) 2384 { 2385 kfree(args->uuid); 2386 kfree(args->fsid); 2387 args->uuid = NULL; 2388 args->fsid = NULL; 2389 } 2390 2391 struct btrfs_device *btrfs_find_device_by_devspec( 2392 struct btrfs_fs_info *fs_info, u64 devid, 2393 const char *device_path) 2394 { 2395 BTRFS_DEV_LOOKUP_ARGS(args); 2396 struct btrfs_device *device; 2397 int ret; 2398 2399 if (devid) { 2400 args.devid = devid; 2401 device = btrfs_find_device(fs_info->fs_devices, &args); 2402 if (!device) 2403 return ERR_PTR(-ENOENT); 2404 return device; 2405 } 2406 2407 ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); 2408 if (ret) 2409 return ERR_PTR(ret); 2410 device = btrfs_find_device(fs_info->fs_devices, &args); 2411 btrfs_put_dev_args_from_path(&args); 2412 if (!device) 2413 return ERR_PTR(-ENOENT); 2414 return device; 2415 } 2416 2417 static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) 2418 { 2419 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2420 struct btrfs_fs_devices *old_devices; 2421 struct btrfs_fs_devices *seed_devices; 2422 2423 lockdep_assert_held(&uuid_mutex); 2424 if (!fs_devices->seeding) 2425 return ERR_PTR(-EINVAL); 2426 2427 /* 2428 * Private copy of the seed devices, anchored at 2429 * fs_info->fs_devices->seed_list 2430 */ 2431 seed_devices = alloc_fs_devices(NULL, NULL); 2432 if (IS_ERR(seed_devices)) 2433 return seed_devices; 2434 2435 /* 2436 * It's necessary to retain a copy of the original seed fs_devices in 2437 * fs_uuids so that filesystems which have been seeded can successfully 2438 * reference the seed device from open_seed_devices. This also supports 2439 * multiple fs seed. 2440 */ 2441 old_devices = clone_fs_devices(fs_devices); 2442 if (IS_ERR(old_devices)) { 2443 kfree(seed_devices); 2444 return old_devices; 2445 } 2446 2447 list_add(&old_devices->fs_list, &fs_uuids); 2448 2449 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2450 seed_devices->opened = 1; 2451 INIT_LIST_HEAD(&seed_devices->devices); 2452 INIT_LIST_HEAD(&seed_devices->alloc_list); 2453 mutex_init(&seed_devices->device_list_mutex); 2454 2455 return seed_devices; 2456 } 2457 2458 /* 2459 * Splice seed devices into the sprout fs_devices. 2460 * Generate a new fsid for the sprouted read-write filesystem. 2461 */ 2462 static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info, 2463 struct btrfs_fs_devices *seed_devices) 2464 { 2465 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2466 struct btrfs_super_block *disk_super = fs_info->super_copy; 2467 struct btrfs_device *device; 2468 u64 super_flags; 2469 2470 /* 2471 * We are updating the fsid, the thread leading to device_list_add() 2472 * could race, so uuid_mutex is needed. 2473 */ 2474 lockdep_assert_held(&uuid_mutex); 2475 2476 /* 2477 * The threads listed below may traverse dev_list but can do that without 2478 * device_list_mutex: 2479 * - All device ops and balance - as we are in btrfs_exclop_start. 2480 * - Various dev_list readers - are using RCU. 2481 * - btrfs_ioctl_fitrim() - is using RCU. 2482 * 2483 * For-read threads as below are using device_list_mutex: 2484 * - Readonly scrub btrfs_scrub_dev() 2485 * - Readonly scrub btrfs_scrub_progress() 2486 * - btrfs_get_dev_stats() 2487 */ 2488 lockdep_assert_held(&fs_devices->device_list_mutex); 2489 2490 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2491 synchronize_rcu); 2492 list_for_each_entry(device, &seed_devices->devices, dev_list) 2493 device->fs_devices = seed_devices; 2494 2495 fs_devices->seeding = false; 2496 fs_devices->num_devices = 0; 2497 fs_devices->open_devices = 0; 2498 fs_devices->missing_devices = 0; 2499 fs_devices->rotating = false; 2500 list_add(&seed_devices->seed_list, &fs_devices->seed_list); 2501 2502 generate_random_uuid(fs_devices->fsid); 2503 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 2504 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2505 2506 super_flags = btrfs_super_flags(disk_super) & 2507 ~BTRFS_SUPER_FLAG_SEEDING; 2508 btrfs_set_super_flags(disk_super, super_flags); 2509 } 2510 2511 /* 2512 * Store the expected generation for seed devices in device items. 2513 */ 2514 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 2515 { 2516 BTRFS_DEV_LOOKUP_ARGS(args); 2517 struct btrfs_fs_info *fs_info = trans->fs_info; 2518 struct btrfs_root *root = fs_info->chunk_root; 2519 struct btrfs_path *path; 2520 struct extent_buffer *leaf; 2521 struct btrfs_dev_item *dev_item; 2522 struct btrfs_device *device; 2523 struct btrfs_key key; 2524 u8 fs_uuid[BTRFS_FSID_SIZE]; 2525 u8 dev_uuid[BTRFS_UUID_SIZE]; 2526 int ret; 2527 2528 path = btrfs_alloc_path(); 2529 if (!path) 2530 return -ENOMEM; 2531 2532 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2533 key.offset = 0; 2534 key.type = BTRFS_DEV_ITEM_KEY; 2535 2536 while (1) { 2537 btrfs_reserve_chunk_metadata(trans, false); 2538 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2539 btrfs_trans_release_chunk_metadata(trans); 2540 if (ret < 0) 2541 goto error; 2542 2543 leaf = path->nodes[0]; 2544 next_slot: 2545 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2546 ret = btrfs_next_leaf(root, path); 2547 if (ret > 0) 2548 break; 2549 if (ret < 0) 2550 goto error; 2551 leaf = path->nodes[0]; 2552 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2553 btrfs_release_path(path); 2554 continue; 2555 } 2556 2557 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2558 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2559 key.type != BTRFS_DEV_ITEM_KEY) 2560 break; 2561 2562 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2563 struct btrfs_dev_item); 2564 args.devid = btrfs_device_id(leaf, dev_item); 2565 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2566 BTRFS_UUID_SIZE); 2567 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2568 BTRFS_FSID_SIZE); 2569 args.uuid = dev_uuid; 2570 args.fsid = fs_uuid; 2571 device = btrfs_find_device(fs_info->fs_devices, &args); 2572 BUG_ON(!device); /* Logic error */ 2573 2574 if (device->fs_devices->seeding) { 2575 btrfs_set_device_generation(leaf, dev_item, 2576 device->generation); 2577 btrfs_mark_buffer_dirty(leaf); 2578 } 2579 2580 path->slots[0]++; 2581 goto next_slot; 2582 } 2583 ret = 0; 2584 error: 2585 btrfs_free_path(path); 2586 return ret; 2587 } 2588 2589 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2590 { 2591 struct btrfs_root *root = fs_info->dev_root; 2592 struct btrfs_trans_handle *trans; 2593 struct btrfs_device *device; 2594 struct block_device *bdev; 2595 struct super_block *sb = fs_info->sb; 2596 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2597 struct btrfs_fs_devices *seed_devices; 2598 u64 orig_super_total_bytes; 2599 u64 orig_super_num_devices; 2600 int ret = 0; 2601 bool seeding_dev = false; 2602 bool locked = false; 2603 2604 if (sb_rdonly(sb) && !fs_devices->seeding) 2605 return -EROFS; 2606 2607 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2608 fs_info->bdev_holder); 2609 if (IS_ERR(bdev)) 2610 return PTR_ERR(bdev); 2611 2612 if (!btrfs_check_device_zone_type(fs_info, bdev)) { 2613 ret = -EINVAL; 2614 goto error; 2615 } 2616 2617 if (fs_devices->seeding) { 2618 seeding_dev = true; 2619 down_write(&sb->s_umount); 2620 mutex_lock(&uuid_mutex); 2621 locked = true; 2622 } 2623 2624 sync_blockdev(bdev); 2625 2626 rcu_read_lock(); 2627 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 2628 if (device->bdev == bdev) { 2629 ret = -EEXIST; 2630 rcu_read_unlock(); 2631 goto error; 2632 } 2633 } 2634 rcu_read_unlock(); 2635 2636 device = btrfs_alloc_device(fs_info, NULL, NULL, device_path); 2637 if (IS_ERR(device)) { 2638 /* we can safely leave the fs_devices entry around */ 2639 ret = PTR_ERR(device); 2640 goto error; 2641 } 2642 2643 device->fs_info = fs_info; 2644 device->bdev = bdev; 2645 ret = lookup_bdev(device_path, &device->devt); 2646 if (ret) 2647 goto error_free_device; 2648 2649 ret = btrfs_get_dev_zone_info(device, false); 2650 if (ret) 2651 goto error_free_device; 2652 2653 trans = btrfs_start_transaction(root, 0); 2654 if (IS_ERR(trans)) { 2655 ret = PTR_ERR(trans); 2656 goto error_free_zone; 2657 } 2658 2659 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2660 device->generation = trans->transid; 2661 device->io_width = fs_info->sectorsize; 2662 device->io_align = fs_info->sectorsize; 2663 device->sector_size = fs_info->sectorsize; 2664 device->total_bytes = 2665 round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); 2666 device->disk_total_bytes = device->total_bytes; 2667 device->commit_total_bytes = device->total_bytes; 2668 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2669 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2670 device->mode = FMODE_EXCL; 2671 device->dev_stats_valid = 1; 2672 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2673 2674 if (seeding_dev) { 2675 btrfs_clear_sb_rdonly(sb); 2676 2677 /* GFP_KERNEL allocation must not be under device_list_mutex */ 2678 seed_devices = btrfs_init_sprout(fs_info); 2679 if (IS_ERR(seed_devices)) { 2680 ret = PTR_ERR(seed_devices); 2681 btrfs_abort_transaction(trans, ret); 2682 goto error_trans; 2683 } 2684 } 2685 2686 mutex_lock(&fs_devices->device_list_mutex); 2687 if (seeding_dev) { 2688 btrfs_setup_sprout(fs_info, seed_devices); 2689 btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, 2690 device); 2691 } 2692 2693 device->fs_devices = fs_devices; 2694 2695 mutex_lock(&fs_info->chunk_mutex); 2696 list_add_rcu(&device->dev_list, &fs_devices->devices); 2697 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2698 fs_devices->num_devices++; 2699 fs_devices->open_devices++; 2700 fs_devices->rw_devices++; 2701 fs_devices->total_devices++; 2702 fs_devices->total_rw_bytes += device->total_bytes; 2703 2704 atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2705 2706 if (!bdev_nonrot(bdev)) 2707 fs_devices->rotating = true; 2708 2709 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 2710 btrfs_set_super_total_bytes(fs_info->super_copy, 2711 round_down(orig_super_total_bytes + device->total_bytes, 2712 fs_info->sectorsize)); 2713 2714 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 2715 btrfs_set_super_num_devices(fs_info->super_copy, 2716 orig_super_num_devices + 1); 2717 2718 /* 2719 * we've got more storage, clear any full flags on the space 2720 * infos 2721 */ 2722 btrfs_clear_space_info_full(fs_info); 2723 2724 mutex_unlock(&fs_info->chunk_mutex); 2725 2726 /* Add sysfs device entry */ 2727 btrfs_sysfs_add_device(device); 2728 2729 mutex_unlock(&fs_devices->device_list_mutex); 2730 2731 if (seeding_dev) { 2732 mutex_lock(&fs_info->chunk_mutex); 2733 ret = init_first_rw_device(trans); 2734 mutex_unlock(&fs_info->chunk_mutex); 2735 if (ret) { 2736 btrfs_abort_transaction(trans, ret); 2737 goto error_sysfs; 2738 } 2739 } 2740 2741 ret = btrfs_add_dev_item(trans, device); 2742 if (ret) { 2743 btrfs_abort_transaction(trans, ret); 2744 goto error_sysfs; 2745 } 2746 2747 if (seeding_dev) { 2748 ret = btrfs_finish_sprout(trans); 2749 if (ret) { 2750 btrfs_abort_transaction(trans, ret); 2751 goto error_sysfs; 2752 } 2753 2754 /* 2755 * fs_devices now represents the newly sprouted filesystem and 2756 * its fsid has been changed by btrfs_sprout_splice(). 2757 */ 2758 btrfs_sysfs_update_sprout_fsid(fs_devices); 2759 } 2760 2761 ret = btrfs_commit_transaction(trans); 2762 2763 if (seeding_dev) { 2764 mutex_unlock(&uuid_mutex); 2765 up_write(&sb->s_umount); 2766 locked = false; 2767 2768 if (ret) /* transaction commit */ 2769 return ret; 2770 2771 ret = btrfs_relocate_sys_chunks(fs_info); 2772 if (ret < 0) 2773 btrfs_handle_fs_error(fs_info, ret, 2774 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2775 trans = btrfs_attach_transaction(root); 2776 if (IS_ERR(trans)) { 2777 if (PTR_ERR(trans) == -ENOENT) 2778 return 0; 2779 ret = PTR_ERR(trans); 2780 trans = NULL; 2781 goto error_sysfs; 2782 } 2783 ret = btrfs_commit_transaction(trans); 2784 } 2785 2786 /* 2787 * Now that we have written a new super block to this device, check all 2788 * other fs_devices list if device_path alienates any other scanned 2789 * device. 2790 * We can ignore the return value as it typically returns -EINVAL and 2791 * only succeeds if the device was an alien. 2792 */ 2793 btrfs_forget_devices(device->devt); 2794 2795 /* Update ctime/mtime for blkid or udev */ 2796 update_dev_time(device_path); 2797 2798 return ret; 2799 2800 error_sysfs: 2801 btrfs_sysfs_remove_device(device); 2802 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2803 mutex_lock(&fs_info->chunk_mutex); 2804 list_del_rcu(&device->dev_list); 2805 list_del(&device->dev_alloc_list); 2806 fs_info->fs_devices->num_devices--; 2807 fs_info->fs_devices->open_devices--; 2808 fs_info->fs_devices->rw_devices--; 2809 fs_info->fs_devices->total_devices--; 2810 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2811 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2812 btrfs_set_super_total_bytes(fs_info->super_copy, 2813 orig_super_total_bytes); 2814 btrfs_set_super_num_devices(fs_info->super_copy, 2815 orig_super_num_devices); 2816 mutex_unlock(&fs_info->chunk_mutex); 2817 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2818 error_trans: 2819 if (seeding_dev) 2820 btrfs_set_sb_rdonly(sb); 2821 if (trans) 2822 btrfs_end_transaction(trans); 2823 error_free_zone: 2824 btrfs_destroy_dev_zone_info(device); 2825 error_free_device: 2826 btrfs_free_device(device); 2827 error: 2828 blkdev_put(bdev, FMODE_EXCL); 2829 if (locked) { 2830 mutex_unlock(&uuid_mutex); 2831 up_write(&sb->s_umount); 2832 } 2833 return ret; 2834 } 2835 2836 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2837 struct btrfs_device *device) 2838 { 2839 int ret; 2840 struct btrfs_path *path; 2841 struct btrfs_root *root = device->fs_info->chunk_root; 2842 struct btrfs_dev_item *dev_item; 2843 struct extent_buffer *leaf; 2844 struct btrfs_key key; 2845 2846 path = btrfs_alloc_path(); 2847 if (!path) 2848 return -ENOMEM; 2849 2850 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2851 key.type = BTRFS_DEV_ITEM_KEY; 2852 key.offset = device->devid; 2853 2854 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2855 if (ret < 0) 2856 goto out; 2857 2858 if (ret > 0) { 2859 ret = -ENOENT; 2860 goto out; 2861 } 2862 2863 leaf = path->nodes[0]; 2864 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2865 2866 btrfs_set_device_id(leaf, dev_item, device->devid); 2867 btrfs_set_device_type(leaf, dev_item, device->type); 2868 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2869 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2870 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2871 btrfs_set_device_total_bytes(leaf, dev_item, 2872 btrfs_device_get_disk_total_bytes(device)); 2873 btrfs_set_device_bytes_used(leaf, dev_item, 2874 btrfs_device_get_bytes_used(device)); 2875 btrfs_mark_buffer_dirty(leaf); 2876 2877 out: 2878 btrfs_free_path(path); 2879 return ret; 2880 } 2881 2882 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2883 struct btrfs_device *device, u64 new_size) 2884 { 2885 struct btrfs_fs_info *fs_info = device->fs_info; 2886 struct btrfs_super_block *super_copy = fs_info->super_copy; 2887 u64 old_total; 2888 u64 diff; 2889 int ret; 2890 2891 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2892 return -EACCES; 2893 2894 new_size = round_down(new_size, fs_info->sectorsize); 2895 2896 mutex_lock(&fs_info->chunk_mutex); 2897 old_total = btrfs_super_total_bytes(super_copy); 2898 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2899 2900 if (new_size <= device->total_bytes || 2901 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2902 mutex_unlock(&fs_info->chunk_mutex); 2903 return -EINVAL; 2904 } 2905 2906 btrfs_set_super_total_bytes(super_copy, 2907 round_down(old_total + diff, fs_info->sectorsize)); 2908 device->fs_devices->total_rw_bytes += diff; 2909 2910 btrfs_device_set_total_bytes(device, new_size); 2911 btrfs_device_set_disk_total_bytes(device, new_size); 2912 btrfs_clear_space_info_full(device->fs_info); 2913 if (list_empty(&device->post_commit_list)) 2914 list_add_tail(&device->post_commit_list, 2915 &trans->transaction->dev_update_list); 2916 mutex_unlock(&fs_info->chunk_mutex); 2917 2918 btrfs_reserve_chunk_metadata(trans, false); 2919 ret = btrfs_update_device(trans, device); 2920 btrfs_trans_release_chunk_metadata(trans); 2921 2922 return ret; 2923 } 2924 2925 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2926 { 2927 struct btrfs_fs_info *fs_info = trans->fs_info; 2928 struct btrfs_root *root = fs_info->chunk_root; 2929 int ret; 2930 struct btrfs_path *path; 2931 struct btrfs_key key; 2932 2933 path = btrfs_alloc_path(); 2934 if (!path) 2935 return -ENOMEM; 2936 2937 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2938 key.offset = chunk_offset; 2939 key.type = BTRFS_CHUNK_ITEM_KEY; 2940 2941 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2942 if (ret < 0) 2943 goto out; 2944 else if (ret > 0) { /* Logic error or corruption */ 2945 btrfs_handle_fs_error(fs_info, -ENOENT, 2946 "Failed lookup while freeing chunk."); 2947 ret = -ENOENT; 2948 goto out; 2949 } 2950 2951 ret = btrfs_del_item(trans, root, path); 2952 if (ret < 0) 2953 btrfs_handle_fs_error(fs_info, ret, 2954 "Failed to delete chunk item."); 2955 out: 2956 btrfs_free_path(path); 2957 return ret; 2958 } 2959 2960 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 2961 { 2962 struct btrfs_super_block *super_copy = fs_info->super_copy; 2963 struct btrfs_disk_key *disk_key; 2964 struct btrfs_chunk *chunk; 2965 u8 *ptr; 2966 int ret = 0; 2967 u32 num_stripes; 2968 u32 array_size; 2969 u32 len = 0; 2970 u32 cur; 2971 struct btrfs_key key; 2972 2973 lockdep_assert_held(&fs_info->chunk_mutex); 2974 array_size = btrfs_super_sys_array_size(super_copy); 2975 2976 ptr = super_copy->sys_chunk_array; 2977 cur = 0; 2978 2979 while (cur < array_size) { 2980 disk_key = (struct btrfs_disk_key *)ptr; 2981 btrfs_disk_key_to_cpu(&key, disk_key); 2982 2983 len = sizeof(*disk_key); 2984 2985 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 2986 chunk = (struct btrfs_chunk *)(ptr + len); 2987 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 2988 len += btrfs_chunk_item_size(num_stripes); 2989 } else { 2990 ret = -EIO; 2991 break; 2992 } 2993 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 2994 key.offset == chunk_offset) { 2995 memmove(ptr, ptr + len, array_size - (cur + len)); 2996 array_size -= len; 2997 btrfs_set_super_sys_array_size(super_copy, array_size); 2998 } else { 2999 ptr += len; 3000 cur += len; 3001 } 3002 } 3003 return ret; 3004 } 3005 3006 /* 3007 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. 3008 * @logical: Logical block offset in bytes. 3009 * @length: Length of extent in bytes. 3010 * 3011 * Return: Chunk mapping or ERR_PTR. 3012 */ 3013 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 3014 u64 logical, u64 length) 3015 { 3016 struct extent_map_tree *em_tree; 3017 struct extent_map *em; 3018 3019 em_tree = &fs_info->mapping_tree; 3020 read_lock(&em_tree->lock); 3021 em = lookup_extent_mapping(em_tree, logical, length); 3022 read_unlock(&em_tree->lock); 3023 3024 if (!em) { 3025 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 3026 logical, length); 3027 return ERR_PTR(-EINVAL); 3028 } 3029 3030 if (em->start > logical || em->start + em->len < logical) { 3031 btrfs_crit(fs_info, 3032 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 3033 logical, length, em->start, em->start + em->len); 3034 free_extent_map(em); 3035 return ERR_PTR(-EINVAL); 3036 } 3037 3038 /* callers are responsible for dropping em's ref. */ 3039 return em; 3040 } 3041 3042 static int remove_chunk_item(struct btrfs_trans_handle *trans, 3043 struct map_lookup *map, u64 chunk_offset) 3044 { 3045 int i; 3046 3047 /* 3048 * Removing chunk items and updating the device items in the chunks btree 3049 * requires holding the chunk_mutex. 3050 * See the comment at btrfs_chunk_alloc() for the details. 3051 */ 3052 lockdep_assert_held(&trans->fs_info->chunk_mutex); 3053 3054 for (i = 0; i < map->num_stripes; i++) { 3055 int ret; 3056 3057 ret = btrfs_update_device(trans, map->stripes[i].dev); 3058 if (ret) 3059 return ret; 3060 } 3061 3062 return btrfs_free_chunk(trans, chunk_offset); 3063 } 3064 3065 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 3066 { 3067 struct btrfs_fs_info *fs_info = trans->fs_info; 3068 struct extent_map *em; 3069 struct map_lookup *map; 3070 u64 dev_extent_len = 0; 3071 int i, ret = 0; 3072 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 3073 3074 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 3075 if (IS_ERR(em)) { 3076 /* 3077 * This is a logic error, but we don't want to just rely on the 3078 * user having built with ASSERT enabled, so if ASSERT doesn't 3079 * do anything we still error out. 3080 */ 3081 ASSERT(0); 3082 return PTR_ERR(em); 3083 } 3084 map = em->map_lookup; 3085 3086 /* 3087 * First delete the device extent items from the devices btree. 3088 * We take the device_list_mutex to avoid racing with the finishing phase 3089 * of a device replace operation. See the comment below before acquiring 3090 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex 3091 * because that can result in a deadlock when deleting the device extent 3092 * items from the devices btree - COWing an extent buffer from the btree 3093 * may result in allocating a new metadata chunk, which would attempt to 3094 * lock again fs_info->chunk_mutex. 3095 */ 3096 mutex_lock(&fs_devices->device_list_mutex); 3097 for (i = 0; i < map->num_stripes; i++) { 3098 struct btrfs_device *device = map->stripes[i].dev; 3099 ret = btrfs_free_dev_extent(trans, device, 3100 map->stripes[i].physical, 3101 &dev_extent_len); 3102 if (ret) { 3103 mutex_unlock(&fs_devices->device_list_mutex); 3104 btrfs_abort_transaction(trans, ret); 3105 goto out; 3106 } 3107 3108 if (device->bytes_used > 0) { 3109 mutex_lock(&fs_info->chunk_mutex); 3110 btrfs_device_set_bytes_used(device, 3111 device->bytes_used - dev_extent_len); 3112 atomic64_add(dev_extent_len, &fs_info->free_chunk_space); 3113 btrfs_clear_space_info_full(fs_info); 3114 mutex_unlock(&fs_info->chunk_mutex); 3115 } 3116 } 3117 mutex_unlock(&fs_devices->device_list_mutex); 3118 3119 /* 3120 * We acquire fs_info->chunk_mutex for 2 reasons: 3121 * 3122 * 1) Just like with the first phase of the chunk allocation, we must 3123 * reserve system space, do all chunk btree updates and deletions, and 3124 * update the system chunk array in the superblock while holding this 3125 * mutex. This is for similar reasons as explained on the comment at 3126 * the top of btrfs_chunk_alloc(); 3127 * 3128 * 2) Prevent races with the final phase of a device replace operation 3129 * that replaces the device object associated with the map's stripes, 3130 * because the device object's id can change at any time during that 3131 * final phase of the device replace operation 3132 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 3133 * replaced device and then see it with an ID of 3134 * BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating 3135 * the device item, which does not exists on the chunk btree. 3136 * The finishing phase of device replace acquires both the 3137 * device_list_mutex and the chunk_mutex, in that order, so we are 3138 * safe by just acquiring the chunk_mutex. 3139 */ 3140 trans->removing_chunk = true; 3141 mutex_lock(&fs_info->chunk_mutex); 3142 3143 check_system_chunk(trans, map->type); 3144 3145 ret = remove_chunk_item(trans, map, chunk_offset); 3146 /* 3147 * Normally we should not get -ENOSPC since we reserved space before 3148 * through the call to check_system_chunk(). 3149 * 3150 * Despite our system space_info having enough free space, we may not 3151 * be able to allocate extents from its block groups, because all have 3152 * an incompatible profile, which will force us to allocate a new system 3153 * block group with the right profile, or right after we called 3154 * check_system_space() above, a scrub turned the only system block group 3155 * with enough free space into RO mode. 3156 * This is explained with more detail at do_chunk_alloc(). 3157 * 3158 * So if we get -ENOSPC, allocate a new system chunk and retry once. 3159 */ 3160 if (ret == -ENOSPC) { 3161 const u64 sys_flags = btrfs_system_alloc_profile(fs_info); 3162 struct btrfs_block_group *sys_bg; 3163 3164 sys_bg = btrfs_create_chunk(trans, sys_flags); 3165 if (IS_ERR(sys_bg)) { 3166 ret = PTR_ERR(sys_bg); 3167 btrfs_abort_transaction(trans, ret); 3168 goto out; 3169 } 3170 3171 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 3172 if (ret) { 3173 btrfs_abort_transaction(trans, ret); 3174 goto out; 3175 } 3176 3177 ret = remove_chunk_item(trans, map, chunk_offset); 3178 if (ret) { 3179 btrfs_abort_transaction(trans, ret); 3180 goto out; 3181 } 3182 } else if (ret) { 3183 btrfs_abort_transaction(trans, ret); 3184 goto out; 3185 } 3186 3187 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 3188 3189 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3190 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 3191 if (ret) { 3192 btrfs_abort_transaction(trans, ret); 3193 goto out; 3194 } 3195 } 3196 3197 mutex_unlock(&fs_info->chunk_mutex); 3198 trans->removing_chunk = false; 3199 3200 /* 3201 * We are done with chunk btree updates and deletions, so release the 3202 * system space we previously reserved (with check_system_chunk()). 3203 */ 3204 btrfs_trans_release_chunk_metadata(trans); 3205 3206 ret = btrfs_remove_block_group(trans, chunk_offset, em); 3207 if (ret) { 3208 btrfs_abort_transaction(trans, ret); 3209 goto out; 3210 } 3211 3212 out: 3213 if (trans->removing_chunk) { 3214 mutex_unlock(&fs_info->chunk_mutex); 3215 trans->removing_chunk = false; 3216 } 3217 /* once for us */ 3218 free_extent_map(em); 3219 return ret; 3220 } 3221 3222 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3223 { 3224 struct btrfs_root *root = fs_info->chunk_root; 3225 struct btrfs_trans_handle *trans; 3226 struct btrfs_block_group *block_group; 3227 u64 length; 3228 int ret; 3229 3230 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 3231 btrfs_err(fs_info, 3232 "relocate: not supported on extent tree v2 yet"); 3233 return -EINVAL; 3234 } 3235 3236 /* 3237 * Prevent races with automatic removal of unused block groups. 3238 * After we relocate and before we remove the chunk with offset 3239 * chunk_offset, automatic removal of the block group can kick in, 3240 * resulting in a failure when calling btrfs_remove_chunk() below. 3241 * 3242 * Make sure to acquire this mutex before doing a tree search (dev 3243 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 3244 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 3245 * we release the path used to search the chunk/dev tree and before 3246 * the current task acquires this mutex and calls us. 3247 */ 3248 lockdep_assert_held(&fs_info->reclaim_bgs_lock); 3249 3250 /* step one, relocate all the extents inside this chunk */ 3251 btrfs_scrub_pause(fs_info); 3252 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 3253 btrfs_scrub_continue(fs_info); 3254 if (ret) 3255 return ret; 3256 3257 block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 3258 if (!block_group) 3259 return -ENOENT; 3260 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 3261 length = block_group->length; 3262 btrfs_put_block_group(block_group); 3263 3264 /* 3265 * On a zoned file system, discard the whole block group, this will 3266 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If 3267 * resetting the zone fails, don't treat it as a fatal problem from the 3268 * filesystem's point of view. 3269 */ 3270 if (btrfs_is_zoned(fs_info)) { 3271 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); 3272 if (ret) 3273 btrfs_info(fs_info, 3274 "failed to reset zone %llu after relocation", 3275 chunk_offset); 3276 } 3277 3278 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3279 chunk_offset); 3280 if (IS_ERR(trans)) { 3281 ret = PTR_ERR(trans); 3282 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3283 return ret; 3284 } 3285 3286 /* 3287 * step two, delete the device extents and the 3288 * chunk tree entries 3289 */ 3290 ret = btrfs_remove_chunk(trans, chunk_offset); 3291 btrfs_end_transaction(trans); 3292 return ret; 3293 } 3294 3295 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3296 { 3297 struct btrfs_root *chunk_root = fs_info->chunk_root; 3298 struct btrfs_path *path; 3299 struct extent_buffer *leaf; 3300 struct btrfs_chunk *chunk; 3301 struct btrfs_key key; 3302 struct btrfs_key found_key; 3303 u64 chunk_type; 3304 bool retried = false; 3305 int failed = 0; 3306 int ret; 3307 3308 path = btrfs_alloc_path(); 3309 if (!path) 3310 return -ENOMEM; 3311 3312 again: 3313 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3314 key.offset = (u64)-1; 3315 key.type = BTRFS_CHUNK_ITEM_KEY; 3316 3317 while (1) { 3318 mutex_lock(&fs_info->reclaim_bgs_lock); 3319 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3320 if (ret < 0) { 3321 mutex_unlock(&fs_info->reclaim_bgs_lock); 3322 goto error; 3323 } 3324 BUG_ON(ret == 0); /* Corruption */ 3325 3326 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3327 key.type); 3328 if (ret) 3329 mutex_unlock(&fs_info->reclaim_bgs_lock); 3330 if (ret < 0) 3331 goto error; 3332 if (ret > 0) 3333 break; 3334 3335 leaf = path->nodes[0]; 3336 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3337 3338 chunk = btrfs_item_ptr(leaf, path->slots[0], 3339 struct btrfs_chunk); 3340 chunk_type = btrfs_chunk_type(leaf, chunk); 3341 btrfs_release_path(path); 3342 3343 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3344 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3345 if (ret == -ENOSPC) 3346 failed++; 3347 else 3348 BUG_ON(ret); 3349 } 3350 mutex_unlock(&fs_info->reclaim_bgs_lock); 3351 3352 if (found_key.offset == 0) 3353 break; 3354 key.offset = found_key.offset - 1; 3355 } 3356 ret = 0; 3357 if (failed && !retried) { 3358 failed = 0; 3359 retried = true; 3360 goto again; 3361 } else if (WARN_ON(failed && retried)) { 3362 ret = -ENOSPC; 3363 } 3364 error: 3365 btrfs_free_path(path); 3366 return ret; 3367 } 3368 3369 /* 3370 * return 1 : allocate a data chunk successfully, 3371 * return <0: errors during allocating a data chunk, 3372 * return 0 : no need to allocate a data chunk. 3373 */ 3374 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3375 u64 chunk_offset) 3376 { 3377 struct btrfs_block_group *cache; 3378 u64 bytes_used; 3379 u64 chunk_type; 3380 3381 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3382 ASSERT(cache); 3383 chunk_type = cache->flags; 3384 btrfs_put_block_group(cache); 3385 3386 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 3387 return 0; 3388 3389 spin_lock(&fs_info->data_sinfo->lock); 3390 bytes_used = fs_info->data_sinfo->bytes_used; 3391 spin_unlock(&fs_info->data_sinfo->lock); 3392 3393 if (!bytes_used) { 3394 struct btrfs_trans_handle *trans; 3395 int ret; 3396 3397 trans = btrfs_join_transaction(fs_info->tree_root); 3398 if (IS_ERR(trans)) 3399 return PTR_ERR(trans); 3400 3401 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 3402 btrfs_end_transaction(trans); 3403 if (ret < 0) 3404 return ret; 3405 return 1; 3406 } 3407 3408 return 0; 3409 } 3410 3411 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3412 struct btrfs_balance_control *bctl) 3413 { 3414 struct btrfs_root *root = fs_info->tree_root; 3415 struct btrfs_trans_handle *trans; 3416 struct btrfs_balance_item *item; 3417 struct btrfs_disk_balance_args disk_bargs; 3418 struct btrfs_path *path; 3419 struct extent_buffer *leaf; 3420 struct btrfs_key key; 3421 int ret, err; 3422 3423 path = btrfs_alloc_path(); 3424 if (!path) 3425 return -ENOMEM; 3426 3427 trans = btrfs_start_transaction(root, 0); 3428 if (IS_ERR(trans)) { 3429 btrfs_free_path(path); 3430 return PTR_ERR(trans); 3431 } 3432 3433 key.objectid = BTRFS_BALANCE_OBJECTID; 3434 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3435 key.offset = 0; 3436 3437 ret = btrfs_insert_empty_item(trans, root, path, &key, 3438 sizeof(*item)); 3439 if (ret) 3440 goto out; 3441 3442 leaf = path->nodes[0]; 3443 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3444 3445 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3446 3447 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3448 btrfs_set_balance_data(leaf, item, &disk_bargs); 3449 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3450 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3451 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3452 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3453 3454 btrfs_set_balance_flags(leaf, item, bctl->flags); 3455 3456 btrfs_mark_buffer_dirty(leaf); 3457 out: 3458 btrfs_free_path(path); 3459 err = btrfs_commit_transaction(trans); 3460 if (err && !ret) 3461 ret = err; 3462 return ret; 3463 } 3464 3465 static int del_balance_item(struct btrfs_fs_info *fs_info) 3466 { 3467 struct btrfs_root *root = fs_info->tree_root; 3468 struct btrfs_trans_handle *trans; 3469 struct btrfs_path *path; 3470 struct btrfs_key key; 3471 int ret, err; 3472 3473 path = btrfs_alloc_path(); 3474 if (!path) 3475 return -ENOMEM; 3476 3477 trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 3478 if (IS_ERR(trans)) { 3479 btrfs_free_path(path); 3480 return PTR_ERR(trans); 3481 } 3482 3483 key.objectid = BTRFS_BALANCE_OBJECTID; 3484 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3485 key.offset = 0; 3486 3487 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3488 if (ret < 0) 3489 goto out; 3490 if (ret > 0) { 3491 ret = -ENOENT; 3492 goto out; 3493 } 3494 3495 ret = btrfs_del_item(trans, root, path); 3496 out: 3497 btrfs_free_path(path); 3498 err = btrfs_commit_transaction(trans); 3499 if (err && !ret) 3500 ret = err; 3501 return ret; 3502 } 3503 3504 /* 3505 * This is a heuristic used to reduce the number of chunks balanced on 3506 * resume after balance was interrupted. 3507 */ 3508 static void update_balance_args(struct btrfs_balance_control *bctl) 3509 { 3510 /* 3511 * Turn on soft mode for chunk types that were being converted. 3512 */ 3513 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) 3514 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT; 3515 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) 3516 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT; 3517 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) 3518 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT; 3519 3520 /* 3521 * Turn on usage filter if is not already used. The idea is 3522 * that chunks that we have already balanced should be 3523 * reasonably full. Don't do it for chunks that are being 3524 * converted - that will keep us from relocating unconverted 3525 * (albeit full) chunks. 3526 */ 3527 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3528 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3529 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3530 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3531 bctl->data.usage = 90; 3532 } 3533 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3534 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3535 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3536 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3537 bctl->sys.usage = 90; 3538 } 3539 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3540 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3541 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3542 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3543 bctl->meta.usage = 90; 3544 } 3545 } 3546 3547 /* 3548 * Clear the balance status in fs_info and delete the balance item from disk. 3549 */ 3550 static void reset_balance_state(struct btrfs_fs_info *fs_info) 3551 { 3552 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3553 int ret; 3554 3555 BUG_ON(!fs_info->balance_ctl); 3556 3557 spin_lock(&fs_info->balance_lock); 3558 fs_info->balance_ctl = NULL; 3559 spin_unlock(&fs_info->balance_lock); 3560 3561 kfree(bctl); 3562 ret = del_balance_item(fs_info); 3563 if (ret) 3564 btrfs_handle_fs_error(fs_info, ret, NULL); 3565 } 3566 3567 /* 3568 * Balance filters. Return 1 if chunk should be filtered out 3569 * (should not be balanced). 3570 */ 3571 static int chunk_profiles_filter(u64 chunk_type, 3572 struct btrfs_balance_args *bargs) 3573 { 3574 chunk_type = chunk_to_extended(chunk_type) & 3575 BTRFS_EXTENDED_PROFILE_MASK; 3576 3577 if (bargs->profiles & chunk_type) 3578 return 0; 3579 3580 return 1; 3581 } 3582 3583 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3584 struct btrfs_balance_args *bargs) 3585 { 3586 struct btrfs_block_group *cache; 3587 u64 chunk_used; 3588 u64 user_thresh_min; 3589 u64 user_thresh_max; 3590 int ret = 1; 3591 3592 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3593 chunk_used = cache->used; 3594 3595 if (bargs->usage_min == 0) 3596 user_thresh_min = 0; 3597 else 3598 user_thresh_min = mult_perc(cache->length, bargs->usage_min); 3599 3600 if (bargs->usage_max == 0) 3601 user_thresh_max = 1; 3602 else if (bargs->usage_max > 100) 3603 user_thresh_max = cache->length; 3604 else 3605 user_thresh_max = mult_perc(cache->length, bargs->usage_max); 3606 3607 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3608 ret = 0; 3609 3610 btrfs_put_block_group(cache); 3611 return ret; 3612 } 3613 3614 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3615 u64 chunk_offset, struct btrfs_balance_args *bargs) 3616 { 3617 struct btrfs_block_group *cache; 3618 u64 chunk_used, user_thresh; 3619 int ret = 1; 3620 3621 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3622 chunk_used = cache->used; 3623 3624 if (bargs->usage_min == 0) 3625 user_thresh = 1; 3626 else if (bargs->usage > 100) 3627 user_thresh = cache->length; 3628 else 3629 user_thresh = mult_perc(cache->length, bargs->usage); 3630 3631 if (chunk_used < user_thresh) 3632 ret = 0; 3633 3634 btrfs_put_block_group(cache); 3635 return ret; 3636 } 3637 3638 static int chunk_devid_filter(struct extent_buffer *leaf, 3639 struct btrfs_chunk *chunk, 3640 struct btrfs_balance_args *bargs) 3641 { 3642 struct btrfs_stripe *stripe; 3643 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3644 int i; 3645 3646 for (i = 0; i < num_stripes; i++) { 3647 stripe = btrfs_stripe_nr(chunk, i); 3648 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3649 return 0; 3650 } 3651 3652 return 1; 3653 } 3654 3655 static u64 calc_data_stripes(u64 type, int num_stripes) 3656 { 3657 const int index = btrfs_bg_flags_to_raid_index(type); 3658 const int ncopies = btrfs_raid_array[index].ncopies; 3659 const int nparity = btrfs_raid_array[index].nparity; 3660 3661 return (num_stripes - nparity) / ncopies; 3662 } 3663 3664 /* [pstart, pend) */ 3665 static int chunk_drange_filter(struct extent_buffer *leaf, 3666 struct btrfs_chunk *chunk, 3667 struct btrfs_balance_args *bargs) 3668 { 3669 struct btrfs_stripe *stripe; 3670 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3671 u64 stripe_offset; 3672 u64 stripe_length; 3673 u64 type; 3674 int factor; 3675 int i; 3676 3677 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3678 return 0; 3679 3680 type = btrfs_chunk_type(leaf, chunk); 3681 factor = calc_data_stripes(type, num_stripes); 3682 3683 for (i = 0; i < num_stripes; i++) { 3684 stripe = btrfs_stripe_nr(chunk, i); 3685 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3686 continue; 3687 3688 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3689 stripe_length = btrfs_chunk_length(leaf, chunk); 3690 stripe_length = div_u64(stripe_length, factor); 3691 3692 if (stripe_offset < bargs->pend && 3693 stripe_offset + stripe_length > bargs->pstart) 3694 return 0; 3695 } 3696 3697 return 1; 3698 } 3699 3700 /* [vstart, vend) */ 3701 static int chunk_vrange_filter(struct extent_buffer *leaf, 3702 struct btrfs_chunk *chunk, 3703 u64 chunk_offset, 3704 struct btrfs_balance_args *bargs) 3705 { 3706 if (chunk_offset < bargs->vend && 3707 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3708 /* at least part of the chunk is inside this vrange */ 3709 return 0; 3710 3711 return 1; 3712 } 3713 3714 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3715 struct btrfs_chunk *chunk, 3716 struct btrfs_balance_args *bargs) 3717 { 3718 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3719 3720 if (bargs->stripes_min <= num_stripes 3721 && num_stripes <= bargs->stripes_max) 3722 return 0; 3723 3724 return 1; 3725 } 3726 3727 static int chunk_soft_convert_filter(u64 chunk_type, 3728 struct btrfs_balance_args *bargs) 3729 { 3730 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3731 return 0; 3732 3733 chunk_type = chunk_to_extended(chunk_type) & 3734 BTRFS_EXTENDED_PROFILE_MASK; 3735 3736 if (bargs->target == chunk_type) 3737 return 1; 3738 3739 return 0; 3740 } 3741 3742 static int should_balance_chunk(struct extent_buffer *leaf, 3743 struct btrfs_chunk *chunk, u64 chunk_offset) 3744 { 3745 struct btrfs_fs_info *fs_info = leaf->fs_info; 3746 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3747 struct btrfs_balance_args *bargs = NULL; 3748 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3749 3750 /* type filter */ 3751 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3752 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3753 return 0; 3754 } 3755 3756 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3757 bargs = &bctl->data; 3758 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3759 bargs = &bctl->sys; 3760 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3761 bargs = &bctl->meta; 3762 3763 /* profiles filter */ 3764 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3765 chunk_profiles_filter(chunk_type, bargs)) { 3766 return 0; 3767 } 3768 3769 /* usage filter */ 3770 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3771 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3772 return 0; 3773 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3774 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3775 return 0; 3776 } 3777 3778 /* devid filter */ 3779 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3780 chunk_devid_filter(leaf, chunk, bargs)) { 3781 return 0; 3782 } 3783 3784 /* drange filter, makes sense only with devid filter */ 3785 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3786 chunk_drange_filter(leaf, chunk, bargs)) { 3787 return 0; 3788 } 3789 3790 /* vrange filter */ 3791 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3792 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3793 return 0; 3794 } 3795 3796 /* stripes filter */ 3797 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3798 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3799 return 0; 3800 } 3801 3802 /* soft profile changing mode */ 3803 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3804 chunk_soft_convert_filter(chunk_type, bargs)) { 3805 return 0; 3806 } 3807 3808 /* 3809 * limited by count, must be the last filter 3810 */ 3811 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3812 if (bargs->limit == 0) 3813 return 0; 3814 else 3815 bargs->limit--; 3816 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3817 /* 3818 * Same logic as the 'limit' filter; the minimum cannot be 3819 * determined here because we do not have the global information 3820 * about the count of all chunks that satisfy the filters. 3821 */ 3822 if (bargs->limit_max == 0) 3823 return 0; 3824 else 3825 bargs->limit_max--; 3826 } 3827 3828 return 1; 3829 } 3830 3831 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 3832 { 3833 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3834 struct btrfs_root *chunk_root = fs_info->chunk_root; 3835 u64 chunk_type; 3836 struct btrfs_chunk *chunk; 3837 struct btrfs_path *path = NULL; 3838 struct btrfs_key key; 3839 struct btrfs_key found_key; 3840 struct extent_buffer *leaf; 3841 int slot; 3842 int ret; 3843 int enospc_errors = 0; 3844 bool counting = true; 3845 /* The single value limit and min/max limits use the same bytes in the */ 3846 u64 limit_data = bctl->data.limit; 3847 u64 limit_meta = bctl->meta.limit; 3848 u64 limit_sys = bctl->sys.limit; 3849 u32 count_data = 0; 3850 u32 count_meta = 0; 3851 u32 count_sys = 0; 3852 int chunk_reserved = 0; 3853 3854 path = btrfs_alloc_path(); 3855 if (!path) { 3856 ret = -ENOMEM; 3857 goto error; 3858 } 3859 3860 /* zero out stat counters */ 3861 spin_lock(&fs_info->balance_lock); 3862 memset(&bctl->stat, 0, sizeof(bctl->stat)); 3863 spin_unlock(&fs_info->balance_lock); 3864 again: 3865 if (!counting) { 3866 /* 3867 * The single value limit and min/max limits use the same bytes 3868 * in the 3869 */ 3870 bctl->data.limit = limit_data; 3871 bctl->meta.limit = limit_meta; 3872 bctl->sys.limit = limit_sys; 3873 } 3874 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3875 key.offset = (u64)-1; 3876 key.type = BTRFS_CHUNK_ITEM_KEY; 3877 3878 while (1) { 3879 if ((!counting && atomic_read(&fs_info->balance_pause_req)) || 3880 atomic_read(&fs_info->balance_cancel_req)) { 3881 ret = -ECANCELED; 3882 goto error; 3883 } 3884 3885 mutex_lock(&fs_info->reclaim_bgs_lock); 3886 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3887 if (ret < 0) { 3888 mutex_unlock(&fs_info->reclaim_bgs_lock); 3889 goto error; 3890 } 3891 3892 /* 3893 * this shouldn't happen, it means the last relocate 3894 * failed 3895 */ 3896 if (ret == 0) 3897 BUG(); /* FIXME break ? */ 3898 3899 ret = btrfs_previous_item(chunk_root, path, 0, 3900 BTRFS_CHUNK_ITEM_KEY); 3901 if (ret) { 3902 mutex_unlock(&fs_info->reclaim_bgs_lock); 3903 ret = 0; 3904 break; 3905 } 3906 3907 leaf = path->nodes[0]; 3908 slot = path->slots[0]; 3909 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3910 3911 if (found_key.objectid != key.objectid) { 3912 mutex_unlock(&fs_info->reclaim_bgs_lock); 3913 break; 3914 } 3915 3916 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3917 chunk_type = btrfs_chunk_type(leaf, chunk); 3918 3919 if (!counting) { 3920 spin_lock(&fs_info->balance_lock); 3921 bctl->stat.considered++; 3922 spin_unlock(&fs_info->balance_lock); 3923 } 3924 3925 ret = should_balance_chunk(leaf, chunk, found_key.offset); 3926 3927 btrfs_release_path(path); 3928 if (!ret) { 3929 mutex_unlock(&fs_info->reclaim_bgs_lock); 3930 goto loop; 3931 } 3932 3933 if (counting) { 3934 mutex_unlock(&fs_info->reclaim_bgs_lock); 3935 spin_lock(&fs_info->balance_lock); 3936 bctl->stat.expected++; 3937 spin_unlock(&fs_info->balance_lock); 3938 3939 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3940 count_data++; 3941 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3942 count_sys++; 3943 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3944 count_meta++; 3945 3946 goto loop; 3947 } 3948 3949 /* 3950 * Apply limit_min filter, no need to check if the LIMITS 3951 * filter is used, limit_min is 0 by default 3952 */ 3953 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 3954 count_data < bctl->data.limit_min) 3955 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 3956 count_meta < bctl->meta.limit_min) 3957 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 3958 count_sys < bctl->sys.limit_min)) { 3959 mutex_unlock(&fs_info->reclaim_bgs_lock); 3960 goto loop; 3961 } 3962 3963 if (!chunk_reserved) { 3964 /* 3965 * We may be relocating the only data chunk we have, 3966 * which could potentially end up with losing data's 3967 * raid profile, so lets allocate an empty one in 3968 * advance. 3969 */ 3970 ret = btrfs_may_alloc_data_chunk(fs_info, 3971 found_key.offset); 3972 if (ret < 0) { 3973 mutex_unlock(&fs_info->reclaim_bgs_lock); 3974 goto error; 3975 } else if (ret == 1) { 3976 chunk_reserved = 1; 3977 } 3978 } 3979 3980 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3981 mutex_unlock(&fs_info->reclaim_bgs_lock); 3982 if (ret == -ENOSPC) { 3983 enospc_errors++; 3984 } else if (ret == -ETXTBSY) { 3985 btrfs_info(fs_info, 3986 "skipping relocation of block group %llu due to active swapfile", 3987 found_key.offset); 3988 ret = 0; 3989 } else if (ret) { 3990 goto error; 3991 } else { 3992 spin_lock(&fs_info->balance_lock); 3993 bctl->stat.completed++; 3994 spin_unlock(&fs_info->balance_lock); 3995 } 3996 loop: 3997 if (found_key.offset == 0) 3998 break; 3999 key.offset = found_key.offset - 1; 4000 } 4001 4002 if (counting) { 4003 btrfs_release_path(path); 4004 counting = false; 4005 goto again; 4006 } 4007 error: 4008 btrfs_free_path(path); 4009 if (enospc_errors) { 4010 btrfs_info(fs_info, "%d enospc errors during balance", 4011 enospc_errors); 4012 if (!ret) 4013 ret = -ENOSPC; 4014 } 4015 4016 return ret; 4017 } 4018 4019 /* 4020 * See if a given profile is valid and reduced. 4021 * 4022 * @flags: profile to validate 4023 * @extended: if true @flags is treated as an extended profile 4024 */ 4025 static int alloc_profile_is_valid(u64 flags, int extended) 4026 { 4027 u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK : 4028 BTRFS_BLOCK_GROUP_PROFILE_MASK); 4029 4030 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 4031 4032 /* 1) check that all other bits are zeroed */ 4033 if (flags & ~mask) 4034 return 0; 4035 4036 /* 2) see if profile is reduced */ 4037 if (flags == 0) 4038 return !extended; /* "0" is valid for usual profiles */ 4039 4040 return has_single_bit_set(flags); 4041 } 4042 4043 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 4044 { 4045 /* cancel requested || normal exit path */ 4046 return atomic_read(&fs_info->balance_cancel_req) || 4047 (atomic_read(&fs_info->balance_pause_req) == 0 && 4048 atomic_read(&fs_info->balance_cancel_req) == 0); 4049 } 4050 4051 /* 4052 * Validate target profile against allowed profiles and return true if it's OK. 4053 * Otherwise print the error message and return false. 4054 */ 4055 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 4056 const struct btrfs_balance_args *bargs, 4057 u64 allowed, const char *type) 4058 { 4059 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 4060 return true; 4061 4062 /* Profile is valid and does not have bits outside of the allowed set */ 4063 if (alloc_profile_is_valid(bargs->target, 1) && 4064 (bargs->target & ~allowed) == 0) 4065 return true; 4066 4067 btrfs_err(fs_info, "balance: invalid convert %s profile %s", 4068 type, btrfs_bg_type_to_raid_name(bargs->target)); 4069 return false; 4070 } 4071 4072 /* 4073 * Fill @buf with textual description of balance filter flags @bargs, up to 4074 * @size_buf including the terminating null. The output may be trimmed if it 4075 * does not fit into the provided buffer. 4076 */ 4077 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 4078 u32 size_buf) 4079 { 4080 int ret; 4081 u32 size_bp = size_buf; 4082 char *bp = buf; 4083 u64 flags = bargs->flags; 4084 char tmp_buf[128] = {'\0'}; 4085 4086 if (!flags) 4087 return; 4088 4089 #define CHECK_APPEND_NOARG(a) \ 4090 do { \ 4091 ret = snprintf(bp, size_bp, (a)); \ 4092 if (ret < 0 || ret >= size_bp) \ 4093 goto out_overflow; \ 4094 size_bp -= ret; \ 4095 bp += ret; \ 4096 } while (0) 4097 4098 #define CHECK_APPEND_1ARG(a, v1) \ 4099 do { \ 4100 ret = snprintf(bp, size_bp, (a), (v1)); \ 4101 if (ret < 0 || ret >= size_bp) \ 4102 goto out_overflow; \ 4103 size_bp -= ret; \ 4104 bp += ret; \ 4105 } while (0) 4106 4107 #define CHECK_APPEND_2ARG(a, v1, v2) \ 4108 do { \ 4109 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 4110 if (ret < 0 || ret >= size_bp) \ 4111 goto out_overflow; \ 4112 size_bp -= ret; \ 4113 bp += ret; \ 4114 } while (0) 4115 4116 if (flags & BTRFS_BALANCE_ARGS_CONVERT) 4117 CHECK_APPEND_1ARG("convert=%s,", 4118 btrfs_bg_type_to_raid_name(bargs->target)); 4119 4120 if (flags & BTRFS_BALANCE_ARGS_SOFT) 4121 CHECK_APPEND_NOARG("soft,"); 4122 4123 if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 4124 btrfs_describe_block_groups(bargs->profiles, tmp_buf, 4125 sizeof(tmp_buf)); 4126 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 4127 } 4128 4129 if (flags & BTRFS_BALANCE_ARGS_USAGE) 4130 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 4131 4132 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 4133 CHECK_APPEND_2ARG("usage=%u..%u,", 4134 bargs->usage_min, bargs->usage_max); 4135 4136 if (flags & BTRFS_BALANCE_ARGS_DEVID) 4137 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 4138 4139 if (flags & BTRFS_BALANCE_ARGS_DRANGE) 4140 CHECK_APPEND_2ARG("drange=%llu..%llu,", 4141 bargs->pstart, bargs->pend); 4142 4143 if (flags & BTRFS_BALANCE_ARGS_VRANGE) 4144 CHECK_APPEND_2ARG("vrange=%llu..%llu,", 4145 bargs->vstart, bargs->vend); 4146 4147 if (flags & BTRFS_BALANCE_ARGS_LIMIT) 4148 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 4149 4150 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 4151 CHECK_APPEND_2ARG("limit=%u..%u,", 4152 bargs->limit_min, bargs->limit_max); 4153 4154 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 4155 CHECK_APPEND_2ARG("stripes=%u..%u,", 4156 bargs->stripes_min, bargs->stripes_max); 4157 4158 #undef CHECK_APPEND_2ARG 4159 #undef CHECK_APPEND_1ARG 4160 #undef CHECK_APPEND_NOARG 4161 4162 out_overflow: 4163 4164 if (size_bp < size_buf) 4165 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 4166 else 4167 buf[0] = '\0'; 4168 } 4169 4170 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 4171 { 4172 u32 size_buf = 1024; 4173 char tmp_buf[192] = {'\0'}; 4174 char *buf; 4175 char *bp; 4176 u32 size_bp = size_buf; 4177 int ret; 4178 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4179 4180 buf = kzalloc(size_buf, GFP_KERNEL); 4181 if (!buf) 4182 return; 4183 4184 bp = buf; 4185 4186 #define CHECK_APPEND_1ARG(a, v1) \ 4187 do { \ 4188 ret = snprintf(bp, size_bp, (a), (v1)); \ 4189 if (ret < 0 || ret >= size_bp) \ 4190 goto out_overflow; \ 4191 size_bp -= ret; \ 4192 bp += ret; \ 4193 } while (0) 4194 4195 if (bctl->flags & BTRFS_BALANCE_FORCE) 4196 CHECK_APPEND_1ARG("%s", "-f "); 4197 4198 if (bctl->flags & BTRFS_BALANCE_DATA) { 4199 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 4200 CHECK_APPEND_1ARG("-d%s ", tmp_buf); 4201 } 4202 4203 if (bctl->flags & BTRFS_BALANCE_METADATA) { 4204 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf)); 4205 CHECK_APPEND_1ARG("-m%s ", tmp_buf); 4206 } 4207 4208 if (bctl->flags & BTRFS_BALANCE_SYSTEM) { 4209 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf)); 4210 CHECK_APPEND_1ARG("-s%s ", tmp_buf); 4211 } 4212 4213 #undef CHECK_APPEND_1ARG 4214 4215 out_overflow: 4216 4217 if (size_bp < size_buf) 4218 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */ 4219 btrfs_info(fs_info, "balance: %s %s", 4220 (bctl->flags & BTRFS_BALANCE_RESUME) ? 4221 "resume" : "start", buf); 4222 4223 kfree(buf); 4224 } 4225 4226 /* 4227 * Should be called with balance mutexe held 4228 */ 4229 int btrfs_balance(struct btrfs_fs_info *fs_info, 4230 struct btrfs_balance_control *bctl, 4231 struct btrfs_ioctl_balance_args *bargs) 4232 { 4233 u64 meta_target, data_target; 4234 u64 allowed; 4235 int mixed = 0; 4236 int ret; 4237 u64 num_devices; 4238 unsigned seq; 4239 bool reducing_redundancy; 4240 int i; 4241 4242 if (btrfs_fs_closing(fs_info) || 4243 atomic_read(&fs_info->balance_pause_req) || 4244 btrfs_should_cancel_balance(fs_info)) { 4245 ret = -EINVAL; 4246 goto out; 4247 } 4248 4249 allowed = btrfs_super_incompat_flags(fs_info->super_copy); 4250 if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 4251 mixed = 1; 4252 4253 /* 4254 * In case of mixed groups both data and meta should be picked, 4255 * and identical options should be given for both of them. 4256 */ 4257 allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA; 4258 if (mixed && (bctl->flags & allowed)) { 4259 if (!(bctl->flags & BTRFS_BALANCE_DATA) || 4260 !(bctl->flags & BTRFS_BALANCE_METADATA) || 4261 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) { 4262 btrfs_err(fs_info, 4263 "balance: mixed groups data and metadata options must be the same"); 4264 ret = -EINVAL; 4265 goto out; 4266 } 4267 } 4268 4269 /* 4270 * rw_devices will not change at the moment, device add/delete/replace 4271 * are exclusive 4272 */ 4273 num_devices = fs_info->fs_devices->rw_devices; 4274 4275 /* 4276 * SINGLE profile on-disk has no profile bit, but in-memory we have a 4277 * special bit for it, to make it easier to distinguish. Thus we need 4278 * to set it manually, or balance would refuse the profile. 4279 */ 4280 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 4281 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4282 if (num_devices >= btrfs_raid_array[i].devs_min) 4283 allowed |= btrfs_raid_array[i].bg_flag; 4284 4285 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 4286 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 4287 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 4288 ret = -EINVAL; 4289 goto out; 4290 } 4291 4292 /* 4293 * Allow to reduce metadata or system integrity only if force set for 4294 * profiles with redundancy (copies, parity) 4295 */ 4296 allowed = 0; 4297 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 4298 if (btrfs_raid_array[i].ncopies >= 2 || 4299 btrfs_raid_array[i].tolerated_failures >= 1) 4300 allowed |= btrfs_raid_array[i].bg_flag; 4301 } 4302 do { 4303 seq = read_seqbegin(&fs_info->profiles_lock); 4304 4305 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4306 (fs_info->avail_system_alloc_bits & allowed) && 4307 !(bctl->sys.target & allowed)) || 4308 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4309 (fs_info->avail_metadata_alloc_bits & allowed) && 4310 !(bctl->meta.target & allowed))) 4311 reducing_redundancy = true; 4312 else 4313 reducing_redundancy = false; 4314 4315 /* if we're not converting, the target field is uninitialized */ 4316 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4317 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 4318 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4319 bctl->data.target : fs_info->avail_data_alloc_bits; 4320 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4321 4322 if (reducing_redundancy) { 4323 if (bctl->flags & BTRFS_BALANCE_FORCE) { 4324 btrfs_info(fs_info, 4325 "balance: force reducing metadata redundancy"); 4326 } else { 4327 btrfs_err(fs_info, 4328 "balance: reduces metadata redundancy, use --force if you want this"); 4329 ret = -EINVAL; 4330 goto out; 4331 } 4332 } 4333 4334 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4335 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4336 btrfs_warn(fs_info, 4337 "balance: metadata profile %s has lower redundancy than data profile %s", 4338 btrfs_bg_type_to_raid_name(meta_target), 4339 btrfs_bg_type_to_raid_name(data_target)); 4340 } 4341 4342 ret = insert_balance_item(fs_info, bctl); 4343 if (ret && ret != -EEXIST) 4344 goto out; 4345 4346 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 4347 BUG_ON(ret == -EEXIST); 4348 BUG_ON(fs_info->balance_ctl); 4349 spin_lock(&fs_info->balance_lock); 4350 fs_info->balance_ctl = bctl; 4351 spin_unlock(&fs_info->balance_lock); 4352 } else { 4353 BUG_ON(ret != -EEXIST); 4354 spin_lock(&fs_info->balance_lock); 4355 update_balance_args(bctl); 4356 spin_unlock(&fs_info->balance_lock); 4357 } 4358 4359 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4360 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4361 describe_balance_start_or_resume(fs_info); 4362 mutex_unlock(&fs_info->balance_mutex); 4363 4364 ret = __btrfs_balance(fs_info); 4365 4366 mutex_lock(&fs_info->balance_mutex); 4367 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { 4368 btrfs_info(fs_info, "balance: paused"); 4369 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); 4370 } 4371 /* 4372 * Balance can be canceled by: 4373 * 4374 * - Regular cancel request 4375 * Then ret == -ECANCELED and balance_cancel_req > 0 4376 * 4377 * - Fatal signal to "btrfs" process 4378 * Either the signal caught by wait_reserve_ticket() and callers 4379 * got -EINTR, or caught by btrfs_should_cancel_balance() and 4380 * got -ECANCELED. 4381 * Either way, in this case balance_cancel_req = 0, and 4382 * ret == -EINTR or ret == -ECANCELED. 4383 * 4384 * So here we only check the return value to catch canceled balance. 4385 */ 4386 else if (ret == -ECANCELED || ret == -EINTR) 4387 btrfs_info(fs_info, "balance: canceled"); 4388 else 4389 btrfs_info(fs_info, "balance: ended with status: %d", ret); 4390 4391 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4392 4393 if (bargs) { 4394 memset(bargs, 0, sizeof(*bargs)); 4395 btrfs_update_ioctl_balance_args(fs_info, bargs); 4396 } 4397 4398 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4399 balance_need_close(fs_info)) { 4400 reset_balance_state(fs_info); 4401 btrfs_exclop_finish(fs_info); 4402 } 4403 4404 wake_up(&fs_info->balance_wait_q); 4405 4406 return ret; 4407 out: 4408 if (bctl->flags & BTRFS_BALANCE_RESUME) 4409 reset_balance_state(fs_info); 4410 else 4411 kfree(bctl); 4412 btrfs_exclop_finish(fs_info); 4413 4414 return ret; 4415 } 4416 4417 static int balance_kthread(void *data) 4418 { 4419 struct btrfs_fs_info *fs_info = data; 4420 int ret = 0; 4421 4422 sb_start_write(fs_info->sb); 4423 mutex_lock(&fs_info->balance_mutex); 4424 if (fs_info->balance_ctl) 4425 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 4426 mutex_unlock(&fs_info->balance_mutex); 4427 sb_end_write(fs_info->sb); 4428 4429 return ret; 4430 } 4431 4432 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4433 { 4434 struct task_struct *tsk; 4435 4436 mutex_lock(&fs_info->balance_mutex); 4437 if (!fs_info->balance_ctl) { 4438 mutex_unlock(&fs_info->balance_mutex); 4439 return 0; 4440 } 4441 mutex_unlock(&fs_info->balance_mutex); 4442 4443 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4444 btrfs_info(fs_info, "balance: resume skipped"); 4445 return 0; 4446 } 4447 4448 spin_lock(&fs_info->super_lock); 4449 ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); 4450 fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; 4451 spin_unlock(&fs_info->super_lock); 4452 /* 4453 * A ro->rw remount sequence should continue with the paused balance 4454 * regardless of who pauses it, system or the user as of now, so set 4455 * the resume flag. 4456 */ 4457 spin_lock(&fs_info->balance_lock); 4458 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4459 spin_unlock(&fs_info->balance_lock); 4460 4461 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4462 return PTR_ERR_OR_ZERO(tsk); 4463 } 4464 4465 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4466 { 4467 struct btrfs_balance_control *bctl; 4468 struct btrfs_balance_item *item; 4469 struct btrfs_disk_balance_args disk_bargs; 4470 struct btrfs_path *path; 4471 struct extent_buffer *leaf; 4472 struct btrfs_key key; 4473 int ret; 4474 4475 path = btrfs_alloc_path(); 4476 if (!path) 4477 return -ENOMEM; 4478 4479 key.objectid = BTRFS_BALANCE_OBJECTID; 4480 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4481 key.offset = 0; 4482 4483 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4484 if (ret < 0) 4485 goto out; 4486 if (ret > 0) { /* ret = -ENOENT; */ 4487 ret = 0; 4488 goto out; 4489 } 4490 4491 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4492 if (!bctl) { 4493 ret = -ENOMEM; 4494 goto out; 4495 } 4496 4497 leaf = path->nodes[0]; 4498 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4499 4500 bctl->flags = btrfs_balance_flags(leaf, item); 4501 bctl->flags |= BTRFS_BALANCE_RESUME; 4502 4503 btrfs_balance_data(leaf, item, &disk_bargs); 4504 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4505 btrfs_balance_meta(leaf, item, &disk_bargs); 4506 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4507 btrfs_balance_sys(leaf, item, &disk_bargs); 4508 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4509 4510 /* 4511 * This should never happen, as the paused balance state is recovered 4512 * during mount without any chance of other exclusive ops to collide. 4513 * 4514 * This gives the exclusive op status to balance and keeps in paused 4515 * state until user intervention (cancel or umount). If the ownership 4516 * cannot be assigned, show a message but do not fail. The balance 4517 * is in a paused state and must have fs_info::balance_ctl properly 4518 * set up. 4519 */ 4520 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) 4521 btrfs_warn(fs_info, 4522 "balance: cannot set exclusive op status, resume manually"); 4523 4524 btrfs_release_path(path); 4525 4526 mutex_lock(&fs_info->balance_mutex); 4527 BUG_ON(fs_info->balance_ctl); 4528 spin_lock(&fs_info->balance_lock); 4529 fs_info->balance_ctl = bctl; 4530 spin_unlock(&fs_info->balance_lock); 4531 mutex_unlock(&fs_info->balance_mutex); 4532 out: 4533 btrfs_free_path(path); 4534 return ret; 4535 } 4536 4537 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4538 { 4539 int ret = 0; 4540 4541 mutex_lock(&fs_info->balance_mutex); 4542 if (!fs_info->balance_ctl) { 4543 mutex_unlock(&fs_info->balance_mutex); 4544 return -ENOTCONN; 4545 } 4546 4547 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4548 atomic_inc(&fs_info->balance_pause_req); 4549 mutex_unlock(&fs_info->balance_mutex); 4550 4551 wait_event(fs_info->balance_wait_q, 4552 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4553 4554 mutex_lock(&fs_info->balance_mutex); 4555 /* we are good with balance_ctl ripped off from under us */ 4556 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4557 atomic_dec(&fs_info->balance_pause_req); 4558 } else { 4559 ret = -ENOTCONN; 4560 } 4561 4562 mutex_unlock(&fs_info->balance_mutex); 4563 return ret; 4564 } 4565 4566 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4567 { 4568 mutex_lock(&fs_info->balance_mutex); 4569 if (!fs_info->balance_ctl) { 4570 mutex_unlock(&fs_info->balance_mutex); 4571 return -ENOTCONN; 4572 } 4573 4574 /* 4575 * A paused balance with the item stored on disk can be resumed at 4576 * mount time if the mount is read-write. Otherwise it's still paused 4577 * and we must not allow cancelling as it deletes the item. 4578 */ 4579 if (sb_rdonly(fs_info->sb)) { 4580 mutex_unlock(&fs_info->balance_mutex); 4581 return -EROFS; 4582 } 4583 4584 atomic_inc(&fs_info->balance_cancel_req); 4585 /* 4586 * if we are running just wait and return, balance item is 4587 * deleted in btrfs_balance in this case 4588 */ 4589 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4590 mutex_unlock(&fs_info->balance_mutex); 4591 wait_event(fs_info->balance_wait_q, 4592 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4593 mutex_lock(&fs_info->balance_mutex); 4594 } else { 4595 mutex_unlock(&fs_info->balance_mutex); 4596 /* 4597 * Lock released to allow other waiters to continue, we'll 4598 * reexamine the status again. 4599 */ 4600 mutex_lock(&fs_info->balance_mutex); 4601 4602 if (fs_info->balance_ctl) { 4603 reset_balance_state(fs_info); 4604 btrfs_exclop_finish(fs_info); 4605 btrfs_info(fs_info, "balance: canceled"); 4606 } 4607 } 4608 4609 BUG_ON(fs_info->balance_ctl || 4610 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4611 atomic_dec(&fs_info->balance_cancel_req); 4612 mutex_unlock(&fs_info->balance_mutex); 4613 return 0; 4614 } 4615 4616 int btrfs_uuid_scan_kthread(void *data) 4617 { 4618 struct btrfs_fs_info *fs_info = data; 4619 struct btrfs_root *root = fs_info->tree_root; 4620 struct btrfs_key key; 4621 struct btrfs_path *path = NULL; 4622 int ret = 0; 4623 struct extent_buffer *eb; 4624 int slot; 4625 struct btrfs_root_item root_item; 4626 u32 item_size; 4627 struct btrfs_trans_handle *trans = NULL; 4628 bool closing = false; 4629 4630 path = btrfs_alloc_path(); 4631 if (!path) { 4632 ret = -ENOMEM; 4633 goto out; 4634 } 4635 4636 key.objectid = 0; 4637 key.type = BTRFS_ROOT_ITEM_KEY; 4638 key.offset = 0; 4639 4640 while (1) { 4641 if (btrfs_fs_closing(fs_info)) { 4642 closing = true; 4643 break; 4644 } 4645 ret = btrfs_search_forward(root, &key, path, 4646 BTRFS_OLDEST_GENERATION); 4647 if (ret) { 4648 if (ret > 0) 4649 ret = 0; 4650 break; 4651 } 4652 4653 if (key.type != BTRFS_ROOT_ITEM_KEY || 4654 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4655 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4656 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4657 goto skip; 4658 4659 eb = path->nodes[0]; 4660 slot = path->slots[0]; 4661 item_size = btrfs_item_size(eb, slot); 4662 if (item_size < sizeof(root_item)) 4663 goto skip; 4664 4665 read_extent_buffer(eb, &root_item, 4666 btrfs_item_ptr_offset(eb, slot), 4667 (int)sizeof(root_item)); 4668 if (btrfs_root_refs(&root_item) == 0) 4669 goto skip; 4670 4671 if (!btrfs_is_empty_uuid(root_item.uuid) || 4672 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4673 if (trans) 4674 goto update_tree; 4675 4676 btrfs_release_path(path); 4677 /* 4678 * 1 - subvol uuid item 4679 * 1 - received_subvol uuid item 4680 */ 4681 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4682 if (IS_ERR(trans)) { 4683 ret = PTR_ERR(trans); 4684 break; 4685 } 4686 continue; 4687 } else { 4688 goto skip; 4689 } 4690 update_tree: 4691 btrfs_release_path(path); 4692 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4693 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4694 BTRFS_UUID_KEY_SUBVOL, 4695 key.objectid); 4696 if (ret < 0) { 4697 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4698 ret); 4699 break; 4700 } 4701 } 4702 4703 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4704 ret = btrfs_uuid_tree_add(trans, 4705 root_item.received_uuid, 4706 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4707 key.objectid); 4708 if (ret < 0) { 4709 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4710 ret); 4711 break; 4712 } 4713 } 4714 4715 skip: 4716 btrfs_release_path(path); 4717 if (trans) { 4718 ret = btrfs_end_transaction(trans); 4719 trans = NULL; 4720 if (ret) 4721 break; 4722 } 4723 4724 if (key.offset < (u64)-1) { 4725 key.offset++; 4726 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4727 key.offset = 0; 4728 key.type = BTRFS_ROOT_ITEM_KEY; 4729 } else if (key.objectid < (u64)-1) { 4730 key.offset = 0; 4731 key.type = BTRFS_ROOT_ITEM_KEY; 4732 key.objectid++; 4733 } else { 4734 break; 4735 } 4736 cond_resched(); 4737 } 4738 4739 out: 4740 btrfs_free_path(path); 4741 if (trans && !IS_ERR(trans)) 4742 btrfs_end_transaction(trans); 4743 if (ret) 4744 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); 4745 else if (!closing) 4746 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4747 up(&fs_info->uuid_tree_rescan_sem); 4748 return 0; 4749 } 4750 4751 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4752 { 4753 struct btrfs_trans_handle *trans; 4754 struct btrfs_root *tree_root = fs_info->tree_root; 4755 struct btrfs_root *uuid_root; 4756 struct task_struct *task; 4757 int ret; 4758 4759 /* 4760 * 1 - root node 4761 * 1 - root item 4762 */ 4763 trans = btrfs_start_transaction(tree_root, 2); 4764 if (IS_ERR(trans)) 4765 return PTR_ERR(trans); 4766 4767 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); 4768 if (IS_ERR(uuid_root)) { 4769 ret = PTR_ERR(uuid_root); 4770 btrfs_abort_transaction(trans, ret); 4771 btrfs_end_transaction(trans); 4772 return ret; 4773 } 4774 4775 fs_info->uuid_root = uuid_root; 4776 4777 ret = btrfs_commit_transaction(trans); 4778 if (ret) 4779 return ret; 4780 4781 down(&fs_info->uuid_tree_rescan_sem); 4782 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4783 if (IS_ERR(task)) { 4784 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4785 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4786 up(&fs_info->uuid_tree_rescan_sem); 4787 return PTR_ERR(task); 4788 } 4789 4790 return 0; 4791 } 4792 4793 /* 4794 * shrinking a device means finding all of the device extents past 4795 * the new size, and then following the back refs to the chunks. 4796 * The chunk relocation code actually frees the device extent 4797 */ 4798 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4799 { 4800 struct btrfs_fs_info *fs_info = device->fs_info; 4801 struct btrfs_root *root = fs_info->dev_root; 4802 struct btrfs_trans_handle *trans; 4803 struct btrfs_dev_extent *dev_extent = NULL; 4804 struct btrfs_path *path; 4805 u64 length; 4806 u64 chunk_offset; 4807 int ret; 4808 int slot; 4809 int failed = 0; 4810 bool retried = false; 4811 struct extent_buffer *l; 4812 struct btrfs_key key; 4813 struct btrfs_super_block *super_copy = fs_info->super_copy; 4814 u64 old_total = btrfs_super_total_bytes(super_copy); 4815 u64 old_size = btrfs_device_get_total_bytes(device); 4816 u64 diff; 4817 u64 start; 4818 4819 new_size = round_down(new_size, fs_info->sectorsize); 4820 start = new_size; 4821 diff = round_down(old_size - new_size, fs_info->sectorsize); 4822 4823 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4824 return -EINVAL; 4825 4826 path = btrfs_alloc_path(); 4827 if (!path) 4828 return -ENOMEM; 4829 4830 path->reada = READA_BACK; 4831 4832 trans = btrfs_start_transaction(root, 0); 4833 if (IS_ERR(trans)) { 4834 btrfs_free_path(path); 4835 return PTR_ERR(trans); 4836 } 4837 4838 mutex_lock(&fs_info->chunk_mutex); 4839 4840 btrfs_device_set_total_bytes(device, new_size); 4841 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4842 device->fs_devices->total_rw_bytes -= diff; 4843 atomic64_sub(diff, &fs_info->free_chunk_space); 4844 } 4845 4846 /* 4847 * Once the device's size has been set to the new size, ensure all 4848 * in-memory chunks are synced to disk so that the loop below sees them 4849 * and relocates them accordingly. 4850 */ 4851 if (contains_pending_extent(device, &start, diff)) { 4852 mutex_unlock(&fs_info->chunk_mutex); 4853 ret = btrfs_commit_transaction(trans); 4854 if (ret) 4855 goto done; 4856 } else { 4857 mutex_unlock(&fs_info->chunk_mutex); 4858 btrfs_end_transaction(trans); 4859 } 4860 4861 again: 4862 key.objectid = device->devid; 4863 key.offset = (u64)-1; 4864 key.type = BTRFS_DEV_EXTENT_KEY; 4865 4866 do { 4867 mutex_lock(&fs_info->reclaim_bgs_lock); 4868 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4869 if (ret < 0) { 4870 mutex_unlock(&fs_info->reclaim_bgs_lock); 4871 goto done; 4872 } 4873 4874 ret = btrfs_previous_item(root, path, 0, key.type); 4875 if (ret) { 4876 mutex_unlock(&fs_info->reclaim_bgs_lock); 4877 if (ret < 0) 4878 goto done; 4879 ret = 0; 4880 btrfs_release_path(path); 4881 break; 4882 } 4883 4884 l = path->nodes[0]; 4885 slot = path->slots[0]; 4886 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4887 4888 if (key.objectid != device->devid) { 4889 mutex_unlock(&fs_info->reclaim_bgs_lock); 4890 btrfs_release_path(path); 4891 break; 4892 } 4893 4894 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4895 length = btrfs_dev_extent_length(l, dev_extent); 4896 4897 if (key.offset + length <= new_size) { 4898 mutex_unlock(&fs_info->reclaim_bgs_lock); 4899 btrfs_release_path(path); 4900 break; 4901 } 4902 4903 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4904 btrfs_release_path(path); 4905 4906 /* 4907 * We may be relocating the only data chunk we have, 4908 * which could potentially end up with losing data's 4909 * raid profile, so lets allocate an empty one in 4910 * advance. 4911 */ 4912 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4913 if (ret < 0) { 4914 mutex_unlock(&fs_info->reclaim_bgs_lock); 4915 goto done; 4916 } 4917 4918 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4919 mutex_unlock(&fs_info->reclaim_bgs_lock); 4920 if (ret == -ENOSPC) { 4921 failed++; 4922 } else if (ret) { 4923 if (ret == -ETXTBSY) { 4924 btrfs_warn(fs_info, 4925 "could not shrink block group %llu due to active swapfile", 4926 chunk_offset); 4927 } 4928 goto done; 4929 } 4930 } while (key.offset-- > 0); 4931 4932 if (failed && !retried) { 4933 failed = 0; 4934 retried = true; 4935 goto again; 4936 } else if (failed && retried) { 4937 ret = -ENOSPC; 4938 goto done; 4939 } 4940 4941 /* Shrinking succeeded, else we would be at "done". */ 4942 trans = btrfs_start_transaction(root, 0); 4943 if (IS_ERR(trans)) { 4944 ret = PTR_ERR(trans); 4945 goto done; 4946 } 4947 4948 mutex_lock(&fs_info->chunk_mutex); 4949 /* Clear all state bits beyond the shrunk device size */ 4950 clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 4951 CHUNK_STATE_MASK); 4952 4953 btrfs_device_set_disk_total_bytes(device, new_size); 4954 if (list_empty(&device->post_commit_list)) 4955 list_add_tail(&device->post_commit_list, 4956 &trans->transaction->dev_update_list); 4957 4958 WARN_ON(diff > old_total); 4959 btrfs_set_super_total_bytes(super_copy, 4960 round_down(old_total - diff, fs_info->sectorsize)); 4961 mutex_unlock(&fs_info->chunk_mutex); 4962 4963 btrfs_reserve_chunk_metadata(trans, false); 4964 /* Now btrfs_update_device() will change the on-disk size. */ 4965 ret = btrfs_update_device(trans, device); 4966 btrfs_trans_release_chunk_metadata(trans); 4967 if (ret < 0) { 4968 btrfs_abort_transaction(trans, ret); 4969 btrfs_end_transaction(trans); 4970 } else { 4971 ret = btrfs_commit_transaction(trans); 4972 } 4973 done: 4974 btrfs_free_path(path); 4975 if (ret) { 4976 mutex_lock(&fs_info->chunk_mutex); 4977 btrfs_device_set_total_bytes(device, old_size); 4978 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 4979 device->fs_devices->total_rw_bytes += diff; 4980 atomic64_add(diff, &fs_info->free_chunk_space); 4981 mutex_unlock(&fs_info->chunk_mutex); 4982 } 4983 return ret; 4984 } 4985 4986 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 4987 struct btrfs_key *key, 4988 struct btrfs_chunk *chunk, int item_size) 4989 { 4990 struct btrfs_super_block *super_copy = fs_info->super_copy; 4991 struct btrfs_disk_key disk_key; 4992 u32 array_size; 4993 u8 *ptr; 4994 4995 lockdep_assert_held(&fs_info->chunk_mutex); 4996 4997 array_size = btrfs_super_sys_array_size(super_copy); 4998 if (array_size + item_size + sizeof(disk_key) 4999 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 5000 return -EFBIG; 5001 5002 ptr = super_copy->sys_chunk_array + array_size; 5003 btrfs_cpu_key_to_disk(&disk_key, key); 5004 memcpy(ptr, &disk_key, sizeof(disk_key)); 5005 ptr += sizeof(disk_key); 5006 memcpy(ptr, chunk, item_size); 5007 item_size += sizeof(disk_key); 5008 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 5009 5010 return 0; 5011 } 5012 5013 /* 5014 * sort the devices in descending order by max_avail, total_avail 5015 */ 5016 static int btrfs_cmp_device_info(const void *a, const void *b) 5017 { 5018 const struct btrfs_device_info *di_a = a; 5019 const struct btrfs_device_info *di_b = b; 5020 5021 if (di_a->max_avail > di_b->max_avail) 5022 return -1; 5023 if (di_a->max_avail < di_b->max_avail) 5024 return 1; 5025 if (di_a->total_avail > di_b->total_avail) 5026 return -1; 5027 if (di_a->total_avail < di_b->total_avail) 5028 return 1; 5029 return 0; 5030 } 5031 5032 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 5033 { 5034 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 5035 return; 5036 5037 btrfs_set_fs_incompat(info, RAID56); 5038 } 5039 5040 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 5041 { 5042 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 5043 return; 5044 5045 btrfs_set_fs_incompat(info, RAID1C34); 5046 } 5047 5048 /* 5049 * Structure used internally for btrfs_create_chunk() function. 5050 * Wraps needed parameters. 5051 */ 5052 struct alloc_chunk_ctl { 5053 u64 start; 5054 u64 type; 5055 /* Total number of stripes to allocate */ 5056 int num_stripes; 5057 /* sub_stripes info for map */ 5058 int sub_stripes; 5059 /* Stripes per device */ 5060 int dev_stripes; 5061 /* Maximum number of devices to use */ 5062 int devs_max; 5063 /* Minimum number of devices to use */ 5064 int devs_min; 5065 /* ndevs has to be a multiple of this */ 5066 int devs_increment; 5067 /* Number of copies */ 5068 int ncopies; 5069 /* Number of stripes worth of bytes to store parity information */ 5070 int nparity; 5071 u64 max_stripe_size; 5072 u64 max_chunk_size; 5073 u64 dev_extent_min; 5074 u64 stripe_size; 5075 u64 chunk_size; 5076 int ndevs; 5077 }; 5078 5079 static void init_alloc_chunk_ctl_policy_regular( 5080 struct btrfs_fs_devices *fs_devices, 5081 struct alloc_chunk_ctl *ctl) 5082 { 5083 struct btrfs_space_info *space_info; 5084 5085 space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); 5086 ASSERT(space_info); 5087 5088 ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); 5089 ctl->max_stripe_size = ctl->max_chunk_size; 5090 5091 if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) 5092 ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); 5093 5094 /* We don't want a chunk larger than 10% of writable space */ 5095 ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), 5096 ctl->max_chunk_size); 5097 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; 5098 } 5099 5100 static void init_alloc_chunk_ctl_policy_zoned( 5101 struct btrfs_fs_devices *fs_devices, 5102 struct alloc_chunk_ctl *ctl) 5103 { 5104 u64 zone_size = fs_devices->fs_info->zone_size; 5105 u64 limit; 5106 int min_num_stripes = ctl->devs_min * ctl->dev_stripes; 5107 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; 5108 u64 min_chunk_size = min_data_stripes * zone_size; 5109 u64 type = ctl->type; 5110 5111 ctl->max_stripe_size = zone_size; 5112 if (type & BTRFS_BLOCK_GROUP_DATA) { 5113 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, 5114 zone_size); 5115 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5116 ctl->max_chunk_size = ctl->max_stripe_size; 5117 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5118 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5119 ctl->devs_max = min_t(int, ctl->devs_max, 5120 BTRFS_MAX_DEVS_SYS_CHUNK); 5121 } else { 5122 BUG(); 5123 } 5124 5125 /* We don't want a chunk larger than 10% of writable space */ 5126 limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10), 5127 zone_size), 5128 min_chunk_size); 5129 ctl->max_chunk_size = min(limit, ctl->max_chunk_size); 5130 ctl->dev_extent_min = zone_size * ctl->dev_stripes; 5131 } 5132 5133 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 5134 struct alloc_chunk_ctl *ctl) 5135 { 5136 int index = btrfs_bg_flags_to_raid_index(ctl->type); 5137 5138 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 5139 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 5140 ctl->devs_max = btrfs_raid_array[index].devs_max; 5141 if (!ctl->devs_max) 5142 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 5143 ctl->devs_min = btrfs_raid_array[index].devs_min; 5144 ctl->devs_increment = btrfs_raid_array[index].devs_increment; 5145 ctl->ncopies = btrfs_raid_array[index].ncopies; 5146 ctl->nparity = btrfs_raid_array[index].nparity; 5147 ctl->ndevs = 0; 5148 5149 switch (fs_devices->chunk_alloc_policy) { 5150 case BTRFS_CHUNK_ALLOC_REGULAR: 5151 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); 5152 break; 5153 case BTRFS_CHUNK_ALLOC_ZONED: 5154 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); 5155 break; 5156 default: 5157 BUG(); 5158 } 5159 } 5160 5161 static int gather_device_info(struct btrfs_fs_devices *fs_devices, 5162 struct alloc_chunk_ctl *ctl, 5163 struct btrfs_device_info *devices_info) 5164 { 5165 struct btrfs_fs_info *info = fs_devices->fs_info; 5166 struct btrfs_device *device; 5167 u64 total_avail; 5168 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 5169 int ret; 5170 int ndevs = 0; 5171 u64 max_avail; 5172 u64 dev_offset; 5173 5174 /* 5175 * in the first pass through the devices list, we gather information 5176 * about the available holes on each device. 5177 */ 5178 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 5179 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5180 WARN(1, KERN_ERR 5181 "BTRFS: read-only device in alloc_list\n"); 5182 continue; 5183 } 5184 5185 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 5186 &device->dev_state) || 5187 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 5188 continue; 5189 5190 if (device->total_bytes > device->bytes_used) 5191 total_avail = device->total_bytes - device->bytes_used; 5192 else 5193 total_avail = 0; 5194 5195 /* If there is no space on this device, skip it. */ 5196 if (total_avail < ctl->dev_extent_min) 5197 continue; 5198 5199 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 5200 &max_avail); 5201 if (ret && ret != -ENOSPC) 5202 return ret; 5203 5204 if (ret == 0) 5205 max_avail = dev_extent_want; 5206 5207 if (max_avail < ctl->dev_extent_min) { 5208 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5209 btrfs_debug(info, 5210 "%s: devid %llu has no free space, have=%llu want=%llu", 5211 __func__, device->devid, max_avail, 5212 ctl->dev_extent_min); 5213 continue; 5214 } 5215 5216 if (ndevs == fs_devices->rw_devices) { 5217 WARN(1, "%s: found more than %llu devices\n", 5218 __func__, fs_devices->rw_devices); 5219 break; 5220 } 5221 devices_info[ndevs].dev_offset = dev_offset; 5222 devices_info[ndevs].max_avail = max_avail; 5223 devices_info[ndevs].total_avail = total_avail; 5224 devices_info[ndevs].dev = device; 5225 ++ndevs; 5226 } 5227 ctl->ndevs = ndevs; 5228 5229 /* 5230 * now sort the devices by hole size / available space 5231 */ 5232 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 5233 btrfs_cmp_device_info, NULL); 5234 5235 return 0; 5236 } 5237 5238 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 5239 struct btrfs_device_info *devices_info) 5240 { 5241 /* Number of stripes that count for block group size */ 5242 int data_stripes; 5243 5244 /* 5245 * The primary goal is to maximize the number of stripes, so use as 5246 * many devices as possible, even if the stripes are not maximum sized. 5247 * 5248 * The DUP profile stores more than one stripe per device, the 5249 * max_avail is the total size so we have to adjust. 5250 */ 5251 ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail, 5252 ctl->dev_stripes); 5253 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5254 5255 /* This will have to be fixed for RAID1 and RAID10 over more drives */ 5256 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5257 5258 /* 5259 * Use the number of data stripes to figure out how big this chunk is 5260 * really going to be in terms of logical address space, and compare 5261 * that answer with the max chunk size. If it's higher, we try to 5262 * reduce stripe_size. 5263 */ 5264 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5265 /* 5266 * Reduce stripe_size, round it up to a 16MB boundary again and 5267 * then use it, unless it ends up being even bigger than the 5268 * previous value we had already. 5269 */ 5270 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size, 5271 data_stripes), SZ_16M), 5272 ctl->stripe_size); 5273 } 5274 5275 /* Stripe size should not go beyond 1G. */ 5276 ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G); 5277 5278 /* Align to BTRFS_STRIPE_LEN */ 5279 ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN); 5280 ctl->chunk_size = ctl->stripe_size * data_stripes; 5281 5282 return 0; 5283 } 5284 5285 static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, 5286 struct btrfs_device_info *devices_info) 5287 { 5288 u64 zone_size = devices_info[0].dev->zone_info->zone_size; 5289 /* Number of stripes that count for block group size */ 5290 int data_stripes; 5291 5292 /* 5293 * It should hold because: 5294 * dev_extent_min == dev_extent_want == zone_size * dev_stripes 5295 */ 5296 ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); 5297 5298 ctl->stripe_size = zone_size; 5299 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5300 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5301 5302 /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */ 5303 if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) { 5304 ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies, 5305 ctl->stripe_size) + ctl->nparity, 5306 ctl->dev_stripes); 5307 ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; 5308 data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; 5309 ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); 5310 } 5311 5312 ctl->chunk_size = ctl->stripe_size * data_stripes; 5313 5314 return 0; 5315 } 5316 5317 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, 5318 struct alloc_chunk_ctl *ctl, 5319 struct btrfs_device_info *devices_info) 5320 { 5321 struct btrfs_fs_info *info = fs_devices->fs_info; 5322 5323 /* 5324 * Round down to number of usable stripes, devs_increment can be any 5325 * number so we can't use round_down() that requires power of 2, while 5326 * rounddown is safe. 5327 */ 5328 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 5329 5330 if (ctl->ndevs < ctl->devs_min) { 5331 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 5332 btrfs_debug(info, 5333 "%s: not enough devices with free space: have=%d minimum required=%d", 5334 __func__, ctl->ndevs, ctl->devs_min); 5335 } 5336 return -ENOSPC; 5337 } 5338 5339 ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 5340 5341 switch (fs_devices->chunk_alloc_policy) { 5342 case BTRFS_CHUNK_ALLOC_REGULAR: 5343 return decide_stripe_size_regular(ctl, devices_info); 5344 case BTRFS_CHUNK_ALLOC_ZONED: 5345 return decide_stripe_size_zoned(ctl, devices_info); 5346 default: 5347 BUG(); 5348 } 5349 } 5350 5351 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, 5352 struct alloc_chunk_ctl *ctl, 5353 struct btrfs_device_info *devices_info) 5354 { 5355 struct btrfs_fs_info *info = trans->fs_info; 5356 struct map_lookup *map = NULL; 5357 struct extent_map_tree *em_tree; 5358 struct btrfs_block_group *block_group; 5359 struct extent_map *em; 5360 u64 start = ctl->start; 5361 u64 type = ctl->type; 5362 int ret; 5363 int i; 5364 int j; 5365 5366 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 5367 if (!map) 5368 return ERR_PTR(-ENOMEM); 5369 map->num_stripes = ctl->num_stripes; 5370 5371 for (i = 0; i < ctl->ndevs; ++i) { 5372 for (j = 0; j < ctl->dev_stripes; ++j) { 5373 int s = i * ctl->dev_stripes + j; 5374 map->stripes[s].dev = devices_info[i].dev; 5375 map->stripes[s].physical = devices_info[i].dev_offset + 5376 j * ctl->stripe_size; 5377 } 5378 } 5379 map->stripe_len = BTRFS_STRIPE_LEN; 5380 map->io_align = BTRFS_STRIPE_LEN; 5381 map->io_width = BTRFS_STRIPE_LEN; 5382 map->type = type; 5383 map->sub_stripes = ctl->sub_stripes; 5384 5385 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5386 5387 em = alloc_extent_map(); 5388 if (!em) { 5389 kfree(map); 5390 return ERR_PTR(-ENOMEM); 5391 } 5392 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5393 em->map_lookup = map; 5394 em->start = start; 5395 em->len = ctl->chunk_size; 5396 em->block_start = 0; 5397 em->block_len = em->len; 5398 em->orig_block_len = ctl->stripe_size; 5399 5400 em_tree = &info->mapping_tree; 5401 write_lock(&em_tree->lock); 5402 ret = add_extent_mapping(em_tree, em, 0); 5403 if (ret) { 5404 write_unlock(&em_tree->lock); 5405 free_extent_map(em); 5406 return ERR_PTR(ret); 5407 } 5408 write_unlock(&em_tree->lock); 5409 5410 block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); 5411 if (IS_ERR(block_group)) 5412 goto error_del_extent; 5413 5414 for (i = 0; i < map->num_stripes; i++) { 5415 struct btrfs_device *dev = map->stripes[i].dev; 5416 5417 btrfs_device_set_bytes_used(dev, 5418 dev->bytes_used + ctl->stripe_size); 5419 if (list_empty(&dev->post_commit_list)) 5420 list_add_tail(&dev->post_commit_list, 5421 &trans->transaction->dev_update_list); 5422 } 5423 5424 atomic64_sub(ctl->stripe_size * map->num_stripes, 5425 &info->free_chunk_space); 5426 5427 free_extent_map(em); 5428 check_raid56_incompat_flag(info, type); 5429 check_raid1c34_incompat_flag(info, type); 5430 5431 return block_group; 5432 5433 error_del_extent: 5434 write_lock(&em_tree->lock); 5435 remove_extent_mapping(em_tree, em); 5436 write_unlock(&em_tree->lock); 5437 5438 /* One for our allocation */ 5439 free_extent_map(em); 5440 /* One for the tree reference */ 5441 free_extent_map(em); 5442 5443 return block_group; 5444 } 5445 5446 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, 5447 u64 type) 5448 { 5449 struct btrfs_fs_info *info = trans->fs_info; 5450 struct btrfs_fs_devices *fs_devices = info->fs_devices; 5451 struct btrfs_device_info *devices_info = NULL; 5452 struct alloc_chunk_ctl ctl; 5453 struct btrfs_block_group *block_group; 5454 int ret; 5455 5456 lockdep_assert_held(&info->chunk_mutex); 5457 5458 if (!alloc_profile_is_valid(type, 0)) { 5459 ASSERT(0); 5460 return ERR_PTR(-EINVAL); 5461 } 5462 5463 if (list_empty(&fs_devices->alloc_list)) { 5464 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5465 btrfs_debug(info, "%s: no writable device", __func__); 5466 return ERR_PTR(-ENOSPC); 5467 } 5468 5469 if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { 5470 btrfs_err(info, "invalid chunk type 0x%llx requested", type); 5471 ASSERT(0); 5472 return ERR_PTR(-EINVAL); 5473 } 5474 5475 ctl.start = find_next_chunk(info); 5476 ctl.type = type; 5477 init_alloc_chunk_ctl(fs_devices, &ctl); 5478 5479 devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 5480 GFP_NOFS); 5481 if (!devices_info) 5482 return ERR_PTR(-ENOMEM); 5483 5484 ret = gather_device_info(fs_devices, &ctl, devices_info); 5485 if (ret < 0) { 5486 block_group = ERR_PTR(ret); 5487 goto out; 5488 } 5489 5490 ret = decide_stripe_size(fs_devices, &ctl, devices_info); 5491 if (ret < 0) { 5492 block_group = ERR_PTR(ret); 5493 goto out; 5494 } 5495 5496 block_group = create_chunk(trans, &ctl, devices_info); 5497 5498 out: 5499 kfree(devices_info); 5500 return block_group; 5501 } 5502 5503 /* 5504 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the 5505 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system 5506 * chunks. 5507 * 5508 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation 5509 * phases. 5510 */ 5511 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, 5512 struct btrfs_block_group *bg) 5513 { 5514 struct btrfs_fs_info *fs_info = trans->fs_info; 5515 struct btrfs_root *chunk_root = fs_info->chunk_root; 5516 struct btrfs_key key; 5517 struct btrfs_chunk *chunk; 5518 struct btrfs_stripe *stripe; 5519 struct extent_map *em; 5520 struct map_lookup *map; 5521 size_t item_size; 5522 int i; 5523 int ret; 5524 5525 /* 5526 * We take the chunk_mutex for 2 reasons: 5527 * 5528 * 1) Updates and insertions in the chunk btree must be done while holding 5529 * the chunk_mutex, as well as updating the system chunk array in the 5530 * superblock. See the comment on top of btrfs_chunk_alloc() for the 5531 * details; 5532 * 5533 * 2) To prevent races with the final phase of a device replace operation 5534 * that replaces the device object associated with the map's stripes, 5535 * because the device object's id can change at any time during that 5536 * final phase of the device replace operation 5537 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the 5538 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, 5539 * which would cause a failure when updating the device item, which does 5540 * not exists, or persisting a stripe of the chunk item with such ID. 5541 * Here we can't use the device_list_mutex because our caller already 5542 * has locked the chunk_mutex, and the final phase of device replace 5543 * acquires both mutexes - first the device_list_mutex and then the 5544 * chunk_mutex. Using any of those two mutexes protects us from a 5545 * concurrent device replace. 5546 */ 5547 lockdep_assert_held(&fs_info->chunk_mutex); 5548 5549 em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); 5550 if (IS_ERR(em)) { 5551 ret = PTR_ERR(em); 5552 btrfs_abort_transaction(trans, ret); 5553 return ret; 5554 } 5555 5556 map = em->map_lookup; 5557 item_size = btrfs_chunk_item_size(map->num_stripes); 5558 5559 chunk = kzalloc(item_size, GFP_NOFS); 5560 if (!chunk) { 5561 ret = -ENOMEM; 5562 btrfs_abort_transaction(trans, ret); 5563 goto out; 5564 } 5565 5566 for (i = 0; i < map->num_stripes; i++) { 5567 struct btrfs_device *device = map->stripes[i].dev; 5568 5569 ret = btrfs_update_device(trans, device); 5570 if (ret) 5571 goto out; 5572 } 5573 5574 stripe = &chunk->stripe; 5575 for (i = 0; i < map->num_stripes; i++) { 5576 struct btrfs_device *device = map->stripes[i].dev; 5577 const u64 dev_offset = map->stripes[i].physical; 5578 5579 btrfs_set_stack_stripe_devid(stripe, device->devid); 5580 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5581 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5582 stripe++; 5583 } 5584 5585 btrfs_set_stack_chunk_length(chunk, bg->length); 5586 btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); 5587 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5588 btrfs_set_stack_chunk_type(chunk, map->type); 5589 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5590 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5591 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5592 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5593 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5594 5595 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5596 key.type = BTRFS_CHUNK_ITEM_KEY; 5597 key.offset = bg->start; 5598 5599 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5600 if (ret) 5601 goto out; 5602 5603 set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags); 5604 5605 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5606 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5607 if (ret) 5608 goto out; 5609 } 5610 5611 out: 5612 kfree(chunk); 5613 free_extent_map(em); 5614 return ret; 5615 } 5616 5617 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 5618 { 5619 struct btrfs_fs_info *fs_info = trans->fs_info; 5620 u64 alloc_profile; 5621 struct btrfs_block_group *meta_bg; 5622 struct btrfs_block_group *sys_bg; 5623 5624 /* 5625 * When adding a new device for sprouting, the seed device is read-only 5626 * so we must first allocate a metadata and a system chunk. But before 5627 * adding the block group items to the extent, device and chunk btrees, 5628 * we must first: 5629 * 5630 * 1) Create both chunks without doing any changes to the btrees, as 5631 * otherwise we would get -ENOSPC since the block groups from the 5632 * seed device are read-only; 5633 * 5634 * 2) Add the device item for the new sprout device - finishing the setup 5635 * of a new block group requires updating the device item in the chunk 5636 * btree, so it must exist when we attempt to do it. The previous step 5637 * ensures this does not fail with -ENOSPC. 5638 * 5639 * After that we can add the block group items to their btrees: 5640 * update existing device item in the chunk btree, add a new block group 5641 * item to the extent btree, add a new chunk item to the chunk btree and 5642 * finally add the new device extent items to the devices btree. 5643 */ 5644 5645 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5646 meta_bg = btrfs_create_chunk(trans, alloc_profile); 5647 if (IS_ERR(meta_bg)) 5648 return PTR_ERR(meta_bg); 5649 5650 alloc_profile = btrfs_system_alloc_profile(fs_info); 5651 sys_bg = btrfs_create_chunk(trans, alloc_profile); 5652 if (IS_ERR(sys_bg)) 5653 return PTR_ERR(sys_bg); 5654 5655 return 0; 5656 } 5657 5658 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5659 { 5660 const int index = btrfs_bg_flags_to_raid_index(map->type); 5661 5662 return btrfs_raid_array[index].tolerated_failures; 5663 } 5664 5665 bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5666 { 5667 struct extent_map *em; 5668 struct map_lookup *map; 5669 int miss_ndevs = 0; 5670 int i; 5671 bool ret = true; 5672 5673 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5674 if (IS_ERR(em)) 5675 return false; 5676 5677 map = em->map_lookup; 5678 for (i = 0; i < map->num_stripes; i++) { 5679 if (test_bit(BTRFS_DEV_STATE_MISSING, 5680 &map->stripes[i].dev->dev_state)) { 5681 miss_ndevs++; 5682 continue; 5683 } 5684 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5685 &map->stripes[i].dev->dev_state)) { 5686 ret = false; 5687 goto end; 5688 } 5689 } 5690 5691 /* 5692 * If the number of missing devices is larger than max errors, we can 5693 * not write the data into that chunk successfully. 5694 */ 5695 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5696 ret = false; 5697 end: 5698 free_extent_map(em); 5699 return ret; 5700 } 5701 5702 void btrfs_mapping_tree_free(struct extent_map_tree *tree) 5703 { 5704 struct extent_map *em; 5705 5706 while (1) { 5707 write_lock(&tree->lock); 5708 em = lookup_extent_mapping(tree, 0, (u64)-1); 5709 if (em) 5710 remove_extent_mapping(tree, em); 5711 write_unlock(&tree->lock); 5712 if (!em) 5713 break; 5714 /* once for us */ 5715 free_extent_map(em); 5716 /* once for the tree */ 5717 free_extent_map(em); 5718 } 5719 } 5720 5721 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5722 { 5723 struct extent_map *em; 5724 struct map_lookup *map; 5725 enum btrfs_raid_types index; 5726 int ret = 1; 5727 5728 em = btrfs_get_chunk_map(fs_info, logical, len); 5729 if (IS_ERR(em)) 5730 /* 5731 * We could return errors for these cases, but that could get 5732 * ugly and we'd probably do the same thing which is just not do 5733 * anything else and exit, so return 1 so the callers don't try 5734 * to use other copies. 5735 */ 5736 return 1; 5737 5738 map = em->map_lookup; 5739 index = btrfs_bg_flags_to_raid_index(map->type); 5740 5741 /* Non-RAID56, use their ncopies from btrfs_raid_array. */ 5742 if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 5743 ret = btrfs_raid_array[index].ncopies; 5744 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5745 ret = 2; 5746 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5747 /* 5748 * There could be two corrupted data stripes, we need 5749 * to loop retry in order to rebuild the correct data. 5750 * 5751 * Fail a stripe at a time on every retry except the 5752 * stripe under reconstruction. 5753 */ 5754 ret = map->num_stripes; 5755 free_extent_map(em); 5756 5757 down_read(&fs_info->dev_replace.rwsem); 5758 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && 5759 fs_info->dev_replace.tgtdev) 5760 ret++; 5761 up_read(&fs_info->dev_replace.rwsem); 5762 5763 return ret; 5764 } 5765 5766 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, 5767 u64 logical) 5768 { 5769 struct extent_map *em; 5770 struct map_lookup *map; 5771 unsigned long len = fs_info->sectorsize; 5772 5773 if (!btrfs_fs_incompat(fs_info, RAID56)) 5774 return len; 5775 5776 em = btrfs_get_chunk_map(fs_info, logical, len); 5777 5778 if (!WARN_ON(IS_ERR(em))) { 5779 map = em->map_lookup; 5780 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5781 len = map->stripe_len * nr_data_stripes(map); 5782 free_extent_map(em); 5783 } 5784 return len; 5785 } 5786 5787 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5788 { 5789 struct extent_map *em; 5790 struct map_lookup *map; 5791 int ret = 0; 5792 5793 if (!btrfs_fs_incompat(fs_info, RAID56)) 5794 return 0; 5795 5796 em = btrfs_get_chunk_map(fs_info, logical, len); 5797 5798 if(!WARN_ON(IS_ERR(em))) { 5799 map = em->map_lookup; 5800 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 5801 ret = 1; 5802 free_extent_map(em); 5803 } 5804 return ret; 5805 } 5806 5807 static int find_live_mirror(struct btrfs_fs_info *fs_info, 5808 struct map_lookup *map, int first, 5809 int dev_replace_is_ongoing) 5810 { 5811 int i; 5812 int num_stripes; 5813 int preferred_mirror; 5814 int tolerance; 5815 struct btrfs_device *srcdev; 5816 5817 ASSERT((map->type & 5818 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); 5819 5820 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5821 num_stripes = map->sub_stripes; 5822 else 5823 num_stripes = map->num_stripes; 5824 5825 switch (fs_info->fs_devices->read_policy) { 5826 default: 5827 /* Shouldn't happen, just warn and use pid instead of failing */ 5828 btrfs_warn_rl(fs_info, 5829 "unknown read_policy type %u, reset to pid", 5830 fs_info->fs_devices->read_policy); 5831 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID; 5832 fallthrough; 5833 case BTRFS_READ_POLICY_PID: 5834 preferred_mirror = first + (current->pid % num_stripes); 5835 break; 5836 } 5837 5838 if (dev_replace_is_ongoing && 5839 fs_info->dev_replace.cont_reading_from_srcdev_mode == 5840 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID) 5841 srcdev = fs_info->dev_replace.srcdev; 5842 else 5843 srcdev = NULL; 5844 5845 /* 5846 * try to avoid the drive that is the source drive for a 5847 * dev-replace procedure, only choose it if no other non-missing 5848 * mirror is available 5849 */ 5850 for (tolerance = 0; tolerance < 2; tolerance++) { 5851 if (map->stripes[preferred_mirror].dev->bdev && 5852 (tolerance || map->stripes[preferred_mirror].dev != srcdev)) 5853 return preferred_mirror; 5854 for (i = first; i < first + num_stripes; i++) { 5855 if (map->stripes[i].dev->bdev && 5856 (tolerance || map->stripes[i].dev != srcdev)) 5857 return i; 5858 } 5859 } 5860 5861 /* we couldn't find one that doesn't fail. Just return something 5862 * and the io error handling code will clean up eventually 5863 */ 5864 return preferred_mirror; 5865 } 5866 5867 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 5868 static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) 5869 { 5870 int i; 5871 int again = 1; 5872 5873 while (again) { 5874 again = 0; 5875 for (i = 0; i < num_stripes - 1; i++) { 5876 /* Swap if parity is on a smaller index */ 5877 if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { 5878 swap(bioc->stripes[i], bioc->stripes[i + 1]); 5879 swap(bioc->raid_map[i], bioc->raid_map[i + 1]); 5880 again = 1; 5881 } 5882 } 5883 } 5884 } 5885 5886 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, 5887 int total_stripes, 5888 int real_stripes) 5889 { 5890 struct btrfs_io_context *bioc = kzalloc( 5891 /* The size of btrfs_io_context */ 5892 sizeof(struct btrfs_io_context) + 5893 /* Plus the variable array for the stripes */ 5894 sizeof(struct btrfs_io_stripe) * (total_stripes) + 5895 /* Plus the variable array for the tgt dev */ 5896 sizeof(int) * (real_stripes) + 5897 /* 5898 * Plus the raid_map, which includes both the tgt dev 5899 * and the stripes. 5900 */ 5901 sizeof(u64) * (total_stripes), 5902 GFP_NOFS); 5903 5904 if (!bioc) 5905 return NULL; 5906 5907 refcount_set(&bioc->refs, 1); 5908 5909 bioc->fs_info = fs_info; 5910 bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); 5911 bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); 5912 5913 return bioc; 5914 } 5915 5916 void btrfs_get_bioc(struct btrfs_io_context *bioc) 5917 { 5918 WARN_ON(!refcount_read(&bioc->refs)); 5919 refcount_inc(&bioc->refs); 5920 } 5921 5922 void btrfs_put_bioc(struct btrfs_io_context *bioc) 5923 { 5924 if (!bioc) 5925 return; 5926 if (refcount_dec_and_test(&bioc->refs)) 5927 kfree(bioc); 5928 } 5929 5930 /* 5931 * Please note that, discard won't be sent to target device of device 5932 * replace. 5933 */ 5934 struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, 5935 u64 logical, u64 *length_ret, 5936 u32 *num_stripes) 5937 { 5938 struct extent_map *em; 5939 struct map_lookup *map; 5940 struct btrfs_discard_stripe *stripes; 5941 u64 length = *length_ret; 5942 u64 offset; 5943 u64 stripe_nr; 5944 u64 stripe_nr_end; 5945 u64 stripe_end_offset; 5946 u64 stripe_cnt; 5947 u64 stripe_len; 5948 u64 stripe_offset; 5949 u32 stripe_index; 5950 u32 factor = 0; 5951 u32 sub_stripes = 0; 5952 u64 stripes_per_dev = 0; 5953 u32 remaining_stripes = 0; 5954 u32 last_stripe = 0; 5955 int ret; 5956 int i; 5957 5958 em = btrfs_get_chunk_map(fs_info, logical, length); 5959 if (IS_ERR(em)) 5960 return ERR_CAST(em); 5961 5962 map = em->map_lookup; 5963 5964 /* we don't discard raid56 yet */ 5965 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 5966 ret = -EOPNOTSUPP; 5967 goto out_free_map; 5968 } 5969 5970 offset = logical - em->start; 5971 length = min_t(u64, em->start + em->len - logical, length); 5972 *length_ret = length; 5973 5974 stripe_len = map->stripe_len; 5975 /* 5976 * stripe_nr counts the total number of stripes we have to stride 5977 * to get to this block 5978 */ 5979 stripe_nr = div64_u64(offset, stripe_len); 5980 5981 /* stripe_offset is the offset of this block in its stripe */ 5982 stripe_offset = offset - stripe_nr * stripe_len; 5983 5984 stripe_nr_end = round_up(offset + length, map->stripe_len); 5985 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 5986 stripe_cnt = stripe_nr_end - stripe_nr; 5987 stripe_end_offset = stripe_nr_end * map->stripe_len - 5988 (offset + length); 5989 /* 5990 * after this, stripe_nr is the number of stripes on this 5991 * device we have to walk to find the data, and stripe_index is 5992 * the number of our device in the stripe array 5993 */ 5994 *num_stripes = 1; 5995 stripe_index = 0; 5996 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 5997 BTRFS_BLOCK_GROUP_RAID10)) { 5998 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5999 sub_stripes = 1; 6000 else 6001 sub_stripes = map->sub_stripes; 6002 6003 factor = map->num_stripes / sub_stripes; 6004 *num_stripes = min_t(u64, map->num_stripes, 6005 sub_stripes * stripe_cnt); 6006 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6007 stripe_index *= sub_stripes; 6008 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 6009 &remaining_stripes); 6010 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 6011 last_stripe *= sub_stripes; 6012 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 6013 BTRFS_BLOCK_GROUP_DUP)) { 6014 *num_stripes = map->num_stripes; 6015 } else { 6016 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6017 &stripe_index); 6018 } 6019 6020 stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); 6021 if (!stripes) { 6022 ret = -ENOMEM; 6023 goto out_free_map; 6024 } 6025 6026 for (i = 0; i < *num_stripes; i++) { 6027 stripes[i].physical = 6028 map->stripes[stripe_index].physical + 6029 stripe_offset + stripe_nr * map->stripe_len; 6030 stripes[i].dev = map->stripes[stripe_index].dev; 6031 6032 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 6033 BTRFS_BLOCK_GROUP_RAID10)) { 6034 stripes[i].length = stripes_per_dev * map->stripe_len; 6035 6036 if (i / sub_stripes < remaining_stripes) 6037 stripes[i].length += map->stripe_len; 6038 6039 /* 6040 * Special for the first stripe and 6041 * the last stripe: 6042 * 6043 * |-------|...|-------| 6044 * |----------| 6045 * off end_off 6046 */ 6047 if (i < sub_stripes) 6048 stripes[i].length -= stripe_offset; 6049 6050 if (stripe_index >= last_stripe && 6051 stripe_index <= (last_stripe + 6052 sub_stripes - 1)) 6053 stripes[i].length -= stripe_end_offset; 6054 6055 if (i == sub_stripes - 1) 6056 stripe_offset = 0; 6057 } else { 6058 stripes[i].length = length; 6059 } 6060 6061 stripe_index++; 6062 if (stripe_index == map->num_stripes) { 6063 stripe_index = 0; 6064 stripe_nr++; 6065 } 6066 } 6067 6068 free_extent_map(em); 6069 return stripes; 6070 out_free_map: 6071 free_extent_map(em); 6072 return ERR_PTR(ret); 6073 } 6074 6075 /* 6076 * In dev-replace case, for repair case (that's the only case where the mirror 6077 * is selected explicitly when calling btrfs_map_block), blocks left of the 6078 * left cursor can also be read from the target drive. 6079 * 6080 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the 6081 * array of stripes. 6082 * For READ, it also needs to be supported using the same mirror number. 6083 * 6084 * If the requested block is not left of the left cursor, EIO is returned. This 6085 * can happen because btrfs_num_copies() returns one more in the dev-replace 6086 * case. 6087 */ 6088 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, 6089 u64 logical, u64 length, 6090 u64 srcdev_devid, int *mirror_num, 6091 u64 *physical) 6092 { 6093 struct btrfs_io_context *bioc = NULL; 6094 int num_stripes; 6095 int index_srcdev = 0; 6096 int found = 0; 6097 u64 physical_of_found = 0; 6098 int i; 6099 int ret = 0; 6100 6101 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, 6102 logical, &length, &bioc, NULL, NULL, 0); 6103 if (ret) { 6104 ASSERT(bioc == NULL); 6105 return ret; 6106 } 6107 6108 num_stripes = bioc->num_stripes; 6109 if (*mirror_num > num_stripes) { 6110 /* 6111 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, 6112 * that means that the requested area is not left of the left 6113 * cursor 6114 */ 6115 btrfs_put_bioc(bioc); 6116 return -EIO; 6117 } 6118 6119 /* 6120 * process the rest of the function using the mirror_num of the source 6121 * drive. Therefore look it up first. At the end, patch the device 6122 * pointer to the one of the target drive. 6123 */ 6124 for (i = 0; i < num_stripes; i++) { 6125 if (bioc->stripes[i].dev->devid != srcdev_devid) 6126 continue; 6127 6128 /* 6129 * In case of DUP, in order to keep it simple, only add the 6130 * mirror with the lowest physical address 6131 */ 6132 if (found && 6133 physical_of_found <= bioc->stripes[i].physical) 6134 continue; 6135 6136 index_srcdev = i; 6137 found = 1; 6138 physical_of_found = bioc->stripes[i].physical; 6139 } 6140 6141 btrfs_put_bioc(bioc); 6142 6143 ASSERT(found); 6144 if (!found) 6145 return -EIO; 6146 6147 *mirror_num = index_srcdev + 1; 6148 *physical = physical_of_found; 6149 return ret; 6150 } 6151 6152 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) 6153 { 6154 struct btrfs_block_group *cache; 6155 bool ret; 6156 6157 /* Non zoned filesystem does not use "to_copy" flag */ 6158 if (!btrfs_is_zoned(fs_info)) 6159 return false; 6160 6161 cache = btrfs_lookup_block_group(fs_info, logical); 6162 6163 ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags); 6164 6165 btrfs_put_block_group(cache); 6166 return ret; 6167 } 6168 6169 static void handle_ops_on_dev_replace(enum btrfs_map_op op, 6170 struct btrfs_io_context **bioc_ret, 6171 struct btrfs_dev_replace *dev_replace, 6172 u64 logical, 6173 int *num_stripes_ret, int *max_errors_ret) 6174 { 6175 struct btrfs_io_context *bioc = *bioc_ret; 6176 u64 srcdev_devid = dev_replace->srcdev->devid; 6177 int tgtdev_indexes = 0; 6178 int num_stripes = *num_stripes_ret; 6179 int max_errors = *max_errors_ret; 6180 int i; 6181 6182 if (op == BTRFS_MAP_WRITE) { 6183 int index_where_to_add; 6184 6185 /* 6186 * A block group which have "to_copy" set will eventually 6187 * copied by dev-replace process. We can avoid cloning IO here. 6188 */ 6189 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) 6190 return; 6191 6192 /* 6193 * duplicate the write operations while the dev replace 6194 * procedure is running. Since the copying of the old disk to 6195 * the new disk takes place at run time while the filesystem is 6196 * mounted writable, the regular write operations to the old 6197 * disk have to be duplicated to go to the new disk as well. 6198 * 6199 * Note that device->missing is handled by the caller, and that 6200 * the write to the old disk is already set up in the stripes 6201 * array. 6202 */ 6203 index_where_to_add = num_stripes; 6204 for (i = 0; i < num_stripes; i++) { 6205 if (bioc->stripes[i].dev->devid == srcdev_devid) { 6206 /* write to new disk, too */ 6207 struct btrfs_io_stripe *new = 6208 bioc->stripes + index_where_to_add; 6209 struct btrfs_io_stripe *old = 6210 bioc->stripes + i; 6211 6212 new->physical = old->physical; 6213 new->dev = dev_replace->tgtdev; 6214 bioc->tgtdev_map[i] = index_where_to_add; 6215 index_where_to_add++; 6216 max_errors++; 6217 tgtdev_indexes++; 6218 } 6219 } 6220 num_stripes = index_where_to_add; 6221 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 6222 int index_srcdev = 0; 6223 int found = 0; 6224 u64 physical_of_found = 0; 6225 6226 /* 6227 * During the dev-replace procedure, the target drive can also 6228 * be used to read data in case it is needed to repair a corrupt 6229 * block elsewhere. This is possible if the requested area is 6230 * left of the left cursor. In this area, the target drive is a 6231 * full copy of the source drive. 6232 */ 6233 for (i = 0; i < num_stripes; i++) { 6234 if (bioc->stripes[i].dev->devid == srcdev_devid) { 6235 /* 6236 * In case of DUP, in order to keep it simple, 6237 * only add the mirror with the lowest physical 6238 * address 6239 */ 6240 if (found && 6241 physical_of_found <= bioc->stripes[i].physical) 6242 continue; 6243 index_srcdev = i; 6244 found = 1; 6245 physical_of_found = bioc->stripes[i].physical; 6246 } 6247 } 6248 if (found) { 6249 struct btrfs_io_stripe *tgtdev_stripe = 6250 bioc->stripes + num_stripes; 6251 6252 tgtdev_stripe->physical = physical_of_found; 6253 tgtdev_stripe->dev = dev_replace->tgtdev; 6254 bioc->tgtdev_map[index_srcdev] = num_stripes; 6255 6256 tgtdev_indexes++; 6257 num_stripes++; 6258 } 6259 } 6260 6261 *num_stripes_ret = num_stripes; 6262 *max_errors_ret = max_errors; 6263 bioc->num_tgtdevs = tgtdev_indexes; 6264 *bioc_ret = bioc; 6265 } 6266 6267 static bool need_full_stripe(enum btrfs_map_op op) 6268 { 6269 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 6270 } 6271 6272 /* 6273 * Calculate the geometry of a particular (address, len) tuple. This 6274 * information is used to calculate how big a particular bio can get before it 6275 * straddles a stripe. 6276 * 6277 * @fs_info: the filesystem 6278 * @em: mapping containing the logical extent 6279 * @op: type of operation - write or read 6280 * @logical: address that we want to figure out the geometry of 6281 * @io_geom: pointer used to return values 6282 * 6283 * Returns < 0 in case a chunk for the given logical address cannot be found, 6284 * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 6285 */ 6286 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, 6287 enum btrfs_map_op op, u64 logical, 6288 struct btrfs_io_geometry *io_geom) 6289 { 6290 struct map_lookup *map; 6291 u64 len; 6292 u64 offset; 6293 u64 stripe_offset; 6294 u64 stripe_nr; 6295 u32 stripe_len; 6296 u64 raid56_full_stripe_start = (u64)-1; 6297 int data_stripes; 6298 6299 ASSERT(op != BTRFS_MAP_DISCARD); 6300 6301 map = em->map_lookup; 6302 /* Offset of this logical address in the chunk */ 6303 offset = logical - em->start; 6304 /* Len of a stripe in a chunk */ 6305 stripe_len = map->stripe_len; 6306 /* 6307 * Stripe_nr is where this block falls in 6308 * stripe_offset is the offset of this block in its stripe. 6309 */ 6310 stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); 6311 ASSERT(stripe_offset < U32_MAX); 6312 6313 data_stripes = nr_data_stripes(map); 6314 6315 /* Only stripe based profiles needs to check against stripe length. */ 6316 if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) { 6317 u64 max_len = stripe_len - stripe_offset; 6318 6319 /* 6320 * In case of raid56, we need to know the stripe aligned start 6321 */ 6322 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6323 unsigned long full_stripe_len = stripe_len * data_stripes; 6324 raid56_full_stripe_start = offset; 6325 6326 /* 6327 * Allow a write of a full stripe, but make sure we 6328 * don't allow straddling of stripes 6329 */ 6330 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, 6331 full_stripe_len); 6332 raid56_full_stripe_start *= full_stripe_len; 6333 6334 /* 6335 * For writes to RAID[56], allow a full stripeset across 6336 * all disks. For other RAID types and for RAID[56] 6337 * reads, just allow a single stripe (on a single disk). 6338 */ 6339 if (op == BTRFS_MAP_WRITE) { 6340 max_len = stripe_len * data_stripes - 6341 (offset - raid56_full_stripe_start); 6342 } 6343 } 6344 len = min_t(u64, em->len - offset, max_len); 6345 } else { 6346 len = em->len - offset; 6347 } 6348 6349 io_geom->len = len; 6350 io_geom->offset = offset; 6351 io_geom->stripe_len = stripe_len; 6352 io_geom->stripe_nr = stripe_nr; 6353 io_geom->stripe_offset = stripe_offset; 6354 io_geom->raid56_stripe_offset = raid56_full_stripe_start; 6355 6356 return 0; 6357 } 6358 6359 static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, 6360 u32 stripe_index, u64 stripe_offset, u64 stripe_nr) 6361 { 6362 dst->dev = map->stripes[stripe_index].dev; 6363 dst->physical = map->stripes[stripe_index].physical + 6364 stripe_offset + stripe_nr * map->stripe_len; 6365 } 6366 6367 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 6368 enum btrfs_map_op op, u64 logical, u64 *length, 6369 struct btrfs_io_context **bioc_ret, 6370 struct btrfs_io_stripe *smap, 6371 int *mirror_num_ret, int need_raid_map) 6372 { 6373 struct extent_map *em; 6374 struct map_lookup *map; 6375 u64 stripe_offset; 6376 u64 stripe_nr; 6377 u64 stripe_len; 6378 u32 stripe_index; 6379 int data_stripes; 6380 int i; 6381 int ret = 0; 6382 int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); 6383 int num_stripes; 6384 int max_errors = 0; 6385 int tgtdev_indexes = 0; 6386 struct btrfs_io_context *bioc = NULL; 6387 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6388 int dev_replace_is_ongoing = 0; 6389 int num_alloc_stripes; 6390 int patch_the_first_stripe_for_dev_replace = 0; 6391 u64 physical_to_patch_in_first_stripe = 0; 6392 u64 raid56_full_stripe_start = (u64)-1; 6393 struct btrfs_io_geometry geom; 6394 6395 ASSERT(bioc_ret); 6396 ASSERT(op != BTRFS_MAP_DISCARD); 6397 6398 em = btrfs_get_chunk_map(fs_info, logical, *length); 6399 ASSERT(!IS_ERR(em)); 6400 6401 ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); 6402 if (ret < 0) 6403 return ret; 6404 6405 map = em->map_lookup; 6406 6407 *length = geom.len; 6408 stripe_len = geom.stripe_len; 6409 stripe_nr = geom.stripe_nr; 6410 stripe_offset = geom.stripe_offset; 6411 raid56_full_stripe_start = geom.raid56_stripe_offset; 6412 data_stripes = nr_data_stripes(map); 6413 6414 down_read(&dev_replace->rwsem); 6415 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6416 /* 6417 * Hold the semaphore for read during the whole operation, write is 6418 * requested at commit time but must wait. 6419 */ 6420 if (!dev_replace_is_ongoing) 6421 up_read(&dev_replace->rwsem); 6422 6423 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 6424 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 6425 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 6426 dev_replace->srcdev->devid, 6427 &mirror_num, 6428 &physical_to_patch_in_first_stripe); 6429 if (ret) 6430 goto out; 6431 else 6432 patch_the_first_stripe_for_dev_replace = 1; 6433 } else if (mirror_num > map->num_stripes) { 6434 mirror_num = 0; 6435 } 6436 6437 num_stripes = 1; 6438 stripe_index = 0; 6439 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6440 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6441 &stripe_index); 6442 if (!need_full_stripe(op)) 6443 mirror_num = 1; 6444 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6445 if (need_full_stripe(op)) 6446 num_stripes = map->num_stripes; 6447 else if (mirror_num) 6448 stripe_index = mirror_num - 1; 6449 else { 6450 stripe_index = find_live_mirror(fs_info, map, 0, 6451 dev_replace_is_ongoing); 6452 mirror_num = stripe_index + 1; 6453 } 6454 6455 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6456 if (need_full_stripe(op)) { 6457 num_stripes = map->num_stripes; 6458 } else if (mirror_num) { 6459 stripe_index = mirror_num - 1; 6460 } else { 6461 mirror_num = 1; 6462 } 6463 6464 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6465 u32 factor = map->num_stripes / map->sub_stripes; 6466 6467 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6468 stripe_index *= map->sub_stripes; 6469 6470 if (need_full_stripe(op)) 6471 num_stripes = map->sub_stripes; 6472 else if (mirror_num) 6473 stripe_index += mirror_num - 1; 6474 else { 6475 int old_stripe_index = stripe_index; 6476 stripe_index = find_live_mirror(fs_info, map, 6477 stripe_index, 6478 dev_replace_is_ongoing); 6479 mirror_num = stripe_index - old_stripe_index + 1; 6480 } 6481 6482 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6483 ASSERT(map->stripe_len == BTRFS_STRIPE_LEN); 6484 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6485 /* push stripe_nr back to the start of the full stripe */ 6486 stripe_nr = div64_u64(raid56_full_stripe_start, 6487 stripe_len * data_stripes); 6488 6489 /* RAID[56] write or recovery. Return all stripes */ 6490 num_stripes = map->num_stripes; 6491 max_errors = btrfs_chunk_max_errors(map); 6492 6493 /* Return the length to the full stripe end */ 6494 *length = min(logical + *length, 6495 raid56_full_stripe_start + em->start + 6496 data_stripes * stripe_len) - logical; 6497 stripe_index = 0; 6498 stripe_offset = 0; 6499 } else { 6500 /* 6501 * Mirror #0 or #1 means the original data block. 6502 * Mirror #2 is RAID5 parity block. 6503 * Mirror #3 is RAID6 Q block. 6504 */ 6505 stripe_nr = div_u64_rem(stripe_nr, 6506 data_stripes, &stripe_index); 6507 if (mirror_num > 1) 6508 stripe_index = data_stripes + mirror_num - 2; 6509 6510 /* We distribute the parity blocks across stripes */ 6511 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 6512 &stripe_index); 6513 if (!need_full_stripe(op) && mirror_num <= 1) 6514 mirror_num = 1; 6515 } 6516 } else { 6517 /* 6518 * after this, stripe_nr is the number of stripes on this 6519 * device we have to walk to find the data, and stripe_index is 6520 * the number of our device in the stripe array 6521 */ 6522 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6523 &stripe_index); 6524 mirror_num = stripe_index + 1; 6525 } 6526 if (stripe_index >= map->num_stripes) { 6527 btrfs_crit(fs_info, 6528 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6529 stripe_index, map->num_stripes); 6530 ret = -EINVAL; 6531 goto out; 6532 } 6533 6534 num_alloc_stripes = num_stripes; 6535 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 6536 if (op == BTRFS_MAP_WRITE) 6537 num_alloc_stripes <<= 1; 6538 if (op == BTRFS_MAP_GET_READ_MIRRORS) 6539 num_alloc_stripes++; 6540 tgtdev_indexes = num_stripes; 6541 } 6542 6543 /* 6544 * If this I/O maps to a single device, try to return the device and 6545 * physical block information on the stack instead of allocating an 6546 * I/O context structure. 6547 */ 6548 if (smap && num_alloc_stripes == 1 && 6549 !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && 6550 (!need_full_stripe(op) || !dev_replace_is_ongoing || 6551 !dev_replace->tgtdev)) { 6552 if (patch_the_first_stripe_for_dev_replace) { 6553 smap->dev = dev_replace->tgtdev; 6554 smap->physical = physical_to_patch_in_first_stripe; 6555 *mirror_num_ret = map->num_stripes + 1; 6556 } else { 6557 set_io_stripe(smap, map, stripe_index, stripe_offset, 6558 stripe_nr); 6559 *mirror_num_ret = mirror_num; 6560 } 6561 *bioc_ret = NULL; 6562 ret = 0; 6563 goto out; 6564 } 6565 6566 bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); 6567 if (!bioc) { 6568 ret = -ENOMEM; 6569 goto out; 6570 } 6571 6572 for (i = 0; i < num_stripes; i++) { 6573 set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset, 6574 stripe_nr); 6575 stripe_index++; 6576 } 6577 6578 /* Build raid_map */ 6579 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 6580 (need_full_stripe(op) || mirror_num > 1)) { 6581 u64 tmp; 6582 unsigned rot; 6583 6584 /* Work out the disk rotation on this stripe-set */ 6585 div_u64_rem(stripe_nr, num_stripes, &rot); 6586 6587 /* Fill in the logical address of each stripe */ 6588 tmp = stripe_nr * data_stripes; 6589 for (i = 0; i < data_stripes; i++) 6590 bioc->raid_map[(i + rot) % num_stripes] = 6591 em->start + (tmp + i) * map->stripe_len; 6592 6593 bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; 6594 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 6595 bioc->raid_map[(i + rot + 1) % num_stripes] = 6596 RAID6_Q_STRIPE; 6597 6598 sort_parity_stripes(bioc, num_stripes); 6599 } 6600 6601 if (need_full_stripe(op)) 6602 max_errors = btrfs_chunk_max_errors(map); 6603 6604 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6605 need_full_stripe(op)) { 6606 handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, 6607 &num_stripes, &max_errors); 6608 } 6609 6610 *bioc_ret = bioc; 6611 bioc->map_type = map->type; 6612 bioc->num_stripes = num_stripes; 6613 bioc->max_errors = max_errors; 6614 bioc->mirror_num = mirror_num; 6615 6616 /* 6617 * this is the case that REQ_READ && dev_replace_is_ongoing && 6618 * mirror_num == num_stripes + 1 && dev_replace target drive is 6619 * available as a mirror 6620 */ 6621 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6622 WARN_ON(num_stripes > 1); 6623 bioc->stripes[0].dev = dev_replace->tgtdev; 6624 bioc->stripes[0].physical = physical_to_patch_in_first_stripe; 6625 bioc->mirror_num = map->num_stripes + 1; 6626 } 6627 out: 6628 if (dev_replace_is_ongoing) { 6629 lockdep_assert_held(&dev_replace->rwsem); 6630 /* Unlock and let waiting writers proceed */ 6631 up_read(&dev_replace->rwsem); 6632 } 6633 free_extent_map(em); 6634 return ret; 6635 } 6636 6637 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6638 u64 logical, u64 *length, 6639 struct btrfs_io_context **bioc_ret, int mirror_num) 6640 { 6641 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 6642 NULL, &mirror_num, 0); 6643 } 6644 6645 /* For Scrub/replace */ 6646 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6647 u64 logical, u64 *length, 6648 struct btrfs_io_context **bioc_ret) 6649 { 6650 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 6651 NULL, NULL, 1); 6652 } 6653 6654 /* 6655 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it 6656 * is already initialized by the block layer. 6657 */ 6658 static inline void btrfs_bio_init(struct btrfs_bio *bbio, 6659 btrfs_bio_end_io_t end_io, void *private) 6660 { 6661 memset(bbio, 0, offsetof(struct btrfs_bio, bio)); 6662 bbio->end_io = end_io; 6663 bbio->private = private; 6664 } 6665 6666 /* 6667 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for 6668 * btrfs, and is used for all I/O submitted through btrfs_submit_bio. 6669 * 6670 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by 6671 * a mempool. 6672 */ 6673 struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, 6674 btrfs_bio_end_io_t end_io, void *private) 6675 { 6676 struct bio *bio; 6677 6678 bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); 6679 btrfs_bio_init(btrfs_bio(bio), end_io, private); 6680 return bio; 6681 } 6682 6683 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, 6684 btrfs_bio_end_io_t end_io, void *private) 6685 { 6686 struct bio *bio; 6687 struct btrfs_bio *bbio; 6688 6689 ASSERT(offset <= UINT_MAX && size <= UINT_MAX); 6690 6691 bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); 6692 bbio = btrfs_bio(bio); 6693 btrfs_bio_init(bbio, end_io, private); 6694 6695 bio_trim(bio, offset >> 9, size >> 9); 6696 bbio->iter = bio->bi_iter; 6697 return bio; 6698 } 6699 6700 static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) 6701 { 6702 if (!dev || !dev->bdev) 6703 return; 6704 if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) 6705 return; 6706 6707 if (btrfs_op(bio) == BTRFS_MAP_WRITE) 6708 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 6709 if (!(bio->bi_opf & REQ_RAHEAD)) 6710 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); 6711 if (bio->bi_opf & REQ_PREFLUSH) 6712 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); 6713 } 6714 6715 static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, 6716 struct bio *bio) 6717 { 6718 if (bio->bi_opf & REQ_META) 6719 return fs_info->endio_meta_workers; 6720 return fs_info->endio_workers; 6721 } 6722 6723 static void btrfs_end_bio_work(struct work_struct *work) 6724 { 6725 struct btrfs_bio *bbio = 6726 container_of(work, struct btrfs_bio, end_io_work); 6727 6728 bbio->end_io(bbio); 6729 } 6730 6731 static void btrfs_simple_end_io(struct bio *bio) 6732 { 6733 struct btrfs_fs_info *fs_info = bio->bi_private; 6734 struct btrfs_bio *bbio = btrfs_bio(bio); 6735 6736 btrfs_bio_counter_dec(fs_info); 6737 6738 if (bio->bi_status) 6739 btrfs_log_dev_io_error(bio, bbio->device); 6740 6741 if (bio_op(bio) == REQ_OP_READ) { 6742 INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); 6743 queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); 6744 } else { 6745 bbio->end_io(bbio); 6746 } 6747 } 6748 6749 static void btrfs_raid56_end_io(struct bio *bio) 6750 { 6751 struct btrfs_io_context *bioc = bio->bi_private; 6752 struct btrfs_bio *bbio = btrfs_bio(bio); 6753 6754 btrfs_bio_counter_dec(bioc->fs_info); 6755 bbio->mirror_num = bioc->mirror_num; 6756 bbio->end_io(bbio); 6757 6758 btrfs_put_bioc(bioc); 6759 } 6760 6761 static void btrfs_orig_write_end_io(struct bio *bio) 6762 { 6763 struct btrfs_io_stripe *stripe = bio->bi_private; 6764 struct btrfs_io_context *bioc = stripe->bioc; 6765 struct btrfs_bio *bbio = btrfs_bio(bio); 6766 6767 btrfs_bio_counter_dec(bioc->fs_info); 6768 6769 if (bio->bi_status) { 6770 atomic_inc(&bioc->error); 6771 btrfs_log_dev_io_error(bio, stripe->dev); 6772 } 6773 6774 /* 6775 * Only send an error to the higher layers if it is beyond the tolerance 6776 * threshold. 6777 */ 6778 if (atomic_read(&bioc->error) > bioc->max_errors) 6779 bio->bi_status = BLK_STS_IOERR; 6780 else 6781 bio->bi_status = BLK_STS_OK; 6782 6783 bbio->end_io(bbio); 6784 btrfs_put_bioc(bioc); 6785 } 6786 6787 static void btrfs_clone_write_end_io(struct bio *bio) 6788 { 6789 struct btrfs_io_stripe *stripe = bio->bi_private; 6790 6791 if (bio->bi_status) { 6792 atomic_inc(&stripe->bioc->error); 6793 btrfs_log_dev_io_error(bio, stripe->dev); 6794 } 6795 6796 /* Pass on control to the original bio this one was cloned from */ 6797 bio_endio(stripe->bioc->orig_bio); 6798 bio_put(bio); 6799 } 6800 6801 static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) 6802 { 6803 if (!dev || !dev->bdev || 6804 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 6805 (btrfs_op(bio) == BTRFS_MAP_WRITE && 6806 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6807 bio_io_error(bio); 6808 return; 6809 } 6810 6811 bio_set_dev(bio, dev->bdev); 6812 6813 /* 6814 * For zone append writing, bi_sector must point the beginning of the 6815 * zone 6816 */ 6817 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 6818 u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; 6819 6820 if (btrfs_dev_is_sequential(dev, physical)) { 6821 u64 zone_start = round_down(physical, 6822 dev->fs_info->zone_size); 6823 6824 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; 6825 } else { 6826 bio->bi_opf &= ~REQ_OP_ZONE_APPEND; 6827 bio->bi_opf |= REQ_OP_WRITE; 6828 } 6829 } 6830 btrfs_debug_in_rcu(dev->fs_info, 6831 "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6832 __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, 6833 (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), 6834 dev->devid, bio->bi_iter.bi_size); 6835 6836 btrfsic_check_bio(bio); 6837 submit_bio(bio); 6838 } 6839 6840 static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) 6841 { 6842 struct bio *orig_bio = bioc->orig_bio, *bio; 6843 6844 ASSERT(bio_op(orig_bio) != REQ_OP_READ); 6845 6846 /* Reuse the bio embedded into the btrfs_bio for the last mirror */ 6847 if (dev_nr == bioc->num_stripes - 1) { 6848 bio = orig_bio; 6849 bio->bi_end_io = btrfs_orig_write_end_io; 6850 } else { 6851 bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); 6852 bio_inc_remaining(orig_bio); 6853 bio->bi_end_io = btrfs_clone_write_end_io; 6854 } 6855 6856 bio->bi_private = &bioc->stripes[dev_nr]; 6857 bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; 6858 bioc->stripes[dev_nr].bioc = bioc; 6859 btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); 6860 } 6861 6862 void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) 6863 { 6864 u64 logical = bio->bi_iter.bi_sector << 9; 6865 u64 length = bio->bi_iter.bi_size; 6866 u64 map_length = length; 6867 struct btrfs_io_context *bioc = NULL; 6868 struct btrfs_io_stripe smap; 6869 int ret; 6870 6871 btrfs_bio_counter_inc_blocked(fs_info); 6872 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, 6873 &bioc, &smap, &mirror_num, 1); 6874 if (ret) { 6875 btrfs_bio_counter_dec(fs_info); 6876 btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); 6877 return; 6878 } 6879 6880 if (map_length < length) { 6881 btrfs_crit(fs_info, 6882 "mapping failed logical %llu bio len %llu len %llu", 6883 logical, length, map_length); 6884 BUG(); 6885 } 6886 6887 if (!bioc) { 6888 /* Single mirror read/write fast path */ 6889 btrfs_bio(bio)->mirror_num = mirror_num; 6890 btrfs_bio(bio)->device = smap.dev; 6891 bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; 6892 bio->bi_private = fs_info; 6893 bio->bi_end_io = btrfs_simple_end_io; 6894 btrfs_submit_dev_bio(smap.dev, bio); 6895 } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6896 /* Parity RAID write or read recovery */ 6897 bio->bi_private = bioc; 6898 bio->bi_end_io = btrfs_raid56_end_io; 6899 if (bio_op(bio) == REQ_OP_READ) 6900 raid56_parity_recover(bio, bioc, mirror_num); 6901 else 6902 raid56_parity_write(bio, bioc); 6903 } else { 6904 /* Write to multiple mirrors */ 6905 int total_devs = bioc->num_stripes; 6906 int dev_nr; 6907 6908 bioc->orig_bio = bio; 6909 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) 6910 btrfs_submit_mirrored_bio(bioc, dev_nr); 6911 } 6912 } 6913 6914 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, 6915 const struct btrfs_fs_devices *fs_devices) 6916 { 6917 if (args->fsid == NULL) 6918 return true; 6919 if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) 6920 return true; 6921 return false; 6922 } 6923 6924 static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, 6925 const struct btrfs_device *device) 6926 { 6927 if (args->missing) { 6928 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && 6929 !device->bdev) 6930 return true; 6931 return false; 6932 } 6933 6934 if (device->devid != args->devid) 6935 return false; 6936 if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) 6937 return false; 6938 return true; 6939 } 6940 6941 /* 6942 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 6943 * return NULL. 6944 * 6945 * If devid and uuid are both specified, the match must be exact, otherwise 6946 * only devid is used. 6947 */ 6948 struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, 6949 const struct btrfs_dev_lookup_args *args) 6950 { 6951 struct btrfs_device *device; 6952 struct btrfs_fs_devices *seed_devs; 6953 6954 if (dev_args_match_fs_devices(args, fs_devices)) { 6955 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6956 if (dev_args_match_device(args, device)) 6957 return device; 6958 } 6959 } 6960 6961 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 6962 if (!dev_args_match_fs_devices(args, seed_devs)) 6963 continue; 6964 list_for_each_entry(device, &seed_devs->devices, dev_list) { 6965 if (dev_args_match_device(args, device)) 6966 return device; 6967 } 6968 } 6969 6970 return NULL; 6971 } 6972 6973 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6974 u64 devid, u8 *dev_uuid) 6975 { 6976 struct btrfs_device *device; 6977 unsigned int nofs_flag; 6978 6979 /* 6980 * We call this under the chunk_mutex, so we want to use NOFS for this 6981 * allocation, however we don't want to change btrfs_alloc_device() to 6982 * always do NOFS because we use it in a lot of other GFP_KERNEL safe 6983 * places. 6984 */ 6985 6986 nofs_flag = memalloc_nofs_save(); 6987 device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL); 6988 memalloc_nofs_restore(nofs_flag); 6989 if (IS_ERR(device)) 6990 return device; 6991 6992 list_add(&device->dev_list, &fs_devices->devices); 6993 device->fs_devices = fs_devices; 6994 fs_devices->num_devices++; 6995 6996 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6997 fs_devices->missing_devices++; 6998 6999 return device; 7000 } 7001 7002 /* 7003 * Allocate new device struct, set up devid and UUID. 7004 * 7005 * @fs_info: used only for generating a new devid, can be NULL if 7006 * devid is provided (i.e. @devid != NULL). 7007 * @devid: a pointer to devid for this device. If NULL a new devid 7008 * is generated. 7009 * @uuid: a pointer to UUID for this device. If NULL a new UUID 7010 * is generated. 7011 * @path: a pointer to device path if available, NULL otherwise. 7012 * 7013 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 7014 * on error. Returned struct is not linked onto any lists and must be 7015 * destroyed with btrfs_free_device. 7016 */ 7017 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 7018 const u64 *devid, const u8 *uuid, 7019 const char *path) 7020 { 7021 struct btrfs_device *dev; 7022 u64 tmp; 7023 7024 if (WARN_ON(!devid && !fs_info)) 7025 return ERR_PTR(-EINVAL); 7026 7027 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 7028 if (!dev) 7029 return ERR_PTR(-ENOMEM); 7030 7031 INIT_LIST_HEAD(&dev->dev_list); 7032 INIT_LIST_HEAD(&dev->dev_alloc_list); 7033 INIT_LIST_HEAD(&dev->post_commit_list); 7034 7035 atomic_set(&dev->dev_stats_ccnt, 0); 7036 btrfs_device_data_ordered_init(dev); 7037 extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); 7038 7039 if (devid) 7040 tmp = *devid; 7041 else { 7042 int ret; 7043 7044 ret = find_next_devid(fs_info, &tmp); 7045 if (ret) { 7046 btrfs_free_device(dev); 7047 return ERR_PTR(ret); 7048 } 7049 } 7050 dev->devid = tmp; 7051 7052 if (uuid) 7053 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE); 7054 else 7055 generate_random_uuid(dev->uuid); 7056 7057 if (path) { 7058 struct rcu_string *name; 7059 7060 name = rcu_string_strdup(path, GFP_KERNEL); 7061 if (!name) { 7062 btrfs_free_device(dev); 7063 return ERR_PTR(-ENOMEM); 7064 } 7065 rcu_assign_pointer(dev->name, name); 7066 } 7067 7068 return dev; 7069 } 7070 7071 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, 7072 u64 devid, u8 *uuid, bool error) 7073 { 7074 if (error) 7075 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", 7076 devid, uuid); 7077 else 7078 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", 7079 devid, uuid); 7080 } 7081 7082 u64 btrfs_calc_stripe_length(const struct extent_map *em) 7083 { 7084 const struct map_lookup *map = em->map_lookup; 7085 const int data_stripes = calc_data_stripes(map->type, map->num_stripes); 7086 7087 return div_u64(em->len, data_stripes); 7088 } 7089 7090 #if BITS_PER_LONG == 32 7091 /* 7092 * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE 7093 * can't be accessed on 32bit systems. 7094 * 7095 * This function do mount time check to reject the fs if it already has 7096 * metadata chunk beyond that limit. 7097 */ 7098 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 7099 u64 logical, u64 length, u64 type) 7100 { 7101 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 7102 return 0; 7103 7104 if (logical + length < MAX_LFS_FILESIZE) 7105 return 0; 7106 7107 btrfs_err_32bit_limit(fs_info); 7108 return -EOVERFLOW; 7109 } 7110 7111 /* 7112 * This is to give early warning for any metadata chunk reaching 7113 * BTRFS_32BIT_EARLY_WARN_THRESHOLD. 7114 * Although we can still access the metadata, it's not going to be possible 7115 * once the limit is reached. 7116 */ 7117 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 7118 u64 logical, u64 length, u64 type) 7119 { 7120 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 7121 return; 7122 7123 if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) 7124 return; 7125 7126 btrfs_warn_32bit_limit(fs_info); 7127 } 7128 #endif 7129 7130 static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, 7131 u64 devid, u8 *uuid) 7132 { 7133 struct btrfs_device *dev; 7134 7135 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7136 btrfs_report_missing_device(fs_info, devid, uuid, true); 7137 return ERR_PTR(-ENOENT); 7138 } 7139 7140 dev = add_missing_dev(fs_info->fs_devices, devid, uuid); 7141 if (IS_ERR(dev)) { 7142 btrfs_err(fs_info, "failed to init missing device %llu: %ld", 7143 devid, PTR_ERR(dev)); 7144 return dev; 7145 } 7146 btrfs_report_missing_device(fs_info, devid, uuid, false); 7147 7148 return dev; 7149 } 7150 7151 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 7152 struct btrfs_chunk *chunk) 7153 { 7154 BTRFS_DEV_LOOKUP_ARGS(args); 7155 struct btrfs_fs_info *fs_info = leaf->fs_info; 7156 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7157 struct map_lookup *map; 7158 struct extent_map *em; 7159 u64 logical; 7160 u64 length; 7161 u64 devid; 7162 u64 type; 7163 u8 uuid[BTRFS_UUID_SIZE]; 7164 int index; 7165 int num_stripes; 7166 int ret; 7167 int i; 7168 7169 logical = key->offset; 7170 length = btrfs_chunk_length(leaf, chunk); 7171 type = btrfs_chunk_type(leaf, chunk); 7172 index = btrfs_bg_flags_to_raid_index(type); 7173 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 7174 7175 #if BITS_PER_LONG == 32 7176 ret = check_32bit_meta_chunk(fs_info, logical, length, type); 7177 if (ret < 0) 7178 return ret; 7179 warn_32bit_meta_chunk(fs_info, logical, length, type); 7180 #endif 7181 7182 /* 7183 * Only need to verify chunk item if we're reading from sys chunk array, 7184 * as chunk item in tree block is already verified by tree-checker. 7185 */ 7186 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 7187 ret = btrfs_check_chunk_valid(leaf, chunk, logical); 7188 if (ret) 7189 return ret; 7190 } 7191 7192 read_lock(&map_tree->lock); 7193 em = lookup_extent_mapping(map_tree, logical, 1); 7194 read_unlock(&map_tree->lock); 7195 7196 /* already mapped? */ 7197 if (em && em->start <= logical && em->start + em->len > logical) { 7198 free_extent_map(em); 7199 return 0; 7200 } else if (em) { 7201 free_extent_map(em); 7202 } 7203 7204 em = alloc_extent_map(); 7205 if (!em) 7206 return -ENOMEM; 7207 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 7208 if (!map) { 7209 free_extent_map(em); 7210 return -ENOMEM; 7211 } 7212 7213 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 7214 em->map_lookup = map; 7215 em->start = logical; 7216 em->len = length; 7217 em->orig_start = 0; 7218 em->block_start = 0; 7219 em->block_len = em->len; 7220 7221 map->num_stripes = num_stripes; 7222 map->io_width = btrfs_chunk_io_width(leaf, chunk); 7223 map->io_align = btrfs_chunk_io_align(leaf, chunk); 7224 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 7225 map->type = type; 7226 /* 7227 * We can't use the sub_stripes value, as for profiles other than 7228 * RAID10, they may have 0 as sub_stripes for filesystems created by 7229 * older mkfs (<v5.4). 7230 * In that case, it can cause divide-by-zero errors later. 7231 * Since currently sub_stripes is fixed for each profile, let's 7232 * use the trusted value instead. 7233 */ 7234 map->sub_stripes = btrfs_raid_array[index].sub_stripes; 7235 map->verified_stripes = 0; 7236 em->orig_block_len = btrfs_calc_stripe_length(em); 7237 for (i = 0; i < num_stripes; i++) { 7238 map->stripes[i].physical = 7239 btrfs_stripe_offset_nr(leaf, chunk, i); 7240 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 7241 args.devid = devid; 7242 read_extent_buffer(leaf, uuid, (unsigned long) 7243 btrfs_stripe_dev_uuid_nr(chunk, i), 7244 BTRFS_UUID_SIZE); 7245 args.uuid = uuid; 7246 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); 7247 if (!map->stripes[i].dev) { 7248 map->stripes[i].dev = handle_missing_device(fs_info, 7249 devid, uuid); 7250 if (IS_ERR(map->stripes[i].dev)) { 7251 free_extent_map(em); 7252 return PTR_ERR(map->stripes[i].dev); 7253 } 7254 } 7255 7256 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 7257 &(map->stripes[i].dev->dev_state)); 7258 } 7259 7260 write_lock(&map_tree->lock); 7261 ret = add_extent_mapping(map_tree, em, 0); 7262 write_unlock(&map_tree->lock); 7263 if (ret < 0) { 7264 btrfs_err(fs_info, 7265 "failed to add chunk map, start=%llu len=%llu: %d", 7266 em->start, em->len, ret); 7267 } 7268 free_extent_map(em); 7269 7270 return ret; 7271 } 7272 7273 static void fill_device_from_item(struct extent_buffer *leaf, 7274 struct btrfs_dev_item *dev_item, 7275 struct btrfs_device *device) 7276 { 7277 unsigned long ptr; 7278 7279 device->devid = btrfs_device_id(leaf, dev_item); 7280 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 7281 device->total_bytes = device->disk_total_bytes; 7282 device->commit_total_bytes = device->disk_total_bytes; 7283 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 7284 device->commit_bytes_used = device->bytes_used; 7285 device->type = btrfs_device_type(leaf, dev_item); 7286 device->io_align = btrfs_device_io_align(leaf, dev_item); 7287 device->io_width = btrfs_device_io_width(leaf, dev_item); 7288 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 7289 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 7290 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 7291 7292 ptr = btrfs_device_uuid(dev_item); 7293 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 7294 } 7295 7296 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 7297 u8 *fsid) 7298 { 7299 struct btrfs_fs_devices *fs_devices; 7300 int ret; 7301 7302 lockdep_assert_held(&uuid_mutex); 7303 ASSERT(fsid); 7304 7305 /* This will match only for multi-device seed fs */ 7306 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) 7307 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 7308 return fs_devices; 7309 7310 7311 fs_devices = find_fsid(fsid, NULL); 7312 if (!fs_devices) { 7313 if (!btrfs_test_opt(fs_info, DEGRADED)) 7314 return ERR_PTR(-ENOENT); 7315 7316 fs_devices = alloc_fs_devices(fsid, NULL); 7317 if (IS_ERR(fs_devices)) 7318 return fs_devices; 7319 7320 fs_devices->seeding = true; 7321 fs_devices->opened = 1; 7322 return fs_devices; 7323 } 7324 7325 /* 7326 * Upon first call for a seed fs fsid, just create a private copy of the 7327 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list 7328 */ 7329 fs_devices = clone_fs_devices(fs_devices); 7330 if (IS_ERR(fs_devices)) 7331 return fs_devices; 7332 7333 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 7334 if (ret) { 7335 free_fs_devices(fs_devices); 7336 return ERR_PTR(ret); 7337 } 7338 7339 if (!fs_devices->seeding) { 7340 close_fs_devices(fs_devices); 7341 free_fs_devices(fs_devices); 7342 return ERR_PTR(-EINVAL); 7343 } 7344 7345 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); 7346 7347 return fs_devices; 7348 } 7349 7350 static int read_one_dev(struct extent_buffer *leaf, 7351 struct btrfs_dev_item *dev_item) 7352 { 7353 BTRFS_DEV_LOOKUP_ARGS(args); 7354 struct btrfs_fs_info *fs_info = leaf->fs_info; 7355 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7356 struct btrfs_device *device; 7357 u64 devid; 7358 int ret; 7359 u8 fs_uuid[BTRFS_FSID_SIZE]; 7360 u8 dev_uuid[BTRFS_UUID_SIZE]; 7361 7362 devid = btrfs_device_id(leaf, dev_item); 7363 args.devid = devid; 7364 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 7365 BTRFS_UUID_SIZE); 7366 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 7367 BTRFS_FSID_SIZE); 7368 args.uuid = dev_uuid; 7369 args.fsid = fs_uuid; 7370 7371 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 7372 fs_devices = open_seed_devices(fs_info, fs_uuid); 7373 if (IS_ERR(fs_devices)) 7374 return PTR_ERR(fs_devices); 7375 } 7376 7377 device = btrfs_find_device(fs_info->fs_devices, &args); 7378 if (!device) { 7379 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7380 btrfs_report_missing_device(fs_info, devid, 7381 dev_uuid, true); 7382 return -ENOENT; 7383 } 7384 7385 device = add_missing_dev(fs_devices, devid, dev_uuid); 7386 if (IS_ERR(device)) { 7387 btrfs_err(fs_info, 7388 "failed to add missing dev %llu: %ld", 7389 devid, PTR_ERR(device)); 7390 return PTR_ERR(device); 7391 } 7392 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 7393 } else { 7394 if (!device->bdev) { 7395 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7396 btrfs_report_missing_device(fs_info, 7397 devid, dev_uuid, true); 7398 return -ENOENT; 7399 } 7400 btrfs_report_missing_device(fs_info, devid, 7401 dev_uuid, false); 7402 } 7403 7404 if (!device->bdev && 7405 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 7406 /* 7407 * this happens when a device that was properly setup 7408 * in the device info lists suddenly goes bad. 7409 * device->bdev is NULL, and so we have to set 7410 * device->missing to one here 7411 */ 7412 device->fs_devices->missing_devices++; 7413 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 7414 } 7415 7416 /* Move the device to its own fs_devices */ 7417 if (device->fs_devices != fs_devices) { 7418 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING, 7419 &device->dev_state)); 7420 7421 list_move(&device->dev_list, &fs_devices->devices); 7422 device->fs_devices->num_devices--; 7423 fs_devices->num_devices++; 7424 7425 device->fs_devices->missing_devices--; 7426 fs_devices->missing_devices++; 7427 7428 device->fs_devices = fs_devices; 7429 } 7430 } 7431 7432 if (device->fs_devices != fs_info->fs_devices) { 7433 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)); 7434 if (device->generation != 7435 btrfs_device_generation(leaf, dev_item)) 7436 return -EINVAL; 7437 } 7438 7439 fill_device_from_item(leaf, dev_item, device); 7440 if (device->bdev) { 7441 u64 max_total_bytes = bdev_nr_bytes(device->bdev); 7442 7443 if (device->total_bytes > max_total_bytes) { 7444 btrfs_err(fs_info, 7445 "device total_bytes should be at most %llu but found %llu", 7446 max_total_bytes, device->total_bytes); 7447 return -EINVAL; 7448 } 7449 } 7450 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 7451 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 7452 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 7453 device->fs_devices->total_rw_bytes += device->total_bytes; 7454 atomic64_add(device->total_bytes - device->bytes_used, 7455 &fs_info->free_chunk_space); 7456 } 7457 ret = 0; 7458 return ret; 7459 } 7460 7461 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) 7462 { 7463 struct btrfs_super_block *super_copy = fs_info->super_copy; 7464 struct extent_buffer *sb; 7465 struct btrfs_disk_key *disk_key; 7466 struct btrfs_chunk *chunk; 7467 u8 *array_ptr; 7468 unsigned long sb_array_offset; 7469 int ret = 0; 7470 u32 num_stripes; 7471 u32 array_size; 7472 u32 len = 0; 7473 u32 cur_offset; 7474 u64 type; 7475 struct btrfs_key key; 7476 7477 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); 7478 7479 /* 7480 * We allocated a dummy extent, just to use extent buffer accessors. 7481 * There will be unused space after BTRFS_SUPER_INFO_SIZE, but 7482 * that's fine, we will not go beyond system chunk array anyway. 7483 */ 7484 sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET); 7485 if (!sb) 7486 return -ENOMEM; 7487 set_extent_buffer_uptodate(sb); 7488 7489 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 7490 array_size = btrfs_super_sys_array_size(super_copy); 7491 7492 array_ptr = super_copy->sys_chunk_array; 7493 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 7494 cur_offset = 0; 7495 7496 while (cur_offset < array_size) { 7497 disk_key = (struct btrfs_disk_key *)array_ptr; 7498 len = sizeof(*disk_key); 7499 if (cur_offset + len > array_size) 7500 goto out_short_read; 7501 7502 btrfs_disk_key_to_cpu(&key, disk_key); 7503 7504 array_ptr += len; 7505 sb_array_offset += len; 7506 cur_offset += len; 7507 7508 if (key.type != BTRFS_CHUNK_ITEM_KEY) { 7509 btrfs_err(fs_info, 7510 "unexpected item type %u in sys_array at offset %u", 7511 (u32)key.type, cur_offset); 7512 ret = -EIO; 7513 break; 7514 } 7515 7516 chunk = (struct btrfs_chunk *)sb_array_offset; 7517 /* 7518 * At least one btrfs_chunk with one stripe must be present, 7519 * exact stripe count check comes afterwards 7520 */ 7521 len = btrfs_chunk_item_size(1); 7522 if (cur_offset + len > array_size) 7523 goto out_short_read; 7524 7525 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 7526 if (!num_stripes) { 7527 btrfs_err(fs_info, 7528 "invalid number of stripes %u in sys_array at offset %u", 7529 num_stripes, cur_offset); 7530 ret = -EIO; 7531 break; 7532 } 7533 7534 type = btrfs_chunk_type(sb, chunk); 7535 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 7536 btrfs_err(fs_info, 7537 "invalid chunk type %llu in sys_array at offset %u", 7538 type, cur_offset); 7539 ret = -EIO; 7540 break; 7541 } 7542 7543 len = btrfs_chunk_item_size(num_stripes); 7544 if (cur_offset + len > array_size) 7545 goto out_short_read; 7546 7547 ret = read_one_chunk(&key, sb, chunk); 7548 if (ret) 7549 break; 7550 7551 array_ptr += len; 7552 sb_array_offset += len; 7553 cur_offset += len; 7554 } 7555 clear_extent_buffer_uptodate(sb); 7556 free_extent_buffer_stale(sb); 7557 return ret; 7558 7559 out_short_read: 7560 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 7561 len, cur_offset); 7562 clear_extent_buffer_uptodate(sb); 7563 free_extent_buffer_stale(sb); 7564 return -EIO; 7565 } 7566 7567 /* 7568 * Check if all chunks in the fs are OK for read-write degraded mount 7569 * 7570 * If the @failing_dev is specified, it's accounted as missing. 7571 * 7572 * Return true if all chunks meet the minimal RW mount requirements. 7573 * Return false if any chunk doesn't meet the minimal RW mount requirements. 7574 */ 7575 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7576 struct btrfs_device *failing_dev) 7577 { 7578 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7579 struct extent_map *em; 7580 u64 next_start = 0; 7581 bool ret = true; 7582 7583 read_lock(&map_tree->lock); 7584 em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7585 read_unlock(&map_tree->lock); 7586 /* No chunk at all? Return false anyway */ 7587 if (!em) { 7588 ret = false; 7589 goto out; 7590 } 7591 while (em) { 7592 struct map_lookup *map; 7593 int missing = 0; 7594 int max_tolerated; 7595 int i; 7596 7597 map = em->map_lookup; 7598 max_tolerated = 7599 btrfs_get_num_tolerated_disk_barrier_failures( 7600 map->type); 7601 for (i = 0; i < map->num_stripes; i++) { 7602 struct btrfs_device *dev = map->stripes[i].dev; 7603 7604 if (!dev || !dev->bdev || 7605 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 7606 dev->last_flush_error) 7607 missing++; 7608 else if (failing_dev && failing_dev == dev) 7609 missing++; 7610 } 7611 if (missing > max_tolerated) { 7612 if (!failing_dev) 7613 btrfs_warn(fs_info, 7614 "chunk %llu missing %d devices, max tolerance is %d for writable mount", 7615 em->start, missing, max_tolerated); 7616 free_extent_map(em); 7617 ret = false; 7618 goto out; 7619 } 7620 next_start = extent_map_end(em); 7621 free_extent_map(em); 7622 7623 read_lock(&map_tree->lock); 7624 em = lookup_extent_mapping(map_tree, next_start, 7625 (u64)(-1) - next_start); 7626 read_unlock(&map_tree->lock); 7627 } 7628 out: 7629 return ret; 7630 } 7631 7632 static void readahead_tree_node_children(struct extent_buffer *node) 7633 { 7634 int i; 7635 const int nr_items = btrfs_header_nritems(node); 7636 7637 for (i = 0; i < nr_items; i++) 7638 btrfs_readahead_node_child(node, i); 7639 } 7640 7641 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) 7642 { 7643 struct btrfs_root *root = fs_info->chunk_root; 7644 struct btrfs_path *path; 7645 struct extent_buffer *leaf; 7646 struct btrfs_key key; 7647 struct btrfs_key found_key; 7648 int ret; 7649 int slot; 7650 int iter_ret = 0; 7651 u64 total_dev = 0; 7652 u64 last_ra_node = 0; 7653 7654 path = btrfs_alloc_path(); 7655 if (!path) 7656 return -ENOMEM; 7657 7658 /* 7659 * uuid_mutex is needed only if we are mounting a sprout FS 7660 * otherwise we don't need it. 7661 */ 7662 mutex_lock(&uuid_mutex); 7663 7664 /* 7665 * It is possible for mount and umount to race in such a way that 7666 * we execute this code path, but open_fs_devices failed to clear 7667 * total_rw_bytes. We certainly want it cleared before reading the 7668 * device items, so clear it here. 7669 */ 7670 fs_info->fs_devices->total_rw_bytes = 0; 7671 7672 /* 7673 * Lockdep complains about possible circular locking dependency between 7674 * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores 7675 * used for freeze procection of a fs (struct super_block.s_writers), 7676 * which we take when starting a transaction, and extent buffers of the 7677 * chunk tree if we call read_one_dev() while holding a lock on an 7678 * extent buffer of the chunk tree. Since we are mounting the filesystem 7679 * and at this point there can't be any concurrent task modifying the 7680 * chunk tree, to keep it simple, just skip locking on the chunk tree. 7681 */ 7682 ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags)); 7683 path->skip_locking = 1; 7684 7685 /* 7686 * Read all device items, and then all the chunk items. All 7687 * device items are found before any chunk item (their object id 7688 * is smaller than the lowest possible object id for a chunk 7689 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 7690 */ 7691 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 7692 key.offset = 0; 7693 key.type = 0; 7694 btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { 7695 struct extent_buffer *node = path->nodes[1]; 7696 7697 leaf = path->nodes[0]; 7698 slot = path->slots[0]; 7699 7700 if (node) { 7701 if (last_ra_node != node->start) { 7702 readahead_tree_node_children(node); 7703 last_ra_node = node->start; 7704 } 7705 } 7706 if (found_key.type == BTRFS_DEV_ITEM_KEY) { 7707 struct btrfs_dev_item *dev_item; 7708 dev_item = btrfs_item_ptr(leaf, slot, 7709 struct btrfs_dev_item); 7710 ret = read_one_dev(leaf, dev_item); 7711 if (ret) 7712 goto error; 7713 total_dev++; 7714 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { 7715 struct btrfs_chunk *chunk; 7716 7717 /* 7718 * We are only called at mount time, so no need to take 7719 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings, 7720 * we always lock first fs_info->chunk_mutex before 7721 * acquiring any locks on the chunk tree. This is a 7722 * requirement for chunk allocation, see the comment on 7723 * top of btrfs_chunk_alloc() for details. 7724 */ 7725 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 7726 ret = read_one_chunk(&found_key, leaf, chunk); 7727 if (ret) 7728 goto error; 7729 } 7730 } 7731 /* Catch error found during iteration */ 7732 if (iter_ret < 0) { 7733 ret = iter_ret; 7734 goto error; 7735 } 7736 7737 /* 7738 * After loading chunk tree, we've got all device information, 7739 * do another round of validation checks. 7740 */ 7741 if (total_dev != fs_info->fs_devices->total_devices) { 7742 btrfs_warn(fs_info, 7743 "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", 7744 btrfs_super_num_devices(fs_info->super_copy), 7745 total_dev); 7746 fs_info->fs_devices->total_devices = total_dev; 7747 btrfs_set_super_num_devices(fs_info->super_copy, total_dev); 7748 } 7749 if (btrfs_super_total_bytes(fs_info->super_copy) < 7750 fs_info->fs_devices->total_rw_bytes) { 7751 btrfs_err(fs_info, 7752 "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", 7753 btrfs_super_total_bytes(fs_info->super_copy), 7754 fs_info->fs_devices->total_rw_bytes); 7755 ret = -EINVAL; 7756 goto error; 7757 } 7758 ret = 0; 7759 error: 7760 mutex_unlock(&uuid_mutex); 7761 7762 btrfs_free_path(path); 7763 return ret; 7764 } 7765 7766 int btrfs_init_devices_late(struct btrfs_fs_info *fs_info) 7767 { 7768 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7769 struct btrfs_device *device; 7770 int ret = 0; 7771 7772 fs_devices->fs_info = fs_info; 7773 7774 mutex_lock(&fs_devices->device_list_mutex); 7775 list_for_each_entry(device, &fs_devices->devices, dev_list) 7776 device->fs_info = fs_info; 7777 7778 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7779 list_for_each_entry(device, &seed_devs->devices, dev_list) { 7780 device->fs_info = fs_info; 7781 ret = btrfs_get_dev_zone_info(device, false); 7782 if (ret) 7783 break; 7784 } 7785 7786 seed_devs->fs_info = fs_info; 7787 } 7788 mutex_unlock(&fs_devices->device_list_mutex); 7789 7790 return ret; 7791 } 7792 7793 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, 7794 const struct btrfs_dev_stats_item *ptr, 7795 int index) 7796 { 7797 u64 val; 7798 7799 read_extent_buffer(eb, &val, 7800 offsetof(struct btrfs_dev_stats_item, values) + 7801 ((unsigned long)ptr) + (index * sizeof(u64)), 7802 sizeof(val)); 7803 return val; 7804 } 7805 7806 static void btrfs_set_dev_stats_value(struct extent_buffer *eb, 7807 struct btrfs_dev_stats_item *ptr, 7808 int index, u64 val) 7809 { 7810 write_extent_buffer(eb, &val, 7811 offsetof(struct btrfs_dev_stats_item, values) + 7812 ((unsigned long)ptr) + (index * sizeof(u64)), 7813 sizeof(val)); 7814 } 7815 7816 static int btrfs_device_init_dev_stats(struct btrfs_device *device, 7817 struct btrfs_path *path) 7818 { 7819 struct btrfs_dev_stats_item *ptr; 7820 struct extent_buffer *eb; 7821 struct btrfs_key key; 7822 int item_size; 7823 int i, ret, slot; 7824 7825 if (!device->fs_info->dev_root) 7826 return 0; 7827 7828 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7829 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7830 key.offset = device->devid; 7831 ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); 7832 if (ret) { 7833 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7834 btrfs_dev_stat_set(device, i, 0); 7835 device->dev_stats_valid = 1; 7836 btrfs_release_path(path); 7837 return ret < 0 ? ret : 0; 7838 } 7839 slot = path->slots[0]; 7840 eb = path->nodes[0]; 7841 item_size = btrfs_item_size(eb, slot); 7842 7843 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); 7844 7845 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 7846 if (item_size >= (1 + i) * sizeof(__le64)) 7847 btrfs_dev_stat_set(device, i, 7848 btrfs_dev_stats_value(eb, ptr, i)); 7849 else 7850 btrfs_dev_stat_set(device, i, 0); 7851 } 7852 7853 device->dev_stats_valid = 1; 7854 btrfs_dev_stat_print_on_load(device); 7855 btrfs_release_path(path); 7856 7857 return 0; 7858 } 7859 7860 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) 7861 { 7862 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; 7863 struct btrfs_device *device; 7864 struct btrfs_path *path = NULL; 7865 int ret = 0; 7866 7867 path = btrfs_alloc_path(); 7868 if (!path) 7869 return -ENOMEM; 7870 7871 mutex_lock(&fs_devices->device_list_mutex); 7872 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7873 ret = btrfs_device_init_dev_stats(device, path); 7874 if (ret) 7875 goto out; 7876 } 7877 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 7878 list_for_each_entry(device, &seed_devs->devices, dev_list) { 7879 ret = btrfs_device_init_dev_stats(device, path); 7880 if (ret) 7881 goto out; 7882 } 7883 } 7884 out: 7885 mutex_unlock(&fs_devices->device_list_mutex); 7886 7887 btrfs_free_path(path); 7888 return ret; 7889 } 7890 7891 static int update_dev_stat_item(struct btrfs_trans_handle *trans, 7892 struct btrfs_device *device) 7893 { 7894 struct btrfs_fs_info *fs_info = trans->fs_info; 7895 struct btrfs_root *dev_root = fs_info->dev_root; 7896 struct btrfs_path *path; 7897 struct btrfs_key key; 7898 struct extent_buffer *eb; 7899 struct btrfs_dev_stats_item *ptr; 7900 int ret; 7901 int i; 7902 7903 key.objectid = BTRFS_DEV_STATS_OBJECTID; 7904 key.type = BTRFS_PERSISTENT_ITEM_KEY; 7905 key.offset = device->devid; 7906 7907 path = btrfs_alloc_path(); 7908 if (!path) 7909 return -ENOMEM; 7910 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 7911 if (ret < 0) { 7912 btrfs_warn_in_rcu(fs_info, 7913 "error %d while searching for dev_stats item for device %s", 7914 ret, btrfs_dev_name(device)); 7915 goto out; 7916 } 7917 7918 if (ret == 0 && 7919 btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) { 7920 /* need to delete old one and insert a new one */ 7921 ret = btrfs_del_item(trans, dev_root, path); 7922 if (ret != 0) { 7923 btrfs_warn_in_rcu(fs_info, 7924 "delete too small dev_stats item for device %s failed %d", 7925 btrfs_dev_name(device), ret); 7926 goto out; 7927 } 7928 ret = 1; 7929 } 7930 7931 if (ret == 1) { 7932 /* need to insert a new item */ 7933 btrfs_release_path(path); 7934 ret = btrfs_insert_empty_item(trans, dev_root, path, 7935 &key, sizeof(*ptr)); 7936 if (ret < 0) { 7937 btrfs_warn_in_rcu(fs_info, 7938 "insert dev_stats item for device %s failed %d", 7939 btrfs_dev_name(device), ret); 7940 goto out; 7941 } 7942 } 7943 7944 eb = path->nodes[0]; 7945 ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item); 7946 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 7947 btrfs_set_dev_stats_value(eb, ptr, i, 7948 btrfs_dev_stat_read(device, i)); 7949 btrfs_mark_buffer_dirty(eb); 7950 7951 out: 7952 btrfs_free_path(path); 7953 return ret; 7954 } 7955 7956 /* 7957 * called from commit_transaction. Writes all changed device stats to disk. 7958 */ 7959 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) 7960 { 7961 struct btrfs_fs_info *fs_info = trans->fs_info; 7962 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7963 struct btrfs_device *device; 7964 int stats_cnt; 7965 int ret = 0; 7966 7967 mutex_lock(&fs_devices->device_list_mutex); 7968 list_for_each_entry(device, &fs_devices->devices, dev_list) { 7969 stats_cnt = atomic_read(&device->dev_stats_ccnt); 7970 if (!device->dev_stats_valid || stats_cnt == 0) 7971 continue; 7972 7973 7974 /* 7975 * There is a LOAD-LOAD control dependency between the value of 7976 * dev_stats_ccnt and updating the on-disk values which requires 7977 * reading the in-memory counters. Such control dependencies 7978 * require explicit read memory barriers. 7979 * 7980 * This memory barriers pairs with smp_mb__before_atomic in 7981 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full 7982 * barrier implied by atomic_xchg in 7983 * btrfs_dev_stats_read_and_reset 7984 */ 7985 smp_rmb(); 7986 7987 ret = update_dev_stat_item(trans, device); 7988 if (!ret) 7989 atomic_sub(stats_cnt, &device->dev_stats_ccnt); 7990 } 7991 mutex_unlock(&fs_devices->device_list_mutex); 7992 7993 return ret; 7994 } 7995 7996 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) 7997 { 7998 btrfs_dev_stat_inc(dev, index); 7999 8000 if (!dev->dev_stats_valid) 8001 return; 8002 btrfs_err_rl_in_rcu(dev->fs_info, 8003 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 8004 btrfs_dev_name(dev), 8005 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 8006 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 8007 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 8008 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 8009 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 8010 } 8011 8012 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) 8013 { 8014 int i; 8015 8016 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 8017 if (btrfs_dev_stat_read(dev, i) != 0) 8018 break; 8019 if (i == BTRFS_DEV_STAT_VALUES_MAX) 8020 return; /* all values == 0, suppress message */ 8021 8022 btrfs_info_in_rcu(dev->fs_info, 8023 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", 8024 btrfs_dev_name(dev), 8025 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 8026 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 8027 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), 8028 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS), 8029 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS)); 8030 } 8031 8032 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, 8033 struct btrfs_ioctl_get_dev_stats *stats) 8034 { 8035 BTRFS_DEV_LOOKUP_ARGS(args); 8036 struct btrfs_device *dev; 8037 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 8038 int i; 8039 8040 mutex_lock(&fs_devices->device_list_mutex); 8041 args.devid = stats->devid; 8042 dev = btrfs_find_device(fs_info->fs_devices, &args); 8043 mutex_unlock(&fs_devices->device_list_mutex); 8044 8045 if (!dev) { 8046 btrfs_warn(fs_info, "get dev_stats failed, device not found"); 8047 return -ENODEV; 8048 } else if (!dev->dev_stats_valid) { 8049 btrfs_warn(fs_info, "get dev_stats failed, not yet valid"); 8050 return -ENODEV; 8051 } else if (stats->flags & BTRFS_DEV_STATS_RESET) { 8052 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { 8053 if (stats->nr_items > i) 8054 stats->values[i] = 8055 btrfs_dev_stat_read_and_reset(dev, i); 8056 else 8057 btrfs_dev_stat_set(dev, i, 0); 8058 } 8059 btrfs_info(fs_info, "device stats zeroed by %s (%d)", 8060 current->comm, task_pid_nr(current)); 8061 } else { 8062 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) 8063 if (stats->nr_items > i) 8064 stats->values[i] = btrfs_dev_stat_read(dev, i); 8065 } 8066 if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX) 8067 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX; 8068 return 0; 8069 } 8070 8071 /* 8072 * Update the size and bytes used for each device where it changed. This is 8073 * delayed since we would otherwise get errors while writing out the 8074 * superblocks. 8075 * 8076 * Must be invoked during transaction commit. 8077 */ 8078 void btrfs_commit_device_sizes(struct btrfs_transaction *trans) 8079 { 8080 struct btrfs_device *curr, *next; 8081 8082 ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); 8083 8084 if (list_empty(&trans->dev_update_list)) 8085 return; 8086 8087 /* 8088 * We don't need the device_list_mutex here. This list is owned by the 8089 * transaction and the transaction must complete before the device is 8090 * released. 8091 */ 8092 mutex_lock(&trans->fs_info->chunk_mutex); 8093 list_for_each_entry_safe(curr, next, &trans->dev_update_list, 8094 post_commit_list) { 8095 list_del_init(&curr->post_commit_list); 8096 curr->commit_total_bytes = curr->disk_total_bytes; 8097 curr->commit_bytes_used = curr->bytes_used; 8098 } 8099 mutex_unlock(&trans->fs_info->chunk_mutex); 8100 } 8101 8102 /* 8103 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. 8104 */ 8105 int btrfs_bg_type_to_factor(u64 flags) 8106 { 8107 const int index = btrfs_bg_flags_to_raid_index(flags); 8108 8109 return btrfs_raid_array[index].ncopies; 8110 } 8111 8112 8113 8114 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, 8115 u64 chunk_offset, u64 devid, 8116 u64 physical_offset, u64 physical_len) 8117 { 8118 struct btrfs_dev_lookup_args args = { .devid = devid }; 8119 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 8120 struct extent_map *em; 8121 struct map_lookup *map; 8122 struct btrfs_device *dev; 8123 u64 stripe_len; 8124 bool found = false; 8125 int ret = 0; 8126 int i; 8127 8128 read_lock(&em_tree->lock); 8129 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 8130 read_unlock(&em_tree->lock); 8131 8132 if (!em) { 8133 btrfs_err(fs_info, 8134 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", 8135 physical_offset, devid); 8136 ret = -EUCLEAN; 8137 goto out; 8138 } 8139 8140 map = em->map_lookup; 8141 stripe_len = btrfs_calc_stripe_length(em); 8142 if (physical_len != stripe_len) { 8143 btrfs_err(fs_info, 8144 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", 8145 physical_offset, devid, em->start, physical_len, 8146 stripe_len); 8147 ret = -EUCLEAN; 8148 goto out; 8149 } 8150 8151 /* 8152 * Very old mkfs.btrfs (before v4.1) will not respect the reserved 8153 * space. Although kernel can handle it without problem, better to warn 8154 * the users. 8155 */ 8156 if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED) 8157 btrfs_warn(fs_info, 8158 "devid %llu physical %llu len %llu inside the reserved space", 8159 devid, physical_offset, physical_len); 8160 8161 for (i = 0; i < map->num_stripes; i++) { 8162 if (map->stripes[i].dev->devid == devid && 8163 map->stripes[i].physical == physical_offset) { 8164 found = true; 8165 if (map->verified_stripes >= map->num_stripes) { 8166 btrfs_err(fs_info, 8167 "too many dev extents for chunk %llu found", 8168 em->start); 8169 ret = -EUCLEAN; 8170 goto out; 8171 } 8172 map->verified_stripes++; 8173 break; 8174 } 8175 } 8176 if (!found) { 8177 btrfs_err(fs_info, 8178 "dev extent physical offset %llu devid %llu has no corresponding chunk", 8179 physical_offset, devid); 8180 ret = -EUCLEAN; 8181 } 8182 8183 /* Make sure no dev extent is beyond device boundary */ 8184 dev = btrfs_find_device(fs_info->fs_devices, &args); 8185 if (!dev) { 8186 btrfs_err(fs_info, "failed to find devid %llu", devid); 8187 ret = -EUCLEAN; 8188 goto out; 8189 } 8190 8191 if (physical_offset + physical_len > dev->disk_total_bytes) { 8192 btrfs_err(fs_info, 8193 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu", 8194 devid, physical_offset, physical_len, 8195 dev->disk_total_bytes); 8196 ret = -EUCLEAN; 8197 goto out; 8198 } 8199 8200 if (dev->zone_info) { 8201 u64 zone_size = dev->zone_info->zone_size; 8202 8203 if (!IS_ALIGNED(physical_offset, zone_size) || 8204 !IS_ALIGNED(physical_len, zone_size)) { 8205 btrfs_err(fs_info, 8206 "zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone", 8207 devid, physical_offset, physical_len); 8208 ret = -EUCLEAN; 8209 goto out; 8210 } 8211 } 8212 8213 out: 8214 free_extent_map(em); 8215 return ret; 8216 } 8217 8218 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) 8219 { 8220 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 8221 struct extent_map *em; 8222 struct rb_node *node; 8223 int ret = 0; 8224 8225 read_lock(&em_tree->lock); 8226 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { 8227 em = rb_entry(node, struct extent_map, rb_node); 8228 if (em->map_lookup->num_stripes != 8229 em->map_lookup->verified_stripes) { 8230 btrfs_err(fs_info, 8231 "chunk %llu has missing dev extent, have %d expect %d", 8232 em->start, em->map_lookup->verified_stripes, 8233 em->map_lookup->num_stripes); 8234 ret = -EUCLEAN; 8235 goto out; 8236 } 8237 } 8238 out: 8239 read_unlock(&em_tree->lock); 8240 return ret; 8241 } 8242 8243 /* 8244 * Ensure that all dev extents are mapped to correct chunk, otherwise 8245 * later chunk allocation/free would cause unexpected behavior. 8246 * 8247 * NOTE: This will iterate through the whole device tree, which should be of 8248 * the same size level as the chunk tree. This slightly increases mount time. 8249 */ 8250 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) 8251 { 8252 struct btrfs_path *path; 8253 struct btrfs_root *root = fs_info->dev_root; 8254 struct btrfs_key key; 8255 u64 prev_devid = 0; 8256 u64 prev_dev_ext_end = 0; 8257 int ret = 0; 8258 8259 /* 8260 * We don't have a dev_root because we mounted with ignorebadroots and 8261 * failed to load the root, so we want to skip the verification in this 8262 * case for sure. 8263 * 8264 * However if the dev root is fine, but the tree itself is corrupted 8265 * we'd still fail to mount. This verification is only to make sure 8266 * writes can happen safely, so instead just bypass this check 8267 * completely in the case of IGNOREBADROOTS. 8268 */ 8269 if (btrfs_test_opt(fs_info, IGNOREBADROOTS)) 8270 return 0; 8271 8272 key.objectid = 1; 8273 key.type = BTRFS_DEV_EXTENT_KEY; 8274 key.offset = 0; 8275 8276 path = btrfs_alloc_path(); 8277 if (!path) 8278 return -ENOMEM; 8279 8280 path->reada = READA_FORWARD; 8281 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 8282 if (ret < 0) 8283 goto out; 8284 8285 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 8286 ret = btrfs_next_leaf(root, path); 8287 if (ret < 0) 8288 goto out; 8289 /* No dev extents at all? Not good */ 8290 if (ret > 0) { 8291 ret = -EUCLEAN; 8292 goto out; 8293 } 8294 } 8295 while (1) { 8296 struct extent_buffer *leaf = path->nodes[0]; 8297 struct btrfs_dev_extent *dext; 8298 int slot = path->slots[0]; 8299 u64 chunk_offset; 8300 u64 physical_offset; 8301 u64 physical_len; 8302 u64 devid; 8303 8304 btrfs_item_key_to_cpu(leaf, &key, slot); 8305 if (key.type != BTRFS_DEV_EXTENT_KEY) 8306 break; 8307 devid = key.objectid; 8308 physical_offset = key.offset; 8309 8310 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); 8311 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext); 8312 physical_len = btrfs_dev_extent_length(leaf, dext); 8313 8314 /* Check if this dev extent overlaps with the previous one */ 8315 if (devid == prev_devid && physical_offset < prev_dev_ext_end) { 8316 btrfs_err(fs_info, 8317 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu", 8318 devid, physical_offset, prev_dev_ext_end); 8319 ret = -EUCLEAN; 8320 goto out; 8321 } 8322 8323 ret = verify_one_dev_extent(fs_info, chunk_offset, devid, 8324 physical_offset, physical_len); 8325 if (ret < 0) 8326 goto out; 8327 prev_devid = devid; 8328 prev_dev_ext_end = physical_offset + physical_len; 8329 8330 ret = btrfs_next_item(root, path); 8331 if (ret < 0) 8332 goto out; 8333 if (ret > 0) { 8334 ret = 0; 8335 break; 8336 } 8337 } 8338 8339 /* Ensure all chunks have corresponding dev extents */ 8340 ret = verify_chunk_dev_extent_mapping(fs_info); 8341 out: 8342 btrfs_free_path(path); 8343 return ret; 8344 } 8345 8346 /* 8347 * Check whether the given block group or device is pinned by any inode being 8348 * used as a swapfile. 8349 */ 8350 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) 8351 { 8352 struct btrfs_swapfile_pin *sp; 8353 struct rb_node *node; 8354 8355 spin_lock(&fs_info->swapfile_pins_lock); 8356 node = fs_info->swapfile_pins.rb_node; 8357 while (node) { 8358 sp = rb_entry(node, struct btrfs_swapfile_pin, node); 8359 if (ptr < sp->ptr) 8360 node = node->rb_left; 8361 else if (ptr > sp->ptr) 8362 node = node->rb_right; 8363 else 8364 break; 8365 } 8366 spin_unlock(&fs_info->swapfile_pins_lock); 8367 return node != NULL; 8368 } 8369 8370 static int relocating_repair_kthread(void *data) 8371 { 8372 struct btrfs_block_group *cache = data; 8373 struct btrfs_fs_info *fs_info = cache->fs_info; 8374 u64 target; 8375 int ret = 0; 8376 8377 target = cache->start; 8378 btrfs_put_block_group(cache); 8379 8380 sb_start_write(fs_info->sb); 8381 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { 8382 btrfs_info(fs_info, 8383 "zoned: skip relocating block group %llu to repair: EBUSY", 8384 target); 8385 sb_end_write(fs_info->sb); 8386 return -EBUSY; 8387 } 8388 8389 mutex_lock(&fs_info->reclaim_bgs_lock); 8390 8391 /* Ensure block group still exists */ 8392 cache = btrfs_lookup_block_group(fs_info, target); 8393 if (!cache) 8394 goto out; 8395 8396 if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) 8397 goto out; 8398 8399 ret = btrfs_may_alloc_data_chunk(fs_info, target); 8400 if (ret < 0) 8401 goto out; 8402 8403 btrfs_info(fs_info, 8404 "zoned: relocating block group %llu to repair IO failure", 8405 target); 8406 ret = btrfs_relocate_chunk(fs_info, target); 8407 8408 out: 8409 if (cache) 8410 btrfs_put_block_group(cache); 8411 mutex_unlock(&fs_info->reclaim_bgs_lock); 8412 btrfs_exclop_finish(fs_info); 8413 sb_end_write(fs_info->sb); 8414 8415 return ret; 8416 } 8417 8418 bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) 8419 { 8420 struct btrfs_block_group *cache; 8421 8422 if (!btrfs_is_zoned(fs_info)) 8423 return false; 8424 8425 /* Do not attempt to repair in degraded state */ 8426 if (btrfs_test_opt(fs_info, DEGRADED)) 8427 return true; 8428 8429 cache = btrfs_lookup_block_group(fs_info, logical); 8430 if (!cache) 8431 return true; 8432 8433 if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) { 8434 btrfs_put_block_group(cache); 8435 return true; 8436 } 8437 8438 kthread_run(relocating_repair_kthread, cache, 8439 "btrfs-relocating-repair"); 8440 8441 return true; 8442 } 8443 8444 int __init btrfs_bioset_init(void) 8445 { 8446 if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, 8447 offsetof(struct btrfs_bio, bio), 8448 BIOSET_NEED_BVECS)) 8449 return -ENOMEM; 8450 return 0; 8451 } 8452 8453 void __cold btrfs_bioset_exit(void) 8454 { 8455 bioset_exit(&btrfs_bioset); 8456 } 8457