// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"

#define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
					 BTRFS_BLOCK_GROUP_RAID10 | \
					 BTRFS_BLOCK_GROUP_RAID56_MASK)

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity	= 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity	= 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity	= 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 0,
		.raid_name	= "single",
		.bg_flag	= 0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity	= 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};

/*
 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 * can be used as an index to access btrfs_raid_array[].
 */
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_RAID10)
		return BTRFS_RAID_RAID10;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
		return BTRFS_RAID_RAID1;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
		return BTRFS_RAID_RAID1C3;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
		return BTRFS_RAID_RAID1C4;
	else if (flags & BTRFS_BLOCK_GROUP_DUP)
		return BTRFS_RAID_DUP;
	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
		return BTRFS_RAID_RAID0;
	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
		return BTRFS_RAID_RAID5;
	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
		return BTRFS_RAID_RAID6;

	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[index].raid_name;
}

/*
 * Fill @buf with a textual description of @bg_flags, no more than @size_buf
 * bytes including the terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)					\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide a
	 * sufficiently large buffer.
	 */
out_overflow:;
}

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
			     enum btrfs_map_op op,
			     u64 logical, u64 *length,
			     struct btrfs_io_context **bioc_ret,
			     int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of
 * devices and low-level structures like chunks but not block groups, extents
 * or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by e.g.
 * the scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, i.e. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * It is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of an exclusive operation is set and cleared atomically.
 * During the course of the Paused state, fs_info::exclusive_operation remains
 * set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:		if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:	if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
						 const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);
	if (fsid)
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

	if (metadata_fsid)
		memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
	else if (fsid)
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

	return fs_devs;
}

void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	bio_put(device->flush_bio);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split-brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (metadata_fsid) {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
			    && memcmp(metadata_fsid, fs_devices->metadata_uuid,
				      BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		} else {
			if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				return fs_devices;
		}
	}
	return NULL;
}

static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
				struct btrfs_super_block *disk_super)
{

	struct btrfs_fs_devices *fs_devices;

	/*
	 * Handle scanned device having completed its fsid change but
	 * belonging to a fs_devices that was created by first scanning
	 * a device which didn't have its fsid/metadata_uuid changed
	 * at all and the CHANGING_FSID_V2 flag set.
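	 * In that state fs_devices::fsid and fs_devices::metadata_uuid are
	 * still equal and both match the metadata_uuid of the scanned super
	 * block, which is what the first loop below checks for.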
471 */ 472 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 473 if (fs_devices->fsid_change && 474 memcmp(disk_super->metadata_uuid, fs_devices->fsid, 475 BTRFS_FSID_SIZE) == 0 && 476 memcmp(fs_devices->fsid, fs_devices->metadata_uuid, 477 BTRFS_FSID_SIZE) == 0) { 478 return fs_devices; 479 } 480 } 481 /* 482 * Handle scanned device having completed its fsid change but 483 * belonging to a fs_devices that was created by a device that 484 * has an outdated pair of fsid/metadata_uuid and 485 * CHANGING_FSID_V2 flag set. 486 */ 487 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 488 if (fs_devices->fsid_change && 489 memcmp(fs_devices->metadata_uuid, 490 fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && 491 memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid, 492 BTRFS_FSID_SIZE) == 0) { 493 return fs_devices; 494 } 495 } 496 497 return find_fsid(disk_super->fsid, disk_super->metadata_uuid); 498 } 499 500 501 static int 502 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, 503 int flush, struct block_device **bdev, 504 struct btrfs_super_block **disk_super) 505 { 506 int ret; 507 508 *bdev = blkdev_get_by_path(device_path, flags, holder); 509 510 if (IS_ERR(*bdev)) { 511 ret = PTR_ERR(*bdev); 512 goto error; 513 } 514 515 if (flush) 516 sync_blockdev(*bdev); 517 ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); 518 if (ret) { 519 blkdev_put(*bdev, flags); 520 goto error; 521 } 522 invalidate_bdev(*bdev); 523 *disk_super = btrfs_read_dev_super(*bdev); 524 if (IS_ERR(*disk_super)) { 525 ret = PTR_ERR(*disk_super); 526 blkdev_put(*bdev, flags); 527 goto error; 528 } 529 530 return 0; 531 532 error: 533 *bdev = NULL; 534 return ret; 535 } 536 537 /* 538 * Check if the device in the path matches the device in the given struct device. 539 * 540 * Returns: 541 * true If it is the same device. 542 * false If it is not the same device or on error. 543 */ 544 static bool device_matched(const struct btrfs_device *device, dev_t dev_new) 545 { 546 char *device_name; 547 dev_t dev_old; 548 int ret; 549 550 /* 551 * If we are looking for a device with the matching dev_t, then skip 552 * device without a name (a missing device). 553 */ 554 if (!device->name) 555 return false; 556 557 device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL); 558 if (!device_name) 559 return false; 560 561 rcu_read_lock(); 562 scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name)); 563 rcu_read_unlock(); 564 565 ret = lookup_bdev(device_name, &dev_old); 566 kfree(device_name); 567 if (ret) 568 return false; 569 570 if (dev_old == dev_new) 571 return true; 572 573 return false; 574 } 575 576 /** 577 * Search and remove all stale devices (which are not mounted). 578 * When both inputs are NULL, it will search and release all stale devices. 579 * 580 * @devt: Optional. When provided will it release all unmounted devices 581 * matching this devt only. 582 * @skip_device: Optional. Will skip this device when searching for the stale 583 * devices. 584 * 585 * Return: 0 for success or if @devt is 0. 586 * -EBUSY if @devt is a mounted device. 587 * -ENOENT if @devt does not match any device in the list. 
588 */ 589 static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device) 590 { 591 struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; 592 struct btrfs_device *device, *tmp_device; 593 int ret = 0; 594 595 lockdep_assert_held(&uuid_mutex); 596 597 if (devt) 598 ret = -ENOENT; 599 600 list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { 601 602 mutex_lock(&fs_devices->device_list_mutex); 603 list_for_each_entry_safe(device, tmp_device, 604 &fs_devices->devices, dev_list) { 605 if (skip_device && skip_device == device) 606 continue; 607 if (devt && !device_matched(device, devt)) 608 continue; 609 if (fs_devices->opened) { 610 /* for an already deleted device return 0 */ 611 if (devt && ret != 0) 612 ret = -EBUSY; 613 break; 614 } 615 616 /* delete the stale device */ 617 fs_devices->num_devices--; 618 list_del(&device->dev_list); 619 btrfs_free_device(device); 620 621 ret = 0; 622 } 623 mutex_unlock(&fs_devices->device_list_mutex); 624 625 if (fs_devices->num_devices == 0) { 626 btrfs_sysfs_remove_fsid(fs_devices); 627 list_del(&fs_devices->fs_list); 628 free_fs_devices(fs_devices); 629 } 630 } 631 632 return ret; 633 } 634 635 /* 636 * This is only used on mount, and we are protected from competing things 637 * messing with our fs_devices by the uuid_mutex, thus we do not need the 638 * fs_devices->device_list_mutex here. 639 */ 640 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, 641 struct btrfs_device *device, fmode_t flags, 642 void *holder) 643 { 644 struct request_queue *q; 645 struct block_device *bdev; 646 struct btrfs_super_block *disk_super; 647 u64 devid; 648 int ret; 649 650 if (device->bdev) 651 return -EINVAL; 652 if (!device->name) 653 return -EINVAL; 654 655 ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, 656 &bdev, &disk_super); 657 if (ret) 658 return ret; 659 660 devid = btrfs_stack_device_id(&disk_super->dev_item); 661 if (devid != device->devid) 662 goto error_free_page; 663 664 if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE)) 665 goto error_free_page; 666 667 device->generation = btrfs_super_generation(disk_super); 668 669 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 670 if (btrfs_super_incompat_flags(disk_super) & 671 BTRFS_FEATURE_INCOMPAT_METADATA_UUID) { 672 pr_err( 673 "BTRFS: Invalid seeding and uuid-changed device detected\n"); 674 goto error_free_page; 675 } 676 677 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 678 fs_devices->seeding = true; 679 } else { 680 if (bdev_read_only(bdev)) 681 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 682 else 683 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 684 } 685 686 q = bdev_get_queue(bdev); 687 if (!blk_queue_nonrot(q)) 688 fs_devices->rotating = true; 689 690 device->bdev = bdev; 691 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 692 device->mode = flags; 693 694 fs_devices->open_devices++; 695 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 696 device->devid != BTRFS_DEV_REPLACE_DEVID) { 697 fs_devices->rw_devices++; 698 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list); 699 } 700 btrfs_release_disk_super(disk_super); 701 702 return 0; 703 704 error_free_page: 705 btrfs_release_disk_super(disk_super); 706 blkdev_put(bdev, flags); 707 708 return -EINVAL; 709 } 710 711 /* 712 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices 713 * being created with a disk that has already completed its fsid 
change. Such 714 * disk can belong to an fs which has its FSID changed or to one which doesn't. 715 * Handle both cases here. 716 */ 717 static struct btrfs_fs_devices *find_fsid_inprogress( 718 struct btrfs_super_block *disk_super) 719 { 720 struct btrfs_fs_devices *fs_devices; 721 722 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 723 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, 724 BTRFS_FSID_SIZE) != 0 && 725 memcmp(fs_devices->metadata_uuid, disk_super->fsid, 726 BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) { 727 return fs_devices; 728 } 729 } 730 731 return find_fsid(disk_super->fsid, NULL); 732 } 733 734 735 static struct btrfs_fs_devices *find_fsid_changed( 736 struct btrfs_super_block *disk_super) 737 { 738 struct btrfs_fs_devices *fs_devices; 739 740 /* 741 * Handles the case where scanned device is part of an fs that had 742 * multiple successful changes of FSID but currently device didn't 743 * observe it. Meaning our fsid will be different than theirs. We need 744 * to handle two subcases : 745 * 1 - The fs still continues to have different METADATA/FSID uuids. 746 * 2 - The fs is switched back to its original FSID (METADATA/FSID 747 * are equal). 748 */ 749 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 750 /* Changed UUIDs */ 751 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, 752 BTRFS_FSID_SIZE) != 0 && 753 memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, 754 BTRFS_FSID_SIZE) == 0 && 755 memcmp(fs_devices->fsid, disk_super->fsid, 756 BTRFS_FSID_SIZE) != 0) 757 return fs_devices; 758 759 /* Unchanged UUIDs */ 760 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, 761 BTRFS_FSID_SIZE) == 0 && 762 memcmp(fs_devices->fsid, disk_super->metadata_uuid, 763 BTRFS_FSID_SIZE) == 0) 764 return fs_devices; 765 } 766 767 return NULL; 768 } 769 770 static struct btrfs_fs_devices *find_fsid_reverted_metadata( 771 struct btrfs_super_block *disk_super) 772 { 773 struct btrfs_fs_devices *fs_devices; 774 775 /* 776 * Handle the case where the scanned device is part of an fs whose last 777 * metadata UUID change reverted it to the original FSID. At the same 778 * time * fs_devices was first created by another constitutent device 779 * which didn't fully observe the operation. This results in an 780 * btrfs_fs_devices created with metadata/fsid different AND 781 * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the 782 * fs_devices equal to the FSID of the disk. 
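	 * The checks below mirror exactly that state: fsid and metadata_uuid
	 * differ, the on-disk fsid matches fs_devices::metadata_uuid, and
	 * fsid_change is set.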
783 */ 784 list_for_each_entry(fs_devices, &fs_uuids, fs_list) { 785 if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, 786 BTRFS_FSID_SIZE) != 0 && 787 memcmp(fs_devices->metadata_uuid, disk_super->fsid, 788 BTRFS_FSID_SIZE) == 0 && 789 fs_devices->fsid_change) 790 return fs_devices; 791 } 792 793 return NULL; 794 } 795 /* 796 * Add new device to list of registered devices 797 * 798 * Returns: 799 * device pointer which was just added or updated when successful 800 * error pointer when failed 801 */ 802 static noinline struct btrfs_device *device_list_add(const char *path, 803 struct btrfs_super_block *disk_super, 804 bool *new_device_added) 805 { 806 struct btrfs_device *device; 807 struct btrfs_fs_devices *fs_devices = NULL; 808 struct rcu_string *name; 809 u64 found_transid = btrfs_super_generation(disk_super); 810 u64 devid = btrfs_stack_device_id(&disk_super->dev_item); 811 dev_t path_devt; 812 int error; 813 bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & 814 BTRFS_FEATURE_INCOMPAT_METADATA_UUID); 815 bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & 816 BTRFS_SUPER_FLAG_CHANGING_FSID_V2); 817 818 error = lookup_bdev(path, &path_devt); 819 if (error) 820 return ERR_PTR(error); 821 822 if (fsid_change_in_progress) { 823 if (!has_metadata_uuid) 824 fs_devices = find_fsid_inprogress(disk_super); 825 else 826 fs_devices = find_fsid_changed(disk_super); 827 } else if (has_metadata_uuid) { 828 fs_devices = find_fsid_with_metadata_uuid(disk_super); 829 } else { 830 fs_devices = find_fsid_reverted_metadata(disk_super); 831 if (!fs_devices) 832 fs_devices = find_fsid(disk_super->fsid, NULL); 833 } 834 835 836 if (!fs_devices) { 837 if (has_metadata_uuid) 838 fs_devices = alloc_fs_devices(disk_super->fsid, 839 disk_super->metadata_uuid); 840 else 841 fs_devices = alloc_fs_devices(disk_super->fsid, NULL); 842 843 if (IS_ERR(fs_devices)) 844 return ERR_CAST(fs_devices); 845 846 fs_devices->fsid_change = fsid_change_in_progress; 847 848 mutex_lock(&fs_devices->device_list_mutex); 849 list_add(&fs_devices->fs_list, &fs_uuids); 850 851 device = NULL; 852 } else { 853 struct btrfs_dev_lookup_args args = { 854 .devid = devid, 855 .uuid = disk_super->dev_item.uuid, 856 }; 857 858 mutex_lock(&fs_devices->device_list_mutex); 859 device = btrfs_find_device(fs_devices, &args); 860 861 /* 862 * If this disk has been pulled into an fs devices created by 863 * a device which had the CHANGING_FSID_V2 flag then replace the 864 * metadata_uuid/fsid values of the fs_devices. 
865 */ 866 if (fs_devices->fsid_change && 867 found_transid > fs_devices->latest_generation) { 868 memcpy(fs_devices->fsid, disk_super->fsid, 869 BTRFS_FSID_SIZE); 870 871 if (has_metadata_uuid) 872 memcpy(fs_devices->metadata_uuid, 873 disk_super->metadata_uuid, 874 BTRFS_FSID_SIZE); 875 else 876 memcpy(fs_devices->metadata_uuid, 877 disk_super->fsid, BTRFS_FSID_SIZE); 878 879 fs_devices->fsid_change = false; 880 } 881 } 882 883 if (!device) { 884 if (fs_devices->opened) { 885 mutex_unlock(&fs_devices->device_list_mutex); 886 return ERR_PTR(-EBUSY); 887 } 888 889 device = btrfs_alloc_device(NULL, &devid, 890 disk_super->dev_item.uuid); 891 if (IS_ERR(device)) { 892 mutex_unlock(&fs_devices->device_list_mutex); 893 /* we can safely leave the fs_devices entry around */ 894 return device; 895 } 896 897 name = rcu_string_strdup(path, GFP_NOFS); 898 if (!name) { 899 btrfs_free_device(device); 900 mutex_unlock(&fs_devices->device_list_mutex); 901 return ERR_PTR(-ENOMEM); 902 } 903 rcu_assign_pointer(device->name, name); 904 device->devt = path_devt; 905 906 list_add_rcu(&device->dev_list, &fs_devices->devices); 907 fs_devices->num_devices++; 908 909 device->fs_devices = fs_devices; 910 *new_device_added = true; 911 912 if (disk_super->label[0]) 913 pr_info( 914 "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n", 915 disk_super->label, devid, found_transid, path, 916 current->comm, task_pid_nr(current)); 917 else 918 pr_info( 919 "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n", 920 disk_super->fsid, devid, found_transid, path, 921 current->comm, task_pid_nr(current)); 922 923 } else if (!device->name || strcmp(device->name->str, path)) { 924 /* 925 * When FS is already mounted. 926 * 1. If you are here and if the device->name is NULL that 927 * means this device was missing at time of FS mount. 928 * 2. If you are here and if the device->name is different 929 * from 'path' that means either 930 * a. The same device disappeared and reappeared with 931 * different name. or 932 * b. The missing-disk-which-was-replaced, has 933 * reappeared now. 934 * 935 * We must allow 1 and 2a above. But 2b would be a spurious 936 * and unintentional. 937 * 938 * Further in case of 1 and 2a above, the disk at 'path' 939 * would have missed some transaction when it was away and 940 * in case of 2a the stale bdev has to be updated as well. 941 * 2b must not be allowed at all time. 942 */ 943 944 /* 945 * For now, we do allow update to btrfs_fs_device through the 946 * btrfs dev scan cli after FS has been mounted. We're still 947 * tracking a problem where systems fail mount by subvolume id 948 * when we reject replacement on a mounted FS. 949 */ 950 if (!fs_devices->opened && found_transid < device->generation) { 951 /* 952 * That is if the FS is _not_ mounted and if you 953 * are here, that means there is more than one 954 * disk with same uuid and devid.We keep the one 955 * with larger generation number or the last-in if 956 * generation are equal. 957 */ 958 mutex_unlock(&fs_devices->device_list_mutex); 959 return ERR_PTR(-EEXIST); 960 } 961 962 /* 963 * We are going to replace the device path for a given devid, 964 * make sure it's the same device if the device is mounted 965 */ 966 if (device->bdev) { 967 if (device->devt != path_devt) { 968 mutex_unlock(&fs_devices->device_list_mutex); 969 /* 970 * device->fs_info may not be reliable here, so 971 * pass in a NULL instead. 
This avoids a 972 * possible use-after-free when the fs_info and 973 * fs_info->sb are already torn down. 974 */ 975 btrfs_warn_in_rcu(NULL, 976 "duplicate device %s devid %llu generation %llu scanned by %s (%d)", 977 path, devid, found_transid, 978 current->comm, 979 task_pid_nr(current)); 980 return ERR_PTR(-EEXIST); 981 } 982 btrfs_info_in_rcu(device->fs_info, 983 "devid %llu device path %s changed to %s scanned by %s (%d)", 984 devid, rcu_str_deref(device->name), 985 path, current->comm, 986 task_pid_nr(current)); 987 } 988 989 name = rcu_string_strdup(path, GFP_NOFS); 990 if (!name) { 991 mutex_unlock(&fs_devices->device_list_mutex); 992 return ERR_PTR(-ENOMEM); 993 } 994 rcu_string_free(device->name); 995 rcu_assign_pointer(device->name, name); 996 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 997 fs_devices->missing_devices--; 998 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 999 } 1000 device->devt = path_devt; 1001 } 1002 1003 /* 1004 * Unmount does not free the btrfs_device struct but would zero 1005 * generation along with most of the other members. So just update 1006 * it back. We need it to pick the disk with largest generation 1007 * (as above). 1008 */ 1009 if (!fs_devices->opened) { 1010 device->generation = found_transid; 1011 fs_devices->latest_generation = max_t(u64, found_transid, 1012 fs_devices->latest_generation); 1013 } 1014 1015 fs_devices->total_devices = btrfs_super_num_devices(disk_super); 1016 1017 mutex_unlock(&fs_devices->device_list_mutex); 1018 return device; 1019 } 1020 1021 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) 1022 { 1023 struct btrfs_fs_devices *fs_devices; 1024 struct btrfs_device *device; 1025 struct btrfs_device *orig_dev; 1026 int ret = 0; 1027 1028 lockdep_assert_held(&uuid_mutex); 1029 1030 fs_devices = alloc_fs_devices(orig->fsid, NULL); 1031 if (IS_ERR(fs_devices)) 1032 return fs_devices; 1033 1034 fs_devices->total_devices = orig->total_devices; 1035 1036 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 1037 struct rcu_string *name; 1038 1039 device = btrfs_alloc_device(NULL, &orig_dev->devid, 1040 orig_dev->uuid); 1041 if (IS_ERR(device)) { 1042 ret = PTR_ERR(device); 1043 goto error; 1044 } 1045 1046 /* 1047 * This is ok to do without rcu read locked because we hold the 1048 * uuid mutex so nothing we touch in here is going to disappear. 1049 */ 1050 if (orig_dev->name) { 1051 name = rcu_string_strdup(orig_dev->name->str, 1052 GFP_KERNEL); 1053 if (!name) { 1054 btrfs_free_device(device); 1055 ret = -ENOMEM; 1056 goto error; 1057 } 1058 rcu_assign_pointer(device->name, name); 1059 } 1060 1061 list_add(&device->dev_list, &fs_devices->devices); 1062 device->fs_devices = fs_devices; 1063 fs_devices->num_devices++; 1064 } 1065 return fs_devices; 1066 error: 1067 free_fs_devices(fs_devices); 1068 return ERR_PTR(ret); 1069 } 1070 1071 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, 1072 struct btrfs_device **latest_dev) 1073 { 1074 struct btrfs_device *device, *next; 1075 1076 /* This is the initialized path, it is safe to release the devices. 
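	 * Any device whose BTRFS_DEV_STATE_IN_FS_METADATA bit is not set was
	 * not found in the fs metadata and is closed and freed by the loop
	 * below.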
	 */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
		 * in btrfs_init_dev_replace() so just continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}

}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove any device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_fs_devices *seed_dev;

	mutex_lock(&uuid_mutex);
	__btrfs_free_extra_devids(fs_devices, &latest_dev);

	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
		__btrfs_free_extra_devids(seed_dev, &latest_dev);

	fs_devices->latest_dev = latest_dev;

	mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
		clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		fs_devices->missing_devices--;
	}

	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	btrfs_destroy_dev_zone_info(device);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/*
	 * Reset the flush error record. We might have a transient flush error
	 * in this mount, and if so we aborted the current transaction and set
	 * the fs to an error state, guaranteeing no super blocks can be
	 * further committed. However that error might be transient and if we
	 * unmount the filesystem and mount it again, we should allow the mount
	 * to succeed (btrfs_check_rw_degradable() should not fail) - if after
	 * mounting the filesystem again we still get flush errors, then we
	 * will again abort any transaction and set the error state,
	 * guaranteeing no commits of unsafe super blocks.
	 */
	device->last_flush_error = 0;

	/* Verify the device is back in a pristine state */
	ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	ASSERT(list_empty(&device->dev_alloc_list));
	ASSERT(list_empty(&device->post_commit_list));
}

static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	lockdep_assert_held(&uuid_mutex);

	if (--fs_devices->opened > 0)
		return;

	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = false;
	fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	LIST_HEAD(list);
	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
	close_fs_devices(fs_devices);
	if (!fs_devices->opened)
		list_splice_init(&fs_devices->seed_list, &list);

	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;

	flags |= FMODE_EXCL;

	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret;

		ret = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
		} else if (ret == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
	}
	if (fs_devices->open_devices == 0)
		return -EINVAL;

	fs_devices->opened = 1;
	fs_devices->latest_dev = latest_dev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;

	return 0;
}

static int devid_cmp(void *priv, const struct list_head *a,
		     const struct list_head *b)
{
	const struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here because opening the
	 * underlying device takes further locks like open_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex.
	 */

	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}

	return ret;
}

void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	struct page *page = virt_to_page(super);

	put_page(page);
}

static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr, u64 bytenr_orig)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}

int btrfs_forget_devices(dev_t devt)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(devt, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount
 * path and we are not allowed to call set_blocksize during the scan. The
 * superblock is read via the pagecache.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
					   void *holder)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct block_device *bdev;
	u64 bytenr, bytenr_orig;
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	flags |= FMODE_EXCL;

	bdev = blkdev_get_by_path(path, flags, holder);
	if (IS_ERR(bdev))
		return ERR_CAST(bdev);

	bytenr_orig = btrfs_sb_offset(0);
	ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
	if (ret) {
		device = ERR_PTR(ret);
		goto error_bdev_put;
	}

	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device) && new_device_added)
		btrfs_free_stale_devices(device->devt, device);

	btrfs_release_disk_super(disk_super);

error_bdev_put:
	blkdev_put(bdev, flags);

	return device;
}

/*
 * Try to find a chunk that intersects the [start, start + len] range and when
 * one such is found, record the end of it in *start
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
				    u64 len)
{
	u64 physical_start, physical_end;

	lockdep_assert_held(&device->fs_info->chunk_mutex);

	if (!find_first_extent_bit(&device->alloc_state, *start,
				   &physical_start, &physical_end,
				   CHUNK_ALLOCATED, NULL)) {

		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
			     physical_end - physical_start)) {
			*start = physical_end + 1;
			return true;
		}
	}
	return false;
}

static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
{
	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		/*
		 * We don't want to overwrite the superblock on the drive nor
		 * any area used by the boot loader (grub for example), so we
		 * make sure to start at an offset of at least 1MB.
		 */
		return max_t(u64, start, SZ_1M);
	case BTRFS_CHUNK_ALLOC_ZONED:
		/*
		 * We don't care about the starting region like the regular
		 * allocator, because we anyway use/reserve the first two zones
		 * for superblock logging.
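		 * Chunks on zoned devices therefore only need to be aligned
		 * to the device's zone size, which the ALIGN() below takes
		 * care of.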
1465 */ 1466 return ALIGN(start, device->zone_info->zone_size); 1467 default: 1468 BUG(); 1469 } 1470 } 1471 1472 static bool dev_extent_hole_check_zoned(struct btrfs_device *device, 1473 u64 *hole_start, u64 *hole_size, 1474 u64 num_bytes) 1475 { 1476 u64 zone_size = device->zone_info->zone_size; 1477 u64 pos; 1478 int ret; 1479 bool changed = false; 1480 1481 ASSERT(IS_ALIGNED(*hole_start, zone_size)); 1482 1483 while (*hole_size > 0) { 1484 pos = btrfs_find_allocatable_zones(device, *hole_start, 1485 *hole_start + *hole_size, 1486 num_bytes); 1487 if (pos != *hole_start) { 1488 *hole_size = *hole_start + *hole_size - pos; 1489 *hole_start = pos; 1490 changed = true; 1491 if (*hole_size < num_bytes) 1492 break; 1493 } 1494 1495 ret = btrfs_ensure_empty_zones(device, pos, num_bytes); 1496 1497 /* Range is ensured to be empty */ 1498 if (!ret) 1499 return changed; 1500 1501 /* Given hole range was invalid (outside of device) */ 1502 if (ret == -ERANGE) { 1503 *hole_start += *hole_size; 1504 *hole_size = 0; 1505 return true; 1506 } 1507 1508 *hole_start += zone_size; 1509 *hole_size -= zone_size; 1510 changed = true; 1511 } 1512 1513 return changed; 1514 } 1515 1516 /** 1517 * dev_extent_hole_check - check if specified hole is suitable for allocation 1518 * @device: the device which we have the hole 1519 * @hole_start: starting position of the hole 1520 * @hole_size: the size of the hole 1521 * @num_bytes: the size of the free space that we need 1522 * 1523 * This function may modify @hole_start and @hole_size to reflect the suitable 1524 * position for allocation. Returns 1 if hole position is updated, 0 otherwise. 1525 */ 1526 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, 1527 u64 *hole_size, u64 num_bytes) 1528 { 1529 bool changed = false; 1530 u64 hole_end = *hole_start + *hole_size; 1531 1532 for (;;) { 1533 /* 1534 * Check before we set max_hole_start, otherwise we could end up 1535 * sending back this offset anyway. 1536 */ 1537 if (contains_pending_extent(device, hole_start, *hole_size)) { 1538 if (hole_end >= *hole_start) 1539 *hole_size = hole_end - *hole_start; 1540 else 1541 *hole_size = 0; 1542 changed = true; 1543 } 1544 1545 switch (device->fs_devices->chunk_alloc_policy) { 1546 case BTRFS_CHUNK_ALLOC_REGULAR: 1547 /* No extra check */ 1548 break; 1549 case BTRFS_CHUNK_ALLOC_ZONED: 1550 if (dev_extent_hole_check_zoned(device, hole_start, 1551 hole_size, num_bytes)) { 1552 changed = true; 1553 /* 1554 * The changed hole can contain pending extent. 1555 * Loop again to check that. 1556 */ 1557 continue; 1558 } 1559 break; 1560 default: 1561 BUG(); 1562 } 1563 1564 break; 1565 } 1566 1567 return changed; 1568 } 1569 1570 /* 1571 * find_free_dev_extent_start - find free space in the specified device 1572 * @device: the device which we search the free space in 1573 * @num_bytes: the size of the free space that we need 1574 * @search_start: the position from which to begin the search 1575 * @start: store the start of the free space. 1576 * @len: the size of the free space. that we find, or the size 1577 * of the max free space if we don't find suitable free space 1578 * 1579 * this uses a pretty simple search, the expectation is that it is 1580 * called very infrequently and that a given device has a small number 1581 * of extents 1582 * 1583 * @start is used to store the start of the free space if we find. But if we 1584 * don't find suitable free space, it will be used to store the start position 1585 * of the max free space. 
1586 * 1587 * @len is used to store the size of the free space that we find. 1588 * But if we don't find suitable free space, it is used to store the size of 1589 * the max free space. 1590 * 1591 * NOTE: This function will search *commit* root of device tree, and does extra 1592 * check to ensure dev extents are not double allocated. 1593 * This makes the function safe to allocate dev extents but may not report 1594 * correct usable device space, as device extent freed in current transaction 1595 * is not reported as available. 1596 */ 1597 static int find_free_dev_extent_start(struct btrfs_device *device, 1598 u64 num_bytes, u64 search_start, u64 *start, 1599 u64 *len) 1600 { 1601 struct btrfs_fs_info *fs_info = device->fs_info; 1602 struct btrfs_root *root = fs_info->dev_root; 1603 struct btrfs_key key; 1604 struct btrfs_dev_extent *dev_extent; 1605 struct btrfs_path *path; 1606 u64 hole_size; 1607 u64 max_hole_start; 1608 u64 max_hole_size; 1609 u64 extent_end; 1610 u64 search_end = device->total_bytes; 1611 int ret; 1612 int slot; 1613 struct extent_buffer *l; 1614 1615 search_start = dev_extent_search_start(device, search_start); 1616 1617 WARN_ON(device->zone_info && 1618 !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); 1619 1620 path = btrfs_alloc_path(); 1621 if (!path) 1622 return -ENOMEM; 1623 1624 max_hole_start = search_start; 1625 max_hole_size = 0; 1626 1627 again: 1628 if (search_start >= search_end || 1629 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1630 ret = -ENOSPC; 1631 goto out; 1632 } 1633 1634 path->reada = READA_FORWARD; 1635 path->search_commit_root = 1; 1636 path->skip_locking = 1; 1637 1638 key.objectid = device->devid; 1639 key.offset = search_start; 1640 key.type = BTRFS_DEV_EXTENT_KEY; 1641 1642 ret = btrfs_search_backwards(root, &key, path); 1643 if (ret < 0) 1644 goto out; 1645 1646 while (1) { 1647 l = path->nodes[0]; 1648 slot = path->slots[0]; 1649 if (slot >= btrfs_header_nritems(l)) { 1650 ret = btrfs_next_leaf(root, path); 1651 if (ret == 0) 1652 continue; 1653 if (ret < 0) 1654 goto out; 1655 1656 break; 1657 } 1658 btrfs_item_key_to_cpu(l, &key, slot); 1659 1660 if (key.objectid < device->devid) 1661 goto next; 1662 1663 if (key.objectid > device->devid) 1664 break; 1665 1666 if (key.type != BTRFS_DEV_EXTENT_KEY) 1667 goto next; 1668 1669 if (key.offset > search_start) { 1670 hole_size = key.offset - search_start; 1671 dev_extent_hole_check(device, &search_start, &hole_size, 1672 num_bytes); 1673 1674 if (hole_size > max_hole_size) { 1675 max_hole_start = search_start; 1676 max_hole_size = hole_size; 1677 } 1678 1679 /* 1680 * If this free space is greater than which we need, 1681 * it must be the max free space that we have found 1682 * until now, so max_hole_start must point to the start 1683 * of this free space and the length of this free space 1684 * is stored in max_hole_size. Thus, we return 1685 * max_hole_start and max_hole_size and go back to the 1686 * caller. 1687 */ 1688 if (hole_size >= num_bytes) { 1689 ret = 0; 1690 goto out; 1691 } 1692 } 1693 1694 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1695 extent_end = key.offset + btrfs_dev_extent_length(l, 1696 dev_extent); 1697 if (extent_end > search_start) 1698 search_start = extent_end; 1699 next: 1700 path->slots[0]++; 1701 cond_resched(); 1702 } 1703 1704 /* 1705 * At this point, search_start should be the end of 1706 * allocated dev extents, and when shrinking the device, 1707 * search_end may be smaller than search_start. 
1708 */ 1709 if (search_end > search_start) { 1710 hole_size = search_end - search_start; 1711 if (dev_extent_hole_check(device, &search_start, &hole_size, 1712 num_bytes)) { 1713 btrfs_release_path(path); 1714 goto again; 1715 } 1716 1717 if (hole_size > max_hole_size) { 1718 max_hole_start = search_start; 1719 max_hole_size = hole_size; 1720 } 1721 } 1722 1723 /* See above. */ 1724 if (max_hole_size < num_bytes) 1725 ret = -ENOSPC; 1726 else 1727 ret = 0; 1728 1729 out: 1730 btrfs_free_path(path); 1731 *start = max_hole_start; 1732 if (len) 1733 *len = max_hole_size; 1734 return ret; 1735 } 1736 1737 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, 1738 u64 *start, u64 *len) 1739 { 1740 /* FIXME use last free of some kind */ 1741 return find_free_dev_extent_start(device, num_bytes, 0, start, len); 1742 } 1743 1744 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1745 struct btrfs_device *device, 1746 u64 start, u64 *dev_extent_len) 1747 { 1748 struct btrfs_fs_info *fs_info = device->fs_info; 1749 struct btrfs_root *root = fs_info->dev_root; 1750 int ret; 1751 struct btrfs_path *path; 1752 struct btrfs_key key; 1753 struct btrfs_key found_key; 1754 struct extent_buffer *leaf = NULL; 1755 struct btrfs_dev_extent *extent = NULL; 1756 1757 path = btrfs_alloc_path(); 1758 if (!path) 1759 return -ENOMEM; 1760 1761 key.objectid = device->devid; 1762 key.offset = start; 1763 key.type = BTRFS_DEV_EXTENT_KEY; 1764 again: 1765 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1766 if (ret > 0) { 1767 ret = btrfs_previous_item(root, path, key.objectid, 1768 BTRFS_DEV_EXTENT_KEY); 1769 if (ret) 1770 goto out; 1771 leaf = path->nodes[0]; 1772 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1773 extent = btrfs_item_ptr(leaf, path->slots[0], 1774 struct btrfs_dev_extent); 1775 BUG_ON(found_key.offset > start || found_key.offset + 1776 btrfs_dev_extent_length(leaf, extent) < start); 1777 key = found_key; 1778 btrfs_release_path(path); 1779 goto again; 1780 } else if (ret == 0) { 1781 leaf = path->nodes[0]; 1782 extent = btrfs_item_ptr(leaf, path->slots[0], 1783 struct btrfs_dev_extent); 1784 } else { 1785 goto out; 1786 } 1787 1788 *dev_extent_len = btrfs_dev_extent_length(leaf, extent); 1789 1790 ret = btrfs_del_item(trans, root, path); 1791 if (ret == 0) 1792 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 1793 out: 1794 btrfs_free_path(path); 1795 return ret; 1796 } 1797 1798 static u64 find_next_chunk(struct btrfs_fs_info *fs_info) 1799 { 1800 struct extent_map_tree *em_tree; 1801 struct extent_map *em; 1802 struct rb_node *n; 1803 u64 ret = 0; 1804 1805 em_tree = &fs_info->mapping_tree; 1806 read_lock(&em_tree->lock); 1807 n = rb_last(&em_tree->map.rb_root); 1808 if (n) { 1809 em = rb_entry(n, struct extent_map, rb_node); 1810 ret = em->start + em->len; 1811 } 1812 read_unlock(&em_tree->lock); 1813 1814 return ret; 1815 } 1816 1817 static noinline int find_next_devid(struct btrfs_fs_info *fs_info, 1818 u64 *devid_ret) 1819 { 1820 int ret; 1821 struct btrfs_key key; 1822 struct btrfs_key found_key; 1823 struct btrfs_path *path; 1824 1825 path = btrfs_alloc_path(); 1826 if (!path) 1827 return -ENOMEM; 1828 1829 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1830 key.type = BTRFS_DEV_ITEM_KEY; 1831 key.offset = (u64)-1; 1832 1833 ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0); 1834 if (ret < 0) 1835 goto error; 1836 1837 if (ret == 0) { 1838 /* Corruption */ 1839 btrfs_err(fs_info, "corrupted chunk tree devid -1 
matched"); 1840 ret = -EUCLEAN; 1841 goto error; 1842 } 1843 1844 ret = btrfs_previous_item(fs_info->chunk_root, path, 1845 BTRFS_DEV_ITEMS_OBJECTID, 1846 BTRFS_DEV_ITEM_KEY); 1847 if (ret) { 1848 *devid_ret = 1; 1849 } else { 1850 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 1851 path->slots[0]); 1852 *devid_ret = found_key.offset + 1; 1853 } 1854 ret = 0; 1855 error: 1856 btrfs_free_path(path); 1857 return ret; 1858 } 1859 1860 /* 1861 * the device information is stored in the chunk root 1862 * the btrfs_device struct should be fully filled in 1863 */ 1864 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, 1865 struct btrfs_device *device) 1866 { 1867 int ret; 1868 struct btrfs_path *path; 1869 struct btrfs_dev_item *dev_item; 1870 struct extent_buffer *leaf; 1871 struct btrfs_key key; 1872 unsigned long ptr; 1873 1874 path = btrfs_alloc_path(); 1875 if (!path) 1876 return -ENOMEM; 1877 1878 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1879 key.type = BTRFS_DEV_ITEM_KEY; 1880 key.offset = device->devid; 1881 1882 btrfs_reserve_chunk_metadata(trans, true); 1883 ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, 1884 &key, sizeof(*dev_item)); 1885 btrfs_trans_release_chunk_metadata(trans); 1886 if (ret) 1887 goto out; 1888 1889 leaf = path->nodes[0]; 1890 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 1891 1892 btrfs_set_device_id(leaf, dev_item, device->devid); 1893 btrfs_set_device_generation(leaf, dev_item, 0); 1894 btrfs_set_device_type(leaf, dev_item, device->type); 1895 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1896 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1897 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1898 btrfs_set_device_total_bytes(leaf, dev_item, 1899 btrfs_device_get_disk_total_bytes(device)); 1900 btrfs_set_device_bytes_used(leaf, dev_item, 1901 btrfs_device_get_bytes_used(device)); 1902 btrfs_set_device_group(leaf, dev_item, 0); 1903 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1904 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1905 btrfs_set_device_start_offset(leaf, dev_item, 0); 1906 1907 ptr = btrfs_device_uuid(dev_item); 1908 write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 1909 ptr = btrfs_device_fsid(dev_item); 1910 write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, 1911 ptr, BTRFS_FSID_SIZE); 1912 btrfs_mark_buffer_dirty(leaf); 1913 1914 ret = 0; 1915 out: 1916 btrfs_free_path(path); 1917 return ret; 1918 } 1919 1920 /* 1921 * Function to update ctime/mtime for a given device path. 1922 * Mainly used for ctime/mtime based probe like libblkid. 1923 * 1924 * We don't care about errors here, this is just to be kind to userspace. 
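 *
 * Called e.g. after wiping the super blocks in btrfs_scratch_superblocks()
 * below so that userspace probes notice the device has changed.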
1925 */ 1926 static void update_dev_time(const char *device_path) 1927 { 1928 struct path path; 1929 struct timespec64 now; 1930 int ret; 1931 1932 ret = kern_path(device_path, LOOKUP_FOLLOW, &path); 1933 if (ret) 1934 return; 1935 1936 now = current_time(d_inode(path.dentry)); 1937 inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); 1938 path_put(&path); 1939 } 1940 1941 static int btrfs_rm_dev_item(struct btrfs_device *device) 1942 { 1943 struct btrfs_root *root = device->fs_info->chunk_root; 1944 int ret; 1945 struct btrfs_path *path; 1946 struct btrfs_key key; 1947 struct btrfs_trans_handle *trans; 1948 1949 path = btrfs_alloc_path(); 1950 if (!path) 1951 return -ENOMEM; 1952 1953 trans = btrfs_start_transaction(root, 0); 1954 if (IS_ERR(trans)) { 1955 btrfs_free_path(path); 1956 return PTR_ERR(trans); 1957 } 1958 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1959 key.type = BTRFS_DEV_ITEM_KEY; 1960 key.offset = device->devid; 1961 1962 btrfs_reserve_chunk_metadata(trans, false); 1963 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1964 btrfs_trans_release_chunk_metadata(trans); 1965 if (ret) { 1966 if (ret > 0) 1967 ret = -ENOENT; 1968 btrfs_abort_transaction(trans, ret); 1969 btrfs_end_transaction(trans); 1970 goto out; 1971 } 1972 1973 ret = btrfs_del_item(trans, root, path); 1974 if (ret) { 1975 btrfs_abort_transaction(trans, ret); 1976 btrfs_end_transaction(trans); 1977 } 1978 1979 out: 1980 btrfs_free_path(path); 1981 if (!ret) 1982 ret = btrfs_commit_transaction(trans); 1983 return ret; 1984 } 1985 1986 /* 1987 * Verify that @num_devices satisfies the RAID profile constraints in the whole 1988 * filesystem. It's up to the caller to adjust that number regarding eg. device 1989 * replace. 1990 */ 1991 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, 1992 u64 num_devices) 1993 { 1994 u64 all_avail; 1995 unsigned seq; 1996 int i; 1997 1998 do { 1999 seq = read_seqbegin(&fs_info->profiles_lock); 2000 2001 all_avail = fs_info->avail_data_alloc_bits | 2002 fs_info->avail_system_alloc_bits | 2003 fs_info->avail_metadata_alloc_bits; 2004 } while (read_seqretry(&fs_info->profiles_lock, seq)); 2005 2006 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 2007 if (!(all_avail & btrfs_raid_array[i].bg_flag)) 2008 continue; 2009 2010 if (num_devices < btrfs_raid_array[i].devs_min) 2011 return btrfs_raid_array[i].mindev_error; 2012 } 2013 2014 return 0; 2015 } 2016 2017 static struct btrfs_device * btrfs_find_next_active_device( 2018 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) 2019 { 2020 struct btrfs_device *next_device; 2021 2022 list_for_each_entry(next_device, &fs_devs->devices, dev_list) { 2023 if (next_device != device && 2024 !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state) 2025 && next_device->bdev) 2026 return next_device; 2027 } 2028 2029 return NULL; 2030 } 2031 2032 /* 2033 * Helper function to check if the given device is part of s_bdev / latest_dev 2034 * and replace it with the provided or the next active device, in the context 2035 * where this function called, there should be always be another device (or 2036 * this_dev) which is active. 
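 *
 * Typical call, as device removal below uses it (passing NULL for
 * @next_device makes the helper search for the next active device
 * itself):
 *
 *      btrfs_assign_next_active_device(device, NULL);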
2037 */ 2038 void __cold btrfs_assign_next_active_device(struct btrfs_device *device, 2039 struct btrfs_device *next_device) 2040 { 2041 struct btrfs_fs_info *fs_info = device->fs_info; 2042 2043 if (!next_device) 2044 next_device = btrfs_find_next_active_device(fs_info->fs_devices, 2045 device); 2046 ASSERT(next_device); 2047 2048 if (fs_info->sb->s_bdev && 2049 (fs_info->sb->s_bdev == device->bdev)) 2050 fs_info->sb->s_bdev = next_device->bdev; 2051 2052 if (fs_info->fs_devices->latest_dev->bdev == device->bdev) 2053 fs_info->fs_devices->latest_dev = next_device; 2054 } 2055 2056 /* 2057 * Return btrfs_fs_devices::num_devices excluding the device that's being 2058 * currently replaced. 2059 */ 2060 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) 2061 { 2062 u64 num_devices = fs_info->fs_devices->num_devices; 2063 2064 down_read(&fs_info->dev_replace.rwsem); 2065 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { 2066 ASSERT(num_devices > 1); 2067 num_devices--; 2068 } 2069 up_read(&fs_info->dev_replace.rwsem); 2070 2071 return num_devices; 2072 } 2073 2074 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, 2075 struct block_device *bdev, 2076 const char *device_path) 2077 { 2078 struct btrfs_super_block *disk_super; 2079 int copy_num; 2080 2081 if (!bdev) 2082 return; 2083 2084 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) { 2085 struct page *page; 2086 int ret; 2087 2088 disk_super = btrfs_read_dev_one_super(bdev, copy_num); 2089 if (IS_ERR(disk_super)) 2090 continue; 2091 2092 if (bdev_is_zoned(bdev)) { 2093 btrfs_reset_sb_log_zones(bdev, copy_num); 2094 continue; 2095 } 2096 2097 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 2098 2099 page = virt_to_page(disk_super); 2100 set_page_dirty(page); 2101 lock_page(page); 2102 /* write_on_page() unlocks the page */ 2103 ret = write_one_page(page); 2104 if (ret) 2105 btrfs_warn(fs_info, 2106 "error clearing superblock number %d (%d)", 2107 copy_num, ret); 2108 btrfs_release_disk_super(disk_super); 2109 2110 } 2111 2112 /* Notify udev that device has changed */ 2113 btrfs_kobject_uevent(bdev, KOBJ_CHANGE); 2114 2115 /* Update ctime/mtime for device path for libblkid */ 2116 update_dev_time(device_path); 2117 } 2118 2119 int btrfs_rm_device(struct btrfs_fs_info *fs_info, 2120 struct btrfs_dev_lookup_args *args, 2121 struct block_device **bdev, fmode_t *mode) 2122 { 2123 struct btrfs_device *device; 2124 struct btrfs_fs_devices *cur_devices; 2125 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2126 u64 num_devices; 2127 int ret = 0; 2128 2129 /* 2130 * The device list in fs_devices is accessed without locks (neither 2131 * uuid_mutex nor device_list_mutex) as it won't change on a mounted 2132 * filesystem and another device rm cannot run. 
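         *
         * Note: btrfs_num_devices() below already excludes an in-flight
         * replace target, and the "- 1" in the RAID minimum check accounts
         * for the device that is about to be removed.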
2133 */ 2134 num_devices = btrfs_num_devices(fs_info); 2135 2136 ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); 2137 if (ret) 2138 goto out; 2139 2140 device = btrfs_find_device(fs_info->fs_devices, args); 2141 if (!device) { 2142 if (args->missing) 2143 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; 2144 else 2145 ret = -ENOENT; 2146 goto out; 2147 } 2148 2149 if (btrfs_pinned_by_swapfile(fs_info, device)) { 2150 btrfs_warn_in_rcu(fs_info, 2151 "cannot remove device %s (devid %llu) due to active swapfile", 2152 rcu_str_deref(device->name), device->devid); 2153 ret = -ETXTBSY; 2154 goto out; 2155 } 2156 2157 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2158 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 2159 goto out; 2160 } 2161 2162 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && 2163 fs_info->fs_devices->rw_devices == 1) { 2164 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 2165 goto out; 2166 } 2167 2168 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 2169 mutex_lock(&fs_info->chunk_mutex); 2170 list_del_init(&device->dev_alloc_list); 2171 device->fs_devices->rw_devices--; 2172 mutex_unlock(&fs_info->chunk_mutex); 2173 } 2174 2175 ret = btrfs_shrink_device(device, 0); 2176 if (ret) 2177 goto error_undo; 2178 2179 /* 2180 * TODO: the superblock still includes this device in its num_devices 2181 * counter although write_all_supers() is not locked out. This 2182 * could give a filesystem state which requires a degraded mount. 2183 */ 2184 ret = btrfs_rm_dev_item(device); 2185 if (ret) 2186 goto error_undo; 2187 2188 clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2189 btrfs_scrub_cancel_dev(device); 2190 2191 /* 2192 * the device list mutex makes sure that we don't change 2193 * the device list while someone else is writing out all 2194 * the device supers. Whoever is writing all supers, should 2195 * lock the device list mutex before getting the number of 2196 * devices in the super block (super_copy). Conversely, 2197 * whoever updates the number of devices in the super block 2198 * (super_copy) should hold the device list mutex. 2199 */ 2200 2201 /* 2202 * In normal cases the cur_devices == fs_devices. But in case 2203 * of deleting a seed device, the cur_devices should point to 2204 * its own fs_devices listed under the fs_devices->seed_list. 2205 */ 2206 cur_devices = device->fs_devices; 2207 mutex_lock(&fs_devices->device_list_mutex); 2208 list_del_rcu(&device->dev_list); 2209 2210 cur_devices->num_devices--; 2211 cur_devices->total_devices--; 2212 /* Update total_devices of the parent fs_devices if it's seed */ 2213 if (cur_devices != fs_devices) 2214 fs_devices->total_devices--; 2215 2216 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 2217 cur_devices->missing_devices--; 2218 2219 btrfs_assign_next_active_device(device, NULL); 2220 2221 if (device->bdev) { 2222 cur_devices->open_devices--; 2223 /* remove sysfs entry */ 2224 btrfs_sysfs_remove_device(device); 2225 } 2226 2227 num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; 2228 btrfs_set_super_num_devices(fs_info->super_copy, num_devices); 2229 mutex_unlock(&fs_devices->device_list_mutex); 2230 2231 /* 2232 * At this point, the device is zero sized and detached from the 2233 * devices list. All that's left is to zero out the old supers and 2234 * free the device. 
2235  *
2236  * We cannot call btrfs_close_bdev() here because we're holding the sb
2237  * write lock, and blkdev_put() will pull in the ->open_mutex on the
2238  * block device and its dependencies. Instead, just flush the device
2239  * and let the caller do the final blkdev_put.
2240  */
2241         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2242                 btrfs_scratch_superblocks(fs_info, device->bdev,
2243                                           device->name->str);
2244                 if (device->bdev) {
2245                         sync_blockdev(device->bdev);
2246                         invalidate_bdev(device->bdev);
2247                 }
2248         }
2249 
2250         *bdev = device->bdev;
2251         *mode = device->mode;
2252         synchronize_rcu();
2253         btrfs_free_device(device);
2254 
2255         /*
2256          * This can happen if cur_devices is the private seed devices list. We
2257          * cannot call close_fs_devices() here because it expects the uuid_mutex
2258          * to be held, but in fact we don't need that for the private
2259          * seed_devices, we can simply decrement cur_devices->opened and then
2260          * remove it from our list and free the fs_devices.
2261          */
2262         if (cur_devices->num_devices == 0) {
2263                 list_del_init(&cur_devices->seed_list);
2264                 ASSERT(cur_devices->opened == 1);
2265                 cur_devices->opened--;
2266                 free_fs_devices(cur_devices);
2267         }
2268 
2269 out:
2270         return ret;
2271 
2272 error_undo:
2273         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2274                 mutex_lock(&fs_info->chunk_mutex);
2275                 list_add(&device->dev_alloc_list,
2276                          &fs_devices->alloc_list);
2277                 device->fs_devices->rw_devices++;
2278                 mutex_unlock(&fs_info->chunk_mutex);
2279         }
2280         goto out;
2281 }
2282 
2283 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2284 {
2285         struct btrfs_fs_devices *fs_devices;
2286 
2287         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2288 
2289         /*
2290          * In case of an fs with no seed, srcdev->fs_devices will point
2291          * to the fs_devices of fs_info. However, when the dev being replaced is
2292          * a seed dev it will point to the seed's local fs_devices. In short,
2293          * srcdev will have its correct fs_devices in both cases.
2294          */
2295         fs_devices = srcdev->fs_devices;
2296 
2297         list_del_rcu(&srcdev->dev_list);
2298         list_del(&srcdev->dev_alloc_list);
2299         fs_devices->num_devices--;
2300         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2301                 fs_devices->missing_devices--;
2302 
2303         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2304                 fs_devices->rw_devices--;
2305 
2306         if (srcdev->bdev)
2307                 fs_devices->open_devices--;
2308 }
2309 
2310 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2311 {
2312         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2313 
2314         mutex_lock(&uuid_mutex);
2315 
2316         btrfs_close_bdev(srcdev);
2317         synchronize_rcu();
2318         btrfs_free_device(srcdev);
2319 
2320         /* If there are no devices left, delete the fs_devices. */
2321         if (!fs_devices->num_devices) {
2322                 /*
2323                  * On a mounted FS, num_devices can't be zero unless it's a
2324                  * seed. In case of a seed device being replaced, the replace
2325                  * target is added to the sprout FS, so there will be no
2326                  * device left under the seed FS.
2327 */ 2328 ASSERT(fs_devices->seeding); 2329 2330 list_del_init(&fs_devices->seed_list); 2331 close_fs_devices(fs_devices); 2332 free_fs_devices(fs_devices); 2333 } 2334 mutex_unlock(&uuid_mutex); 2335 } 2336 2337 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) 2338 { 2339 struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; 2340 2341 mutex_lock(&fs_devices->device_list_mutex); 2342 2343 btrfs_sysfs_remove_device(tgtdev); 2344 2345 if (tgtdev->bdev) 2346 fs_devices->open_devices--; 2347 2348 fs_devices->num_devices--; 2349 2350 btrfs_assign_next_active_device(tgtdev, NULL); 2351 2352 list_del_rcu(&tgtdev->dev_list); 2353 2354 mutex_unlock(&fs_devices->device_list_mutex); 2355 2356 btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, 2357 tgtdev->name->str); 2358 2359 btrfs_close_bdev(tgtdev); 2360 synchronize_rcu(); 2361 btrfs_free_device(tgtdev); 2362 } 2363 2364 /** 2365 * Populate args from device at path 2366 * 2367 * @fs_info: the filesystem 2368 * @args: the args to populate 2369 * @path: the path to the device 2370 * 2371 * This will read the super block of the device at @path and populate @args with 2372 * the devid, fsid, and uuid. This is meant to be used for ioctls that need to 2373 * lookup a device to operate on, but need to do it before we take any locks. 2374 * This properly handles the special case of "missing" that a user may pass in, 2375 * and does some basic sanity checks. The caller must make sure that @path is 2376 * properly NUL terminated before calling in, and must call 2377 * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and 2378 * uuid buffers. 2379 * 2380 * Return: 0 for success, -errno for failure 2381 */ 2382 int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, 2383 struct btrfs_dev_lookup_args *args, 2384 const char *path) 2385 { 2386 struct btrfs_super_block *disk_super; 2387 struct block_device *bdev; 2388 int ret; 2389 2390 if (!path || !path[0]) 2391 return -EINVAL; 2392 if (!strcmp(path, "missing")) { 2393 args->missing = true; 2394 return 0; 2395 } 2396 2397 args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); 2398 args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); 2399 if (!args->uuid || !args->fsid) { 2400 btrfs_put_dev_args_from_path(args); 2401 return -ENOMEM; 2402 } 2403 2404 ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, 2405 &bdev, &disk_super); 2406 if (ret) 2407 return ret; 2408 args->devid = btrfs_stack_device_id(&disk_super->dev_item); 2409 memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); 2410 if (btrfs_fs_incompat(fs_info, METADATA_UUID)) 2411 memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); 2412 else 2413 memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); 2414 btrfs_release_disk_super(disk_super); 2415 blkdev_put(bdev, FMODE_READ); 2416 return 0; 2417 } 2418 2419 /* 2420 * Only use this jointly with btrfs_get_dev_args_from_path() because we will 2421 * allocate our ->uuid and ->fsid pointers, everybody else uses local variables 2422 * that don't need to be freed. 
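 *
 * Illustrative pairing (a sketch mirroring btrfs_find_device_by_devspec()
 * below, with error handling abbreviated):
 *
 *      BTRFS_DEV_LOOKUP_ARGS(args);
 *
 *      ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
 *      if (ret)
 *              return ERR_PTR(ret);
 *      device = btrfs_find_device(fs_info->fs_devices, &args);
 *      btrfs_put_dev_args_from_path(&args);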
2423 */ 2424 void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) 2425 { 2426 kfree(args->uuid); 2427 kfree(args->fsid); 2428 args->uuid = NULL; 2429 args->fsid = NULL; 2430 } 2431 2432 struct btrfs_device *btrfs_find_device_by_devspec( 2433 struct btrfs_fs_info *fs_info, u64 devid, 2434 const char *device_path) 2435 { 2436 BTRFS_DEV_LOOKUP_ARGS(args); 2437 struct btrfs_device *device; 2438 int ret; 2439 2440 if (devid) { 2441 args.devid = devid; 2442 device = btrfs_find_device(fs_info->fs_devices, &args); 2443 if (!device) 2444 return ERR_PTR(-ENOENT); 2445 return device; 2446 } 2447 2448 ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); 2449 if (ret) 2450 return ERR_PTR(ret); 2451 device = btrfs_find_device(fs_info->fs_devices, &args); 2452 btrfs_put_dev_args_from_path(&args); 2453 if (!device) 2454 return ERR_PTR(-ENOENT); 2455 return device; 2456 } 2457 2458 static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) 2459 { 2460 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2461 struct btrfs_fs_devices *old_devices; 2462 struct btrfs_fs_devices *seed_devices; 2463 2464 lockdep_assert_held(&uuid_mutex); 2465 if (!fs_devices->seeding) 2466 return ERR_PTR(-EINVAL); 2467 2468 /* 2469 * Private copy of the seed devices, anchored at 2470 * fs_info->fs_devices->seed_list 2471 */ 2472 seed_devices = alloc_fs_devices(NULL, NULL); 2473 if (IS_ERR(seed_devices)) 2474 return seed_devices; 2475 2476 /* 2477 * It's necessary to retain a copy of the original seed fs_devices in 2478 * fs_uuids so that filesystems which have been seeded can successfully 2479 * reference the seed device from open_seed_devices. This also supports 2480 * multiple fs seed. 2481 */ 2482 old_devices = clone_fs_devices(fs_devices); 2483 if (IS_ERR(old_devices)) { 2484 kfree(seed_devices); 2485 return old_devices; 2486 } 2487 2488 list_add(&old_devices->fs_list, &fs_uuids); 2489 2490 memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); 2491 seed_devices->opened = 1; 2492 INIT_LIST_HEAD(&seed_devices->devices); 2493 INIT_LIST_HEAD(&seed_devices->alloc_list); 2494 mutex_init(&seed_devices->device_list_mutex); 2495 2496 return seed_devices; 2497 } 2498 2499 /* 2500 * Splice seed devices into the sprout fs_devices. 2501 * Generate a new fsid for the sprouted read-write filesystem. 2502 */ 2503 static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info, 2504 struct btrfs_fs_devices *seed_devices) 2505 { 2506 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2507 struct btrfs_super_block *disk_super = fs_info->super_copy; 2508 struct btrfs_device *device; 2509 u64 super_flags; 2510 2511 /* 2512 * We are updating the fsid, the thread leading to device_list_add() 2513 * could race, so uuid_mutex is needed. 2514 */ 2515 lockdep_assert_held(&uuid_mutex); 2516 2517 /* 2518 * The threads listed below may traverse dev_list but can do that without 2519 * device_list_mutex: 2520 * - All device ops and balance - as we are in btrfs_exclop_start. 2521 * - Various dev_list readers - are using RCU. 2522 * - btrfs_ioctl_fitrim() - is using RCU. 
2523 * 2524 * For-read threads as below are using device_list_mutex: 2525 * - Readonly scrub btrfs_scrub_dev() 2526 * - Readonly scrub btrfs_scrub_progress() 2527 * - btrfs_get_dev_stats() 2528 */ 2529 lockdep_assert_held(&fs_devices->device_list_mutex); 2530 2531 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 2532 synchronize_rcu); 2533 list_for_each_entry(device, &seed_devices->devices, dev_list) 2534 device->fs_devices = seed_devices; 2535 2536 fs_devices->seeding = false; 2537 fs_devices->num_devices = 0; 2538 fs_devices->open_devices = 0; 2539 fs_devices->missing_devices = 0; 2540 fs_devices->rotating = false; 2541 list_add(&seed_devices->seed_list, &fs_devices->seed_list); 2542 2543 generate_random_uuid(fs_devices->fsid); 2544 memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); 2545 memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); 2546 2547 super_flags = btrfs_super_flags(disk_super) & 2548 ~BTRFS_SUPER_FLAG_SEEDING; 2549 btrfs_set_super_flags(disk_super, super_flags); 2550 } 2551 2552 /* 2553 * Store the expected generation for seed devices in device items. 2554 */ 2555 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) 2556 { 2557 BTRFS_DEV_LOOKUP_ARGS(args); 2558 struct btrfs_fs_info *fs_info = trans->fs_info; 2559 struct btrfs_root *root = fs_info->chunk_root; 2560 struct btrfs_path *path; 2561 struct extent_buffer *leaf; 2562 struct btrfs_dev_item *dev_item; 2563 struct btrfs_device *device; 2564 struct btrfs_key key; 2565 u8 fs_uuid[BTRFS_FSID_SIZE]; 2566 u8 dev_uuid[BTRFS_UUID_SIZE]; 2567 int ret; 2568 2569 path = btrfs_alloc_path(); 2570 if (!path) 2571 return -ENOMEM; 2572 2573 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2574 key.offset = 0; 2575 key.type = BTRFS_DEV_ITEM_KEY; 2576 2577 while (1) { 2578 btrfs_reserve_chunk_metadata(trans, false); 2579 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2580 btrfs_trans_release_chunk_metadata(trans); 2581 if (ret < 0) 2582 goto error; 2583 2584 leaf = path->nodes[0]; 2585 next_slot: 2586 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 2587 ret = btrfs_next_leaf(root, path); 2588 if (ret > 0) 2589 break; 2590 if (ret < 0) 2591 goto error; 2592 leaf = path->nodes[0]; 2593 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2594 btrfs_release_path(path); 2595 continue; 2596 } 2597 2598 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 2599 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || 2600 key.type != BTRFS_DEV_ITEM_KEY) 2601 break; 2602 2603 dev_item = btrfs_item_ptr(leaf, path->slots[0], 2604 struct btrfs_dev_item); 2605 args.devid = btrfs_device_id(leaf, dev_item); 2606 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 2607 BTRFS_UUID_SIZE); 2608 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 2609 BTRFS_FSID_SIZE); 2610 args.uuid = dev_uuid; 2611 args.fsid = fs_uuid; 2612 device = btrfs_find_device(fs_info->fs_devices, &args); 2613 BUG_ON(!device); /* Logic error */ 2614 2615 if (device->fs_devices->seeding) { 2616 btrfs_set_device_generation(leaf, dev_item, 2617 device->generation); 2618 btrfs_mark_buffer_dirty(leaf); 2619 } 2620 2621 path->slots[0]++; 2622 goto next_slot; 2623 } 2624 ret = 0; 2625 error: 2626 btrfs_free_path(path); 2627 return ret; 2628 } 2629 2630 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path) 2631 { 2632 struct btrfs_root *root = fs_info->dev_root; 2633 struct request_queue *q; 2634 struct btrfs_trans_handle *trans; 2635 struct btrfs_device *device; 2636 struct 
block_device *bdev; 2637 struct super_block *sb = fs_info->sb; 2638 struct rcu_string *name; 2639 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2640 struct btrfs_fs_devices *seed_devices; 2641 u64 orig_super_total_bytes; 2642 u64 orig_super_num_devices; 2643 int ret = 0; 2644 bool seeding_dev = false; 2645 bool locked = false; 2646 2647 if (sb_rdonly(sb) && !fs_devices->seeding) 2648 return -EROFS; 2649 2650 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2651 fs_info->bdev_holder); 2652 if (IS_ERR(bdev)) 2653 return PTR_ERR(bdev); 2654 2655 if (!btrfs_check_device_zone_type(fs_info, bdev)) { 2656 ret = -EINVAL; 2657 goto error; 2658 } 2659 2660 if (fs_devices->seeding) { 2661 seeding_dev = true; 2662 down_write(&sb->s_umount); 2663 mutex_lock(&uuid_mutex); 2664 locked = true; 2665 } 2666 2667 sync_blockdev(bdev); 2668 2669 rcu_read_lock(); 2670 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 2671 if (device->bdev == bdev) { 2672 ret = -EEXIST; 2673 rcu_read_unlock(); 2674 goto error; 2675 } 2676 } 2677 rcu_read_unlock(); 2678 2679 device = btrfs_alloc_device(fs_info, NULL, NULL); 2680 if (IS_ERR(device)) { 2681 /* we can safely leave the fs_devices entry around */ 2682 ret = PTR_ERR(device); 2683 goto error; 2684 } 2685 2686 name = rcu_string_strdup(device_path, GFP_KERNEL); 2687 if (!name) { 2688 ret = -ENOMEM; 2689 goto error_free_device; 2690 } 2691 rcu_assign_pointer(device->name, name); 2692 2693 device->fs_info = fs_info; 2694 device->bdev = bdev; 2695 ret = lookup_bdev(device_path, &device->devt); 2696 if (ret) 2697 goto error_free_device; 2698 2699 ret = btrfs_get_dev_zone_info(device, false); 2700 if (ret) 2701 goto error_free_device; 2702 2703 trans = btrfs_start_transaction(root, 0); 2704 if (IS_ERR(trans)) { 2705 ret = PTR_ERR(trans); 2706 goto error_free_zone; 2707 } 2708 2709 q = bdev_get_queue(bdev); 2710 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); 2711 device->generation = trans->transid; 2712 device->io_width = fs_info->sectorsize; 2713 device->io_align = fs_info->sectorsize; 2714 device->sector_size = fs_info->sectorsize; 2715 device->total_bytes = 2716 round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); 2717 device->disk_total_bytes = device->total_bytes; 2718 device->commit_total_bytes = device->total_bytes; 2719 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); 2720 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 2721 device->mode = FMODE_EXCL; 2722 device->dev_stats_valid = 1; 2723 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); 2724 2725 if (seeding_dev) { 2726 btrfs_clear_sb_rdonly(sb); 2727 2728 /* GFP_KERNEL allocation must not be under device_list_mutex */ 2729 seed_devices = btrfs_init_sprout(fs_info); 2730 if (IS_ERR(seed_devices)) { 2731 ret = PTR_ERR(seed_devices); 2732 btrfs_abort_transaction(trans, ret); 2733 goto error_trans; 2734 } 2735 } 2736 2737 mutex_lock(&fs_devices->device_list_mutex); 2738 if (seeding_dev) { 2739 btrfs_setup_sprout(fs_info, seed_devices); 2740 btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, 2741 device); 2742 } 2743 2744 device->fs_devices = fs_devices; 2745 2746 mutex_lock(&fs_info->chunk_mutex); 2747 list_add_rcu(&device->dev_list, &fs_devices->devices); 2748 list_add(&device->dev_alloc_list, &fs_devices->alloc_list); 2749 fs_devices->num_devices++; 2750 fs_devices->open_devices++; 2751 fs_devices->rw_devices++; 2752 fs_devices->total_devices++; 2753 fs_devices->total_rw_bytes += device->total_bytes; 2754 2755 
atomic64_add(device->total_bytes, &fs_info->free_chunk_space); 2756 2757 if (!blk_queue_nonrot(q)) 2758 fs_devices->rotating = true; 2759 2760 orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); 2761 btrfs_set_super_total_bytes(fs_info->super_copy, 2762 round_down(orig_super_total_bytes + device->total_bytes, 2763 fs_info->sectorsize)); 2764 2765 orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy); 2766 btrfs_set_super_num_devices(fs_info->super_copy, 2767 orig_super_num_devices + 1); 2768 2769 /* 2770 * we've got more storage, clear any full flags on the space 2771 * infos 2772 */ 2773 btrfs_clear_space_info_full(fs_info); 2774 2775 mutex_unlock(&fs_info->chunk_mutex); 2776 2777 /* Add sysfs device entry */ 2778 btrfs_sysfs_add_device(device); 2779 2780 mutex_unlock(&fs_devices->device_list_mutex); 2781 2782 if (seeding_dev) { 2783 mutex_lock(&fs_info->chunk_mutex); 2784 ret = init_first_rw_device(trans); 2785 mutex_unlock(&fs_info->chunk_mutex); 2786 if (ret) { 2787 btrfs_abort_transaction(trans, ret); 2788 goto error_sysfs; 2789 } 2790 } 2791 2792 ret = btrfs_add_dev_item(trans, device); 2793 if (ret) { 2794 btrfs_abort_transaction(trans, ret); 2795 goto error_sysfs; 2796 } 2797 2798 if (seeding_dev) { 2799 ret = btrfs_finish_sprout(trans); 2800 if (ret) { 2801 btrfs_abort_transaction(trans, ret); 2802 goto error_sysfs; 2803 } 2804 2805 /* 2806 * fs_devices now represents the newly sprouted filesystem and 2807 * its fsid has been changed by btrfs_sprout_splice(). 2808 */ 2809 btrfs_sysfs_update_sprout_fsid(fs_devices); 2810 } 2811 2812 ret = btrfs_commit_transaction(trans); 2813 2814 if (seeding_dev) { 2815 mutex_unlock(&uuid_mutex); 2816 up_write(&sb->s_umount); 2817 locked = false; 2818 2819 if (ret) /* transaction commit */ 2820 return ret; 2821 2822 ret = btrfs_relocate_sys_chunks(fs_info); 2823 if (ret < 0) 2824 btrfs_handle_fs_error(fs_info, ret, 2825 "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); 2826 trans = btrfs_attach_transaction(root); 2827 if (IS_ERR(trans)) { 2828 if (PTR_ERR(trans) == -ENOENT) 2829 return 0; 2830 ret = PTR_ERR(trans); 2831 trans = NULL; 2832 goto error_sysfs; 2833 } 2834 ret = btrfs_commit_transaction(trans); 2835 } 2836 2837 /* 2838 * Now that we have written a new super block to this device, check all 2839 * other fs_devices list if device_path alienates any other scanned 2840 * device. 2841 * We can ignore the return value as it typically returns -EINVAL and 2842 * only succeeds if the device was an alien. 
2843 */ 2844 btrfs_forget_devices(device->devt); 2845 2846 /* Update ctime/mtime for blkid or udev */ 2847 update_dev_time(device_path); 2848 2849 return ret; 2850 2851 error_sysfs: 2852 btrfs_sysfs_remove_device(device); 2853 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2854 mutex_lock(&fs_info->chunk_mutex); 2855 list_del_rcu(&device->dev_list); 2856 list_del(&device->dev_alloc_list); 2857 fs_info->fs_devices->num_devices--; 2858 fs_info->fs_devices->open_devices--; 2859 fs_info->fs_devices->rw_devices--; 2860 fs_info->fs_devices->total_devices--; 2861 fs_info->fs_devices->total_rw_bytes -= device->total_bytes; 2862 atomic64_sub(device->total_bytes, &fs_info->free_chunk_space); 2863 btrfs_set_super_total_bytes(fs_info->super_copy, 2864 orig_super_total_bytes); 2865 btrfs_set_super_num_devices(fs_info->super_copy, 2866 orig_super_num_devices); 2867 mutex_unlock(&fs_info->chunk_mutex); 2868 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2869 error_trans: 2870 if (seeding_dev) 2871 btrfs_set_sb_rdonly(sb); 2872 if (trans) 2873 btrfs_end_transaction(trans); 2874 error_free_zone: 2875 btrfs_destroy_dev_zone_info(device); 2876 error_free_device: 2877 btrfs_free_device(device); 2878 error: 2879 blkdev_put(bdev, FMODE_EXCL); 2880 if (locked) { 2881 mutex_unlock(&uuid_mutex); 2882 up_write(&sb->s_umount); 2883 } 2884 return ret; 2885 } 2886 2887 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 2888 struct btrfs_device *device) 2889 { 2890 int ret; 2891 struct btrfs_path *path; 2892 struct btrfs_root *root = device->fs_info->chunk_root; 2893 struct btrfs_dev_item *dev_item; 2894 struct extent_buffer *leaf; 2895 struct btrfs_key key; 2896 2897 path = btrfs_alloc_path(); 2898 if (!path) 2899 return -ENOMEM; 2900 2901 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 2902 key.type = BTRFS_DEV_ITEM_KEY; 2903 key.offset = device->devid; 2904 2905 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2906 if (ret < 0) 2907 goto out; 2908 2909 if (ret > 0) { 2910 ret = -ENOENT; 2911 goto out; 2912 } 2913 2914 leaf = path->nodes[0]; 2915 dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); 2916 2917 btrfs_set_device_id(leaf, dev_item, device->devid); 2918 btrfs_set_device_type(leaf, dev_item, device->type); 2919 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2920 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2921 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2922 btrfs_set_device_total_bytes(leaf, dev_item, 2923 btrfs_device_get_disk_total_bytes(device)); 2924 btrfs_set_device_bytes_used(leaf, dev_item, 2925 btrfs_device_get_bytes_used(device)); 2926 btrfs_mark_buffer_dirty(leaf); 2927 2928 out: 2929 btrfs_free_path(path); 2930 return ret; 2931 } 2932 2933 int btrfs_grow_device(struct btrfs_trans_handle *trans, 2934 struct btrfs_device *device, u64 new_size) 2935 { 2936 struct btrfs_fs_info *fs_info = device->fs_info; 2937 struct btrfs_super_block *super_copy = fs_info->super_copy; 2938 u64 old_total; 2939 u64 diff; 2940 int ret; 2941 2942 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 2943 return -EACCES; 2944 2945 new_size = round_down(new_size, fs_info->sectorsize); 2946 2947 mutex_lock(&fs_info->chunk_mutex); 2948 old_total = btrfs_super_total_bytes(super_copy); 2949 diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); 2950 2951 if (new_size <= device->total_bytes || 2952 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 2953 
mutex_unlock(&fs_info->chunk_mutex); 2954 return -EINVAL; 2955 } 2956 2957 btrfs_set_super_total_bytes(super_copy, 2958 round_down(old_total + diff, fs_info->sectorsize)); 2959 device->fs_devices->total_rw_bytes += diff; 2960 2961 btrfs_device_set_total_bytes(device, new_size); 2962 btrfs_device_set_disk_total_bytes(device, new_size); 2963 btrfs_clear_space_info_full(device->fs_info); 2964 if (list_empty(&device->post_commit_list)) 2965 list_add_tail(&device->post_commit_list, 2966 &trans->transaction->dev_update_list); 2967 mutex_unlock(&fs_info->chunk_mutex); 2968 2969 btrfs_reserve_chunk_metadata(trans, false); 2970 ret = btrfs_update_device(trans, device); 2971 btrfs_trans_release_chunk_metadata(trans); 2972 2973 return ret; 2974 } 2975 2976 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 2977 { 2978 struct btrfs_fs_info *fs_info = trans->fs_info; 2979 struct btrfs_root *root = fs_info->chunk_root; 2980 int ret; 2981 struct btrfs_path *path; 2982 struct btrfs_key key; 2983 2984 path = btrfs_alloc_path(); 2985 if (!path) 2986 return -ENOMEM; 2987 2988 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2989 key.offset = chunk_offset; 2990 key.type = BTRFS_CHUNK_ITEM_KEY; 2991 2992 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 2993 if (ret < 0) 2994 goto out; 2995 else if (ret > 0) { /* Logic error or corruption */ 2996 btrfs_handle_fs_error(fs_info, -ENOENT, 2997 "Failed lookup while freeing chunk."); 2998 ret = -ENOENT; 2999 goto out; 3000 } 3001 3002 ret = btrfs_del_item(trans, root, path); 3003 if (ret < 0) 3004 btrfs_handle_fs_error(fs_info, ret, 3005 "Failed to delete chunk item."); 3006 out: 3007 btrfs_free_path(path); 3008 return ret; 3009 } 3010 3011 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3012 { 3013 struct btrfs_super_block *super_copy = fs_info->super_copy; 3014 struct btrfs_disk_key *disk_key; 3015 struct btrfs_chunk *chunk; 3016 u8 *ptr; 3017 int ret = 0; 3018 u32 num_stripes; 3019 u32 array_size; 3020 u32 len = 0; 3021 u32 cur; 3022 struct btrfs_key key; 3023 3024 lockdep_assert_held(&fs_info->chunk_mutex); 3025 array_size = btrfs_super_sys_array_size(super_copy); 3026 3027 ptr = super_copy->sys_chunk_array; 3028 cur = 0; 3029 3030 while (cur < array_size) { 3031 disk_key = (struct btrfs_disk_key *)ptr; 3032 btrfs_disk_key_to_cpu(&key, disk_key); 3033 3034 len = sizeof(*disk_key); 3035 3036 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 3037 chunk = (struct btrfs_chunk *)(ptr + len); 3038 num_stripes = btrfs_stack_chunk_num_stripes(chunk); 3039 len += btrfs_chunk_item_size(num_stripes); 3040 } else { 3041 ret = -EIO; 3042 break; 3043 } 3044 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && 3045 key.offset == chunk_offset) { 3046 memmove(ptr, ptr + len, array_size - (cur + len)); 3047 array_size -= len; 3048 btrfs_set_super_sys_array_size(super_copy, array_size); 3049 } else { 3050 ptr += len; 3051 cur += len; 3052 } 3053 } 3054 return ret; 3055 } 3056 3057 /* 3058 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. 3059 * @logical: Logical block offset in bytes. 3060 * @length: Length of extent in bytes. 3061 * 3062 * Return: Chunk mapping or ERR_PTR. 
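 *
 * Illustrative caller (a sketch following btrfs_remove_chunk() further
 * below; the returned reference must be dropped with free_extent_map()):
 *
 *      em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 *      if (IS_ERR(em))
 *              return PTR_ERR(em);
 *      map = em->map_lookup;
 *      ...
 *      free_extent_map(em);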
3063 */ 3064 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, 3065 u64 logical, u64 length) 3066 { 3067 struct extent_map_tree *em_tree; 3068 struct extent_map *em; 3069 3070 em_tree = &fs_info->mapping_tree; 3071 read_lock(&em_tree->lock); 3072 em = lookup_extent_mapping(em_tree, logical, length); 3073 read_unlock(&em_tree->lock); 3074 3075 if (!em) { 3076 btrfs_crit(fs_info, "unable to find logical %llu length %llu", 3077 logical, length); 3078 return ERR_PTR(-EINVAL); 3079 } 3080 3081 if (em->start > logical || em->start + em->len < logical) { 3082 btrfs_crit(fs_info, 3083 "found a bad mapping, wanted %llu-%llu, found %llu-%llu", 3084 logical, length, em->start, em->start + em->len); 3085 free_extent_map(em); 3086 return ERR_PTR(-EINVAL); 3087 } 3088 3089 /* callers are responsible for dropping em's ref. */ 3090 return em; 3091 } 3092 3093 static int remove_chunk_item(struct btrfs_trans_handle *trans, 3094 struct map_lookup *map, u64 chunk_offset) 3095 { 3096 int i; 3097 3098 /* 3099 * Removing chunk items and updating the device items in the chunks btree 3100 * requires holding the chunk_mutex. 3101 * See the comment at btrfs_chunk_alloc() for the details. 3102 */ 3103 lockdep_assert_held(&trans->fs_info->chunk_mutex); 3104 3105 for (i = 0; i < map->num_stripes; i++) { 3106 int ret; 3107 3108 ret = btrfs_update_device(trans, map->stripes[i].dev); 3109 if (ret) 3110 return ret; 3111 } 3112 3113 return btrfs_free_chunk(trans, chunk_offset); 3114 } 3115 3116 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 3117 { 3118 struct btrfs_fs_info *fs_info = trans->fs_info; 3119 struct extent_map *em; 3120 struct map_lookup *map; 3121 u64 dev_extent_len = 0; 3122 int i, ret = 0; 3123 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 3124 3125 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 3126 if (IS_ERR(em)) { 3127 /* 3128 * This is a logic error, but we don't want to just rely on the 3129 * user having built with ASSERT enabled, so if ASSERT doesn't 3130 * do anything we still error out. 3131 */ 3132 ASSERT(0); 3133 return PTR_ERR(em); 3134 } 3135 map = em->map_lookup; 3136 3137 /* 3138 * First delete the device extent items from the devices btree. 3139 * We take the device_list_mutex to avoid racing with the finishing phase 3140 * of a device replace operation. See the comment below before acquiring 3141 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex 3142 * because that can result in a deadlock when deleting the device extent 3143 * items from the devices btree - COWing an extent buffer from the btree 3144 * may result in allocating a new metadata chunk, which would attempt to 3145 * lock again fs_info->chunk_mutex. 
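         *
         * Summarizing the resulting lock order in this function:
         * device_list_mutex is held for the dev extent deletions, and
         * chunk_mutex is taken only afterwards for the chunk btree and
         * superblock array updates. The device replace finishing phase
         * takes both in that same order, so holding either one serializes
         * against it.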
3146  */
3147         mutex_lock(&fs_devices->device_list_mutex);
3148         for (i = 0; i < map->num_stripes; i++) {
3149                 struct btrfs_device *device = map->stripes[i].dev;
3150                 ret = btrfs_free_dev_extent(trans, device,
3151                                             map->stripes[i].physical,
3152                                             &dev_extent_len);
3153                 if (ret) {
3154                         mutex_unlock(&fs_devices->device_list_mutex);
3155                         btrfs_abort_transaction(trans, ret);
3156                         goto out;
3157                 }
3158 
3159                 if (device->bytes_used > 0) {
3160                         mutex_lock(&fs_info->chunk_mutex);
3161                         btrfs_device_set_bytes_used(device,
3162                                         device->bytes_used - dev_extent_len);
3163                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
3164                         btrfs_clear_space_info_full(fs_info);
3165                         mutex_unlock(&fs_info->chunk_mutex);
3166                 }
3167         }
3168         mutex_unlock(&fs_devices->device_list_mutex);
3169 
3170         /*
3171          * We acquire fs_info->chunk_mutex for 2 reasons:
3172          *
3173          * 1) Just like with the first phase of the chunk allocation, we must
3174          *    reserve system space, do all chunk btree updates and deletions, and
3175          *    update the system chunk array in the superblock while holding this
3176          *    mutex. This is for similar reasons as explained on the comment at
3177          *    the top of btrfs_chunk_alloc();
3178          *
3179          * 2) Prevent races with the final phase of a device replace operation
3180          *    that replaces the device object associated with the map's stripes,
3181          *    because the device object's id can change at any time during that
3182          *    final phase of the device replace operation
3183          *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
3184          *    replaced device and then see it with an ID of
3185          *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
3186          *    the device item, which does not exist in the chunk btree.
3187          *    The finishing phase of device replace acquires both the
3188          *    device_list_mutex and the chunk_mutex, in that order, so we are
3189          *    safe by just acquiring the chunk_mutex.
3190          */
3191         trans->removing_chunk = true;
3192         mutex_lock(&fs_info->chunk_mutex);
3193 
3194         check_system_chunk(trans, map->type);
3195 
3196         ret = remove_chunk_item(trans, map, chunk_offset);
3197         /*
3198          * Normally we should not get -ENOSPC since we reserved space before
3199          * through the call to check_system_chunk().
3200          *
3201          * Despite our system space_info having enough free space, we may not
3202          * be able to allocate extents from its block groups, because all have
3203          * an incompatible profile, which will force us to allocate a new system
3204          * block group with the right profile, or right after we called
3205          * check_system_chunk() above, a scrub turned the only system block group
3206          * with enough free space into RO mode.
3207          * This is explained in more detail at do_chunk_alloc().
3208          *
3209          * So if we get -ENOSPC, allocate a new system chunk and retry once.
3210 */ 3211 if (ret == -ENOSPC) { 3212 const u64 sys_flags = btrfs_system_alloc_profile(fs_info); 3213 struct btrfs_block_group *sys_bg; 3214 3215 sys_bg = btrfs_create_chunk(trans, sys_flags); 3216 if (IS_ERR(sys_bg)) { 3217 ret = PTR_ERR(sys_bg); 3218 btrfs_abort_transaction(trans, ret); 3219 goto out; 3220 } 3221 3222 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg); 3223 if (ret) { 3224 btrfs_abort_transaction(trans, ret); 3225 goto out; 3226 } 3227 3228 ret = remove_chunk_item(trans, map, chunk_offset); 3229 if (ret) { 3230 btrfs_abort_transaction(trans, ret); 3231 goto out; 3232 } 3233 } else if (ret) { 3234 btrfs_abort_transaction(trans, ret); 3235 goto out; 3236 } 3237 3238 trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); 3239 3240 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3241 ret = btrfs_del_sys_chunk(fs_info, chunk_offset); 3242 if (ret) { 3243 btrfs_abort_transaction(trans, ret); 3244 goto out; 3245 } 3246 } 3247 3248 mutex_unlock(&fs_info->chunk_mutex); 3249 trans->removing_chunk = false; 3250 3251 /* 3252 * We are done with chunk btree updates and deletions, so release the 3253 * system space we previously reserved (with check_system_chunk()). 3254 */ 3255 btrfs_trans_release_chunk_metadata(trans); 3256 3257 ret = btrfs_remove_block_group(trans, chunk_offset, em); 3258 if (ret) { 3259 btrfs_abort_transaction(trans, ret); 3260 goto out; 3261 } 3262 3263 out: 3264 if (trans->removing_chunk) { 3265 mutex_unlock(&fs_info->chunk_mutex); 3266 trans->removing_chunk = false; 3267 } 3268 /* once for us */ 3269 free_extent_map(em); 3270 return ret; 3271 } 3272 3273 int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) 3274 { 3275 struct btrfs_root *root = fs_info->chunk_root; 3276 struct btrfs_trans_handle *trans; 3277 struct btrfs_block_group *block_group; 3278 u64 length; 3279 int ret; 3280 3281 /* 3282 * Prevent races with automatic removal of unused block groups. 3283 * After we relocate and before we remove the chunk with offset 3284 * chunk_offset, automatic removal of the block group can kick in, 3285 * resulting in a failure when calling btrfs_remove_chunk() below. 3286 * 3287 * Make sure to acquire this mutex before doing a tree search (dev 3288 * or chunk trees) to find chunks. Otherwise the cleaner kthread might 3289 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after 3290 * we release the path used to search the chunk/dev tree and before 3291 * the current task acquires this mutex and calls us. 3292 */ 3293 lockdep_assert_held(&fs_info->reclaim_bgs_lock); 3294 3295 /* step one, relocate all the extents inside this chunk */ 3296 btrfs_scrub_pause(fs_info); 3297 ret = btrfs_relocate_block_group(fs_info, chunk_offset); 3298 btrfs_scrub_continue(fs_info); 3299 if (ret) 3300 return ret; 3301 3302 block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 3303 if (!block_group) 3304 return -ENOENT; 3305 btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 3306 length = block_group->length; 3307 btrfs_put_block_group(block_group); 3308 3309 /* 3310 * On a zoned file system, discard the whole block group, this will 3311 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If 3312 * resetting the zone fails, don't treat it as a fatal problem from the 3313 * filesystem's point of view. 
3314 */ 3315 if (btrfs_is_zoned(fs_info)) { 3316 ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); 3317 if (ret) 3318 btrfs_info(fs_info, 3319 "failed to reset zone %llu after relocation", 3320 chunk_offset); 3321 } 3322 3323 trans = btrfs_start_trans_remove_block_group(root->fs_info, 3324 chunk_offset); 3325 if (IS_ERR(trans)) { 3326 ret = PTR_ERR(trans); 3327 btrfs_handle_fs_error(root->fs_info, ret, NULL); 3328 return ret; 3329 } 3330 3331 /* 3332 * step two, delete the device extents and the 3333 * chunk tree entries 3334 */ 3335 ret = btrfs_remove_chunk(trans, chunk_offset); 3336 btrfs_end_transaction(trans); 3337 return ret; 3338 } 3339 3340 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) 3341 { 3342 struct btrfs_root *chunk_root = fs_info->chunk_root; 3343 struct btrfs_path *path; 3344 struct extent_buffer *leaf; 3345 struct btrfs_chunk *chunk; 3346 struct btrfs_key key; 3347 struct btrfs_key found_key; 3348 u64 chunk_type; 3349 bool retried = false; 3350 int failed = 0; 3351 int ret; 3352 3353 path = btrfs_alloc_path(); 3354 if (!path) 3355 return -ENOMEM; 3356 3357 again: 3358 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 3359 key.offset = (u64)-1; 3360 key.type = BTRFS_CHUNK_ITEM_KEY; 3361 3362 while (1) { 3363 mutex_lock(&fs_info->reclaim_bgs_lock); 3364 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 3365 if (ret < 0) { 3366 mutex_unlock(&fs_info->reclaim_bgs_lock); 3367 goto error; 3368 } 3369 BUG_ON(ret == 0); /* Corruption */ 3370 3371 ret = btrfs_previous_item(chunk_root, path, key.objectid, 3372 key.type); 3373 if (ret) 3374 mutex_unlock(&fs_info->reclaim_bgs_lock); 3375 if (ret < 0) 3376 goto error; 3377 if (ret > 0) 3378 break; 3379 3380 leaf = path->nodes[0]; 3381 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3382 3383 chunk = btrfs_item_ptr(leaf, path->slots[0], 3384 struct btrfs_chunk); 3385 chunk_type = btrfs_chunk_type(leaf, chunk); 3386 btrfs_release_path(path); 3387 3388 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 3389 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 3390 if (ret == -ENOSPC) 3391 failed++; 3392 else 3393 BUG_ON(ret); 3394 } 3395 mutex_unlock(&fs_info->reclaim_bgs_lock); 3396 3397 if (found_key.offset == 0) 3398 break; 3399 key.offset = found_key.offset - 1; 3400 } 3401 ret = 0; 3402 if (failed && !retried) { 3403 failed = 0; 3404 retried = true; 3405 goto again; 3406 } else if (WARN_ON(failed && retried)) { 3407 ret = -ENOSPC; 3408 } 3409 error: 3410 btrfs_free_path(path); 3411 return ret; 3412 } 3413 3414 /* 3415 * return 1 : allocate a data chunk successfully, 3416 * return <0: errors during allocating a data chunk, 3417 * return 0 : no need to allocate a data chunk. 
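 *
 * Illustrative caller pattern (a sketch matching how __btrfs_balance()
 * consumes these return values further below, locking elided):
 *
 *      ret = btrfs_may_alloc_data_chunk(fs_info, found_key.offset);
 *      if (ret < 0)
 *              goto error;
 *      else if (ret == 1)
 *              chunk_reserved = 1;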
3418 */ 3419 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info, 3420 u64 chunk_offset) 3421 { 3422 struct btrfs_block_group *cache; 3423 u64 bytes_used; 3424 u64 chunk_type; 3425 3426 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3427 ASSERT(cache); 3428 chunk_type = cache->flags; 3429 btrfs_put_block_group(cache); 3430 3431 if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA)) 3432 return 0; 3433 3434 spin_lock(&fs_info->data_sinfo->lock); 3435 bytes_used = fs_info->data_sinfo->bytes_used; 3436 spin_unlock(&fs_info->data_sinfo->lock); 3437 3438 if (!bytes_used) { 3439 struct btrfs_trans_handle *trans; 3440 int ret; 3441 3442 trans = btrfs_join_transaction(fs_info->tree_root); 3443 if (IS_ERR(trans)) 3444 return PTR_ERR(trans); 3445 3446 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA); 3447 btrfs_end_transaction(trans); 3448 if (ret < 0) 3449 return ret; 3450 return 1; 3451 } 3452 3453 return 0; 3454 } 3455 3456 static int insert_balance_item(struct btrfs_fs_info *fs_info, 3457 struct btrfs_balance_control *bctl) 3458 { 3459 struct btrfs_root *root = fs_info->tree_root; 3460 struct btrfs_trans_handle *trans; 3461 struct btrfs_balance_item *item; 3462 struct btrfs_disk_balance_args disk_bargs; 3463 struct btrfs_path *path; 3464 struct extent_buffer *leaf; 3465 struct btrfs_key key; 3466 int ret, err; 3467 3468 path = btrfs_alloc_path(); 3469 if (!path) 3470 return -ENOMEM; 3471 3472 trans = btrfs_start_transaction(root, 0); 3473 if (IS_ERR(trans)) { 3474 btrfs_free_path(path); 3475 return PTR_ERR(trans); 3476 } 3477 3478 key.objectid = BTRFS_BALANCE_OBJECTID; 3479 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3480 key.offset = 0; 3481 3482 ret = btrfs_insert_empty_item(trans, root, path, &key, 3483 sizeof(*item)); 3484 if (ret) 3485 goto out; 3486 3487 leaf = path->nodes[0]; 3488 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 3489 3490 memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item)); 3491 3492 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data); 3493 btrfs_set_balance_data(leaf, item, &disk_bargs); 3494 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta); 3495 btrfs_set_balance_meta(leaf, item, &disk_bargs); 3496 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys); 3497 btrfs_set_balance_sys(leaf, item, &disk_bargs); 3498 3499 btrfs_set_balance_flags(leaf, item, bctl->flags); 3500 3501 btrfs_mark_buffer_dirty(leaf); 3502 out: 3503 btrfs_free_path(path); 3504 err = btrfs_commit_transaction(trans); 3505 if (err && !ret) 3506 ret = err; 3507 return ret; 3508 } 3509 3510 static int del_balance_item(struct btrfs_fs_info *fs_info) 3511 { 3512 struct btrfs_root *root = fs_info->tree_root; 3513 struct btrfs_trans_handle *trans; 3514 struct btrfs_path *path; 3515 struct btrfs_key key; 3516 int ret, err; 3517 3518 path = btrfs_alloc_path(); 3519 if (!path) 3520 return -ENOMEM; 3521 3522 trans = btrfs_start_transaction_fallback_global_rsv(root, 0); 3523 if (IS_ERR(trans)) { 3524 btrfs_free_path(path); 3525 return PTR_ERR(trans); 3526 } 3527 3528 key.objectid = BTRFS_BALANCE_OBJECTID; 3529 key.type = BTRFS_TEMPORARY_ITEM_KEY; 3530 key.offset = 0; 3531 3532 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 3533 if (ret < 0) 3534 goto out; 3535 if (ret > 0) { 3536 ret = -ENOENT; 3537 goto out; 3538 } 3539 3540 ret = btrfs_del_item(trans, root, path); 3541 out: 3542 btrfs_free_path(path); 3543 err = btrfs_commit_transaction(trans); 3544 if (err && !ret) 3545 ret = err; 3546 return ret; 3547 } 3548 3549 /* 3550 * This is a 
heuristic used to reduce the number of chunks balanced on
3551  * resume after balance was interrupted.
3552  */
3553 static void update_balance_args(struct btrfs_balance_control *bctl)
3554 {
3555         /*
3556          * Turn on soft mode for chunk types that were being converted.
3557          */
3558         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3559                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3560         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3561                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3562         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3563                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3564 
3565         /*
3566          * Turn on the usage filter if it is not already in use. The idea is
3567          * that chunks that we have already balanced should be
3568          * reasonably full. Don't do it for chunks that are being
3569          * converted - that will keep us from relocating unconverted
3570          * (albeit full) chunks.
3571          */
3572         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3573             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3574             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3575                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3576                 bctl->data.usage = 90;
3577         }
3578         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3579             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3580             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3581                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3582                 bctl->sys.usage = 90;
3583         }
3584         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3585             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3586             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3587                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3588                 bctl->meta.usage = 90;
3589         }
3590 }
3591 
3592 /*
3593  * Clear the balance status in fs_info and delete the balance item from disk.
3594  */
3595 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3596 {
3597         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3598         int ret;
3599 
3600         BUG_ON(!fs_info->balance_ctl);
3601 
3602         spin_lock(&fs_info->balance_lock);
3603         fs_info->balance_ctl = NULL;
3604         spin_unlock(&fs_info->balance_lock);
3605 
3606         kfree(bctl);
3607         ret = del_balance_item(fs_info);
3608         if (ret)
3609                 btrfs_handle_fs_error(fs_info, ret, NULL);
3610 }
3611 
3612 /*
3613  * Balance filters. Return 1 if chunk should be filtered out
3614  * (should not be balanced).
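 *
 * For illustration, the calling convention (see should_balance_chunk()
 * below): a chunk is balanced only if every enabled filter returns 0,
 * e.g. for the profiles filter:
 *
 *      if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
 *          chunk_profiles_filter(chunk_type, bargs))
 *              return 0;       (filtered out: not balanced)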
3615 */ 3616 static int chunk_profiles_filter(u64 chunk_type, 3617 struct btrfs_balance_args *bargs) 3618 { 3619 chunk_type = chunk_to_extended(chunk_type) & 3620 BTRFS_EXTENDED_PROFILE_MASK; 3621 3622 if (bargs->profiles & chunk_type) 3623 return 0; 3624 3625 return 1; 3626 } 3627 3628 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3629 struct btrfs_balance_args *bargs) 3630 { 3631 struct btrfs_block_group *cache; 3632 u64 chunk_used; 3633 u64 user_thresh_min; 3634 u64 user_thresh_max; 3635 int ret = 1; 3636 3637 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3638 chunk_used = cache->used; 3639 3640 if (bargs->usage_min == 0) 3641 user_thresh_min = 0; 3642 else 3643 user_thresh_min = div_factor_fine(cache->length, 3644 bargs->usage_min); 3645 3646 if (bargs->usage_max == 0) 3647 user_thresh_max = 1; 3648 else if (bargs->usage_max > 100) 3649 user_thresh_max = cache->length; 3650 else 3651 user_thresh_max = div_factor_fine(cache->length, 3652 bargs->usage_max); 3653 3654 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) 3655 ret = 0; 3656 3657 btrfs_put_block_group(cache); 3658 return ret; 3659 } 3660 3661 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, 3662 u64 chunk_offset, struct btrfs_balance_args *bargs) 3663 { 3664 struct btrfs_block_group *cache; 3665 u64 chunk_used, user_thresh; 3666 int ret = 1; 3667 3668 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3669 chunk_used = cache->used; 3670 3671 if (bargs->usage_min == 0) 3672 user_thresh = 1; 3673 else if (bargs->usage > 100) 3674 user_thresh = cache->length; 3675 else 3676 user_thresh = div_factor_fine(cache->length, bargs->usage); 3677 3678 if (chunk_used < user_thresh) 3679 ret = 0; 3680 3681 btrfs_put_block_group(cache); 3682 return ret; 3683 } 3684 3685 static int chunk_devid_filter(struct extent_buffer *leaf, 3686 struct btrfs_chunk *chunk, 3687 struct btrfs_balance_args *bargs) 3688 { 3689 struct btrfs_stripe *stripe; 3690 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3691 int i; 3692 3693 for (i = 0; i < num_stripes; i++) { 3694 stripe = btrfs_stripe_nr(chunk, i); 3695 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) 3696 return 0; 3697 } 3698 3699 return 1; 3700 } 3701 3702 static u64 calc_data_stripes(u64 type, int num_stripes) 3703 { 3704 const int index = btrfs_bg_flags_to_raid_index(type); 3705 const int ncopies = btrfs_raid_array[index].ncopies; 3706 const int nparity = btrfs_raid_array[index].nparity; 3707 3708 return (num_stripes - nparity) / ncopies; 3709 } 3710 3711 /* [pstart, pend) */ 3712 static int chunk_drange_filter(struct extent_buffer *leaf, 3713 struct btrfs_chunk *chunk, 3714 struct btrfs_balance_args *bargs) 3715 { 3716 struct btrfs_stripe *stripe; 3717 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3718 u64 stripe_offset; 3719 u64 stripe_length; 3720 u64 type; 3721 int factor; 3722 int i; 3723 3724 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) 3725 return 0; 3726 3727 type = btrfs_chunk_type(leaf, chunk); 3728 factor = calc_data_stripes(type, num_stripes); 3729 3730 for (i = 0; i < num_stripes; i++) { 3731 stripe = btrfs_stripe_nr(chunk, i); 3732 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid) 3733 continue; 3734 3735 stripe_offset = btrfs_stripe_offset(leaf, stripe); 3736 stripe_length = btrfs_chunk_length(leaf, chunk); 3737 stripe_length = div_u64(stripe_length, factor); 3738 3739 if (stripe_offset < bargs->pend && 3740 stripe_offset + stripe_length > bargs->pstart) 3741 return 0; 
3742 } 3743 3744 return 1; 3745 } 3746 3747 /* [vstart, vend) */ 3748 static int chunk_vrange_filter(struct extent_buffer *leaf, 3749 struct btrfs_chunk *chunk, 3750 u64 chunk_offset, 3751 struct btrfs_balance_args *bargs) 3752 { 3753 if (chunk_offset < bargs->vend && 3754 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) 3755 /* at least part of the chunk is inside this vrange */ 3756 return 0; 3757 3758 return 1; 3759 } 3760 3761 static int chunk_stripes_range_filter(struct extent_buffer *leaf, 3762 struct btrfs_chunk *chunk, 3763 struct btrfs_balance_args *bargs) 3764 { 3765 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3766 3767 if (bargs->stripes_min <= num_stripes 3768 && num_stripes <= bargs->stripes_max) 3769 return 0; 3770 3771 return 1; 3772 } 3773 3774 static int chunk_soft_convert_filter(u64 chunk_type, 3775 struct btrfs_balance_args *bargs) 3776 { 3777 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 3778 return 0; 3779 3780 chunk_type = chunk_to_extended(chunk_type) & 3781 BTRFS_EXTENDED_PROFILE_MASK; 3782 3783 if (bargs->target == chunk_type) 3784 return 1; 3785 3786 return 0; 3787 } 3788 3789 static int should_balance_chunk(struct extent_buffer *leaf, 3790 struct btrfs_chunk *chunk, u64 chunk_offset) 3791 { 3792 struct btrfs_fs_info *fs_info = leaf->fs_info; 3793 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 3794 struct btrfs_balance_args *bargs = NULL; 3795 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 3796 3797 /* type filter */ 3798 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 3799 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { 3800 return 0; 3801 } 3802 3803 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3804 bargs = &bctl->data; 3805 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3806 bargs = &bctl->sys; 3807 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3808 bargs = &bctl->meta; 3809 3810 /* profiles filter */ 3811 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && 3812 chunk_profiles_filter(chunk_type, bargs)) { 3813 return 0; 3814 } 3815 3816 /* usage filter */ 3817 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3818 chunk_usage_filter(fs_info, chunk_offset, bargs)) { 3819 return 0; 3820 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && 3821 chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { 3822 return 0; 3823 } 3824 3825 /* devid filter */ 3826 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 3827 chunk_devid_filter(leaf, chunk, bargs)) { 3828 return 0; 3829 } 3830 3831 /* drange filter, makes sense only with devid filter */ 3832 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && 3833 chunk_drange_filter(leaf, chunk, bargs)) { 3834 return 0; 3835 } 3836 3837 /* vrange filter */ 3838 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && 3839 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { 3840 return 0; 3841 } 3842 3843 /* stripes filter */ 3844 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && 3845 chunk_stripes_range_filter(leaf, chunk, bargs)) { 3846 return 0; 3847 } 3848 3849 /* soft profile changing mode */ 3850 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3851 chunk_soft_convert_filter(chunk_type, bargs)) { 3852 return 0; 3853 } 3854 3855 /* 3856 * limited by count, must be the last filter 3857 */ 3858 if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { 3859 if (bargs->limit == 0) 3860 return 0; 3861 else 3862 bargs->limit--; 3863 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { 3864 /* 3865 * Same logic as the 'limit' filter; the minimum cannot be 3866 * determined here 
because we do not have the global
		 * information about the count of all chunks that satisfy the
		 * filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
	}

	return 1;
}

static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	u64 chunk_type;
	struct btrfs_chunk *chunk;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;
	int ret;
	int enospc_errors = 0;
	bool counting = true;
	/*
	 * The single value limit and the min/max limits share the same bytes
	 * in struct btrfs_balance_args (they are members of a union), so save
	 * the values here and restore them before the second, chunk-moving
	 * pass.
	 */
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
	int chunk_reserved = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
	if (!counting) {
		/*
		 * The single value limit and the min/max limits share the
		 * same bytes in the balance args union; restore the values
		 * saved above, as the counting pass consumed them.
		 */
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
		    atomic_read(&fs_info->balance_cancel_req)) {
			ret = -ECANCELED;
			goto error;
		}

		mutex_lock(&fs_info->reclaim_bgs_lock);
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0) {
			mutex_unlock(&fs_info->reclaim_bgs_lock);
			goto error;
		}

		/*
		 * This shouldn't happen, it means the last relocate
		 * failed.
		 */
		if (ret == 0)
			BUG(); /* FIXME break ?
*/ 3945 3946 ret = btrfs_previous_item(chunk_root, path, 0, 3947 BTRFS_CHUNK_ITEM_KEY); 3948 if (ret) { 3949 mutex_unlock(&fs_info->reclaim_bgs_lock); 3950 ret = 0; 3951 break; 3952 } 3953 3954 leaf = path->nodes[0]; 3955 slot = path->slots[0]; 3956 btrfs_item_key_to_cpu(leaf, &found_key, slot); 3957 3958 if (found_key.objectid != key.objectid) { 3959 mutex_unlock(&fs_info->reclaim_bgs_lock); 3960 break; 3961 } 3962 3963 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3964 chunk_type = btrfs_chunk_type(leaf, chunk); 3965 3966 if (!counting) { 3967 spin_lock(&fs_info->balance_lock); 3968 bctl->stat.considered++; 3969 spin_unlock(&fs_info->balance_lock); 3970 } 3971 3972 ret = should_balance_chunk(leaf, chunk, found_key.offset); 3973 3974 btrfs_release_path(path); 3975 if (!ret) { 3976 mutex_unlock(&fs_info->reclaim_bgs_lock); 3977 goto loop; 3978 } 3979 3980 if (counting) { 3981 mutex_unlock(&fs_info->reclaim_bgs_lock); 3982 spin_lock(&fs_info->balance_lock); 3983 bctl->stat.expected++; 3984 spin_unlock(&fs_info->balance_lock); 3985 3986 if (chunk_type & BTRFS_BLOCK_GROUP_DATA) 3987 count_data++; 3988 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 3989 count_sys++; 3990 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 3991 count_meta++; 3992 3993 goto loop; 3994 } 3995 3996 /* 3997 * Apply limit_min filter, no need to check if the LIMITS 3998 * filter is used, limit_min is 0 by default 3999 */ 4000 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && 4001 count_data < bctl->data.limit_min) 4002 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && 4003 count_meta < bctl->meta.limit_min) 4004 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 4005 count_sys < bctl->sys.limit_min)) { 4006 mutex_unlock(&fs_info->reclaim_bgs_lock); 4007 goto loop; 4008 } 4009 4010 if (!chunk_reserved) { 4011 /* 4012 * We may be relocating the only data chunk we have, 4013 * which could potentially end up with losing data's 4014 * raid profile, so lets allocate an empty one in 4015 * advance. 4016 */ 4017 ret = btrfs_may_alloc_data_chunk(fs_info, 4018 found_key.offset); 4019 if (ret < 0) { 4020 mutex_unlock(&fs_info->reclaim_bgs_lock); 4021 goto error; 4022 } else if (ret == 1) { 4023 chunk_reserved = 1; 4024 } 4025 } 4026 4027 ret = btrfs_relocate_chunk(fs_info, found_key.offset); 4028 mutex_unlock(&fs_info->reclaim_bgs_lock); 4029 if (ret == -ENOSPC) { 4030 enospc_errors++; 4031 } else if (ret == -ETXTBSY) { 4032 btrfs_info(fs_info, 4033 "skipping relocation of block group %llu due to active swapfile", 4034 found_key.offset); 4035 ret = 0; 4036 } else if (ret) { 4037 goto error; 4038 } else { 4039 spin_lock(&fs_info->balance_lock); 4040 bctl->stat.completed++; 4041 spin_unlock(&fs_info->balance_lock); 4042 } 4043 loop: 4044 if (found_key.offset == 0) 4045 break; 4046 key.offset = found_key.offset - 1; 4047 } 4048 4049 if (counting) { 4050 btrfs_release_path(path); 4051 counting = false; 4052 goto again; 4053 } 4054 error: 4055 btrfs_free_path(path); 4056 if (enospc_errors) { 4057 btrfs_info(fs_info, "%d enospc errors during balance", 4058 enospc_errors); 4059 if (!ret) 4060 ret = -ENOSPC; 4061 } 4062 4063 return ret; 4064 } 4065 4066 /** 4067 * alloc_profile_is_valid - see if a given profile is valid and reduced 4068 * @flags: profile to validate 4069 * @extended: if true @flags is treated as an extended profile 4070 */ 4071 static int alloc_profile_is_valid(u64 flags, int extended) 4072 { 4073 u64 mask = (extended ? 
BTRFS_EXTENDED_PROFILE_MASK : 4074 BTRFS_BLOCK_GROUP_PROFILE_MASK); 4075 4076 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK; 4077 4078 /* 1) check that all other bits are zeroed */ 4079 if (flags & ~mask) 4080 return 0; 4081 4082 /* 2) see if profile is reduced */ 4083 if (flags == 0) 4084 return !extended; /* "0" is valid for usual profiles */ 4085 4086 return has_single_bit_set(flags); 4087 } 4088 4089 static inline int balance_need_close(struct btrfs_fs_info *fs_info) 4090 { 4091 /* cancel requested || normal exit path */ 4092 return atomic_read(&fs_info->balance_cancel_req) || 4093 (atomic_read(&fs_info->balance_pause_req) == 0 && 4094 atomic_read(&fs_info->balance_cancel_req) == 0); 4095 } 4096 4097 /* 4098 * Validate target profile against allowed profiles and return true if it's OK. 4099 * Otherwise print the error message and return false. 4100 */ 4101 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, 4102 const struct btrfs_balance_args *bargs, 4103 u64 allowed, const char *type) 4104 { 4105 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) 4106 return true; 4107 4108 if (fs_info->sectorsize < PAGE_SIZE && 4109 bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { 4110 btrfs_err(fs_info, 4111 "RAID56 is not yet supported for sectorsize %u with page size %lu", 4112 fs_info->sectorsize, PAGE_SIZE); 4113 return false; 4114 } 4115 /* Profile is valid and does not have bits outside of the allowed set */ 4116 if (alloc_profile_is_valid(bargs->target, 1) && 4117 (bargs->target & ~allowed) == 0) 4118 return true; 4119 4120 btrfs_err(fs_info, "balance: invalid convert %s profile %s", 4121 type, btrfs_bg_type_to_raid_name(bargs->target)); 4122 return false; 4123 } 4124 4125 /* 4126 * Fill @buf with textual description of balance filter flags @bargs, up to 4127 * @size_buf including the terminating null. The output may be trimmed if it 4128 * does not fit into the provided buffer. 
4129 */ 4130 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, 4131 u32 size_buf) 4132 { 4133 int ret; 4134 u32 size_bp = size_buf; 4135 char *bp = buf; 4136 u64 flags = bargs->flags; 4137 char tmp_buf[128] = {'\0'}; 4138 4139 if (!flags) 4140 return; 4141 4142 #define CHECK_APPEND_NOARG(a) \ 4143 do { \ 4144 ret = snprintf(bp, size_bp, (a)); \ 4145 if (ret < 0 || ret >= size_bp) \ 4146 goto out_overflow; \ 4147 size_bp -= ret; \ 4148 bp += ret; \ 4149 } while (0) 4150 4151 #define CHECK_APPEND_1ARG(a, v1) \ 4152 do { \ 4153 ret = snprintf(bp, size_bp, (a), (v1)); \ 4154 if (ret < 0 || ret >= size_bp) \ 4155 goto out_overflow; \ 4156 size_bp -= ret; \ 4157 bp += ret; \ 4158 } while (0) 4159 4160 #define CHECK_APPEND_2ARG(a, v1, v2) \ 4161 do { \ 4162 ret = snprintf(bp, size_bp, (a), (v1), (v2)); \ 4163 if (ret < 0 || ret >= size_bp) \ 4164 goto out_overflow; \ 4165 size_bp -= ret; \ 4166 bp += ret; \ 4167 } while (0) 4168 4169 if (flags & BTRFS_BALANCE_ARGS_CONVERT) 4170 CHECK_APPEND_1ARG("convert=%s,", 4171 btrfs_bg_type_to_raid_name(bargs->target)); 4172 4173 if (flags & BTRFS_BALANCE_ARGS_SOFT) 4174 CHECK_APPEND_NOARG("soft,"); 4175 4176 if (flags & BTRFS_BALANCE_ARGS_PROFILES) { 4177 btrfs_describe_block_groups(bargs->profiles, tmp_buf, 4178 sizeof(tmp_buf)); 4179 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf); 4180 } 4181 4182 if (flags & BTRFS_BALANCE_ARGS_USAGE) 4183 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage); 4184 4185 if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) 4186 CHECK_APPEND_2ARG("usage=%u..%u,", 4187 bargs->usage_min, bargs->usage_max); 4188 4189 if (flags & BTRFS_BALANCE_ARGS_DEVID) 4190 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid); 4191 4192 if (flags & BTRFS_BALANCE_ARGS_DRANGE) 4193 CHECK_APPEND_2ARG("drange=%llu..%llu,", 4194 bargs->pstart, bargs->pend); 4195 4196 if (flags & BTRFS_BALANCE_ARGS_VRANGE) 4197 CHECK_APPEND_2ARG("vrange=%llu..%llu,", 4198 bargs->vstart, bargs->vend); 4199 4200 if (flags & BTRFS_BALANCE_ARGS_LIMIT) 4201 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit); 4202 4203 if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE) 4204 CHECK_APPEND_2ARG("limit=%u..%u,", 4205 bargs->limit_min, bargs->limit_max); 4206 4207 if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) 4208 CHECK_APPEND_2ARG("stripes=%u..%u,", 4209 bargs->stripes_min, bargs->stripes_max); 4210 4211 #undef CHECK_APPEND_2ARG 4212 #undef CHECK_APPEND_1ARG 4213 #undef CHECK_APPEND_NOARG 4214 4215 out_overflow: 4216 4217 if (size_bp < size_buf) 4218 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */ 4219 else 4220 buf[0] = '\0'; 4221 } 4222 4223 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info) 4224 { 4225 u32 size_buf = 1024; 4226 char tmp_buf[192] = {'\0'}; 4227 char *buf; 4228 char *bp; 4229 u32 size_bp = size_buf; 4230 int ret; 4231 struct btrfs_balance_control *bctl = fs_info->balance_ctl; 4232 4233 buf = kzalloc(size_buf, GFP_KERNEL); 4234 if (!buf) 4235 return; 4236 4237 bp = buf; 4238 4239 #define CHECK_APPEND_1ARG(a, v1) \ 4240 do { \ 4241 ret = snprintf(bp, size_bp, (a), (v1)); \ 4242 if (ret < 0 || ret >= size_bp) \ 4243 goto out_overflow; \ 4244 size_bp -= ret; \ 4245 bp += ret; \ 4246 } while (0) 4247 4248 if (bctl->flags & BTRFS_BALANCE_FORCE) 4249 CHECK_APPEND_1ARG("%s", "-f "); 4250 4251 if (bctl->flags & BTRFS_BALANCE_DATA) { 4252 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf)); 4253 CHECK_APPEND_1ARG("-d%s ", tmp_buf); 4254 } 4255 4256 if (bctl->flags & BTRFS_BALANCE_METADATA) { 4257 describe_balance_args(&bctl->meta, 
tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
	}

#undef CHECK_APPEND_1ARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
	btrfs_info(fs_info, "balance: %s %s",
		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
		   "resume" : "start", buf);

	kfree(buf);
}

/*
 * Should be called with the balance mutex held.
 */
int btrfs_balance(struct btrfs_fs_info *fs_info,
		  struct btrfs_balance_control *bctl,
		  struct btrfs_ioctl_balance_args *bargs)
{
	u64 meta_target, data_target;
	u64 allowed;
	int mixed = 0;
	int ret;
	u64 num_devices;
	unsigned seq;
	bool reducing_redundancy;
	int i;

	if (btrfs_fs_closing(fs_info) ||
	    atomic_read(&fs_info->balance_pause_req) ||
	    btrfs_should_cancel_balance(fs_info)) {
		ret = -EINVAL;
		goto out;
	}

	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	/*
	 * In case of mixed groups both data and meta should be picked,
	 * and identical options should be given for both of them.
	 */
	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
	if (mixed && (bctl->flags & allowed)) {
		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
			btrfs_err(fs_info,
	"balance: mixed groups data and metadata options must be the same");
			ret = -EINVAL;
			goto out;
		}
	}

	/*
	 * rw_devices will not change at the moment, device add/delete/replace
	 * are exclusive.
	 */
	num_devices = fs_info->fs_devices->rw_devices;

	/*
	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
	 * special bit for it, to make it easier to distinguish. Thus we need
	 * to set it manually, or balance would refuse the profile.
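 * For example, converting to "single" is expressed with the in-memory
 * BTRFS_AVAIL_ALLOC_BIT_SINGLE target bit, which is why that bit seeds
 * the @allowed mask built below.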
4332 */ 4333 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 4334 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) 4335 if (num_devices >= btrfs_raid_array[i].devs_min) 4336 allowed |= btrfs_raid_array[i].bg_flag; 4337 4338 if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") || 4339 !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") || 4340 !validate_convert_profile(fs_info, &bctl->sys, allowed, "system")) { 4341 ret = -EINVAL; 4342 goto out; 4343 } 4344 4345 /* 4346 * Allow to reduce metadata or system integrity only if force set for 4347 * profiles with redundancy (copies, parity) 4348 */ 4349 allowed = 0; 4350 for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { 4351 if (btrfs_raid_array[i].ncopies >= 2 || 4352 btrfs_raid_array[i].tolerated_failures >= 1) 4353 allowed |= btrfs_raid_array[i].bg_flag; 4354 } 4355 do { 4356 seq = read_seqbegin(&fs_info->profiles_lock); 4357 4358 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4359 (fs_info->avail_system_alloc_bits & allowed) && 4360 !(bctl->sys.target & allowed)) || 4361 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 4362 (fs_info->avail_metadata_alloc_bits & allowed) && 4363 !(bctl->meta.target & allowed))) 4364 reducing_redundancy = true; 4365 else 4366 reducing_redundancy = false; 4367 4368 /* if we're not converting, the target field is uninitialized */ 4369 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4370 bctl->meta.target : fs_info->avail_metadata_alloc_bits; 4371 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 4372 bctl->data.target : fs_info->avail_data_alloc_bits; 4373 } while (read_seqretry(&fs_info->profiles_lock, seq)); 4374 4375 if (reducing_redundancy) { 4376 if (bctl->flags & BTRFS_BALANCE_FORCE) { 4377 btrfs_info(fs_info, 4378 "balance: force reducing metadata redundancy"); 4379 } else { 4380 btrfs_err(fs_info, 4381 "balance: reduces metadata redundancy, use --force if you want this"); 4382 ret = -EINVAL; 4383 goto out; 4384 } 4385 } 4386 4387 if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < 4388 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { 4389 btrfs_warn(fs_info, 4390 "balance: metadata profile %s has lower redundancy than data profile %s", 4391 btrfs_bg_type_to_raid_name(meta_target), 4392 btrfs_bg_type_to_raid_name(data_target)); 4393 } 4394 4395 ret = insert_balance_item(fs_info, bctl); 4396 if (ret && ret != -EEXIST) 4397 goto out; 4398 4399 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) { 4400 BUG_ON(ret == -EEXIST); 4401 BUG_ON(fs_info->balance_ctl); 4402 spin_lock(&fs_info->balance_lock); 4403 fs_info->balance_ctl = bctl; 4404 spin_unlock(&fs_info->balance_lock); 4405 } else { 4406 BUG_ON(ret != -EEXIST); 4407 spin_lock(&fs_info->balance_lock); 4408 update_balance_args(bctl); 4409 spin_unlock(&fs_info->balance_lock); 4410 } 4411 4412 ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4413 set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4414 describe_balance_start_or_resume(fs_info); 4415 mutex_unlock(&fs_info->balance_mutex); 4416 4417 ret = __btrfs_balance(fs_info); 4418 4419 mutex_lock(&fs_info->balance_mutex); 4420 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { 4421 btrfs_info(fs_info, "balance: paused"); 4422 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); 4423 } 4424 /* 4425 * Balance can be canceled by: 4426 * 4427 * - Regular cancel request 4428 * Then ret == -ECANCELED and balance_cancel_req > 0 4429 * 4430 * - Fatal signal to "btrfs" process 4431 * 
Either the signal caught by wait_reserve_ticket() and callers 4432 * got -EINTR, or caught by btrfs_should_cancel_balance() and 4433 * got -ECANCELED. 4434 * Either way, in this case balance_cancel_req = 0, and 4435 * ret == -EINTR or ret == -ECANCELED. 4436 * 4437 * So here we only check the return value to catch canceled balance. 4438 */ 4439 else if (ret == -ECANCELED || ret == -EINTR) 4440 btrfs_info(fs_info, "balance: canceled"); 4441 else 4442 btrfs_info(fs_info, "balance: ended with status: %d", ret); 4443 4444 clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags); 4445 4446 if (bargs) { 4447 memset(bargs, 0, sizeof(*bargs)); 4448 btrfs_update_ioctl_balance_args(fs_info, bargs); 4449 } 4450 4451 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4452 balance_need_close(fs_info)) { 4453 reset_balance_state(fs_info); 4454 btrfs_exclop_finish(fs_info); 4455 } 4456 4457 wake_up(&fs_info->balance_wait_q); 4458 4459 return ret; 4460 out: 4461 if (bctl->flags & BTRFS_BALANCE_RESUME) 4462 reset_balance_state(fs_info); 4463 else 4464 kfree(bctl); 4465 btrfs_exclop_finish(fs_info); 4466 4467 return ret; 4468 } 4469 4470 static int balance_kthread(void *data) 4471 { 4472 struct btrfs_fs_info *fs_info = data; 4473 int ret = 0; 4474 4475 mutex_lock(&fs_info->balance_mutex); 4476 if (fs_info->balance_ctl) 4477 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL); 4478 mutex_unlock(&fs_info->balance_mutex); 4479 4480 return ret; 4481 } 4482 4483 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) 4484 { 4485 struct task_struct *tsk; 4486 4487 mutex_lock(&fs_info->balance_mutex); 4488 if (!fs_info->balance_ctl) { 4489 mutex_unlock(&fs_info->balance_mutex); 4490 return 0; 4491 } 4492 mutex_unlock(&fs_info->balance_mutex); 4493 4494 if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { 4495 btrfs_info(fs_info, "balance: resume skipped"); 4496 return 0; 4497 } 4498 4499 spin_lock(&fs_info->super_lock); 4500 ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); 4501 fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; 4502 spin_unlock(&fs_info->super_lock); 4503 /* 4504 * A ro->rw remount sequence should continue with the paused balance 4505 * regardless of who pauses it, system or the user as of now, so set 4506 * the resume flag. 
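 * (describe_balance_start_or_resume() keys its "resume" vs "start"
 * wording off this same BTRFS_BALANCE_RESUME flag.)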
4507 */ 4508 spin_lock(&fs_info->balance_lock); 4509 fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME; 4510 spin_unlock(&fs_info->balance_lock); 4511 4512 tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); 4513 return PTR_ERR_OR_ZERO(tsk); 4514 } 4515 4516 int btrfs_recover_balance(struct btrfs_fs_info *fs_info) 4517 { 4518 struct btrfs_balance_control *bctl; 4519 struct btrfs_balance_item *item; 4520 struct btrfs_disk_balance_args disk_bargs; 4521 struct btrfs_path *path; 4522 struct extent_buffer *leaf; 4523 struct btrfs_key key; 4524 int ret; 4525 4526 path = btrfs_alloc_path(); 4527 if (!path) 4528 return -ENOMEM; 4529 4530 key.objectid = BTRFS_BALANCE_OBJECTID; 4531 key.type = BTRFS_TEMPORARY_ITEM_KEY; 4532 key.offset = 0; 4533 4534 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); 4535 if (ret < 0) 4536 goto out; 4537 if (ret > 0) { /* ret = -ENOENT; */ 4538 ret = 0; 4539 goto out; 4540 } 4541 4542 bctl = kzalloc(sizeof(*bctl), GFP_NOFS); 4543 if (!bctl) { 4544 ret = -ENOMEM; 4545 goto out; 4546 } 4547 4548 leaf = path->nodes[0]; 4549 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item); 4550 4551 bctl->flags = btrfs_balance_flags(leaf, item); 4552 bctl->flags |= BTRFS_BALANCE_RESUME; 4553 4554 btrfs_balance_data(leaf, item, &disk_bargs); 4555 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs); 4556 btrfs_balance_meta(leaf, item, &disk_bargs); 4557 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs); 4558 btrfs_balance_sys(leaf, item, &disk_bargs); 4559 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); 4560 4561 /* 4562 * This should never happen, as the paused balance state is recovered 4563 * during mount without any chance of other exclusive ops to collide. 4564 * 4565 * This gives the exclusive op status to balance and keeps in paused 4566 * state until user intervention (cancel or umount). If the ownership 4567 * cannot be assigned, show a message but do not fail. The balance 4568 * is in a paused state and must have fs_info::balance_ctl properly 4569 * set up. 
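 * (e.g. a "btrfs balance resume" or "btrfs balance cancel" command
 * issued from userspace)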
4570 */ 4571 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED)) 4572 btrfs_warn(fs_info, 4573 "balance: cannot set exclusive op status, resume manually"); 4574 4575 btrfs_release_path(path); 4576 4577 mutex_lock(&fs_info->balance_mutex); 4578 BUG_ON(fs_info->balance_ctl); 4579 spin_lock(&fs_info->balance_lock); 4580 fs_info->balance_ctl = bctl; 4581 spin_unlock(&fs_info->balance_lock); 4582 mutex_unlock(&fs_info->balance_mutex); 4583 out: 4584 btrfs_free_path(path); 4585 return ret; 4586 } 4587 4588 int btrfs_pause_balance(struct btrfs_fs_info *fs_info) 4589 { 4590 int ret = 0; 4591 4592 mutex_lock(&fs_info->balance_mutex); 4593 if (!fs_info->balance_ctl) { 4594 mutex_unlock(&fs_info->balance_mutex); 4595 return -ENOTCONN; 4596 } 4597 4598 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4599 atomic_inc(&fs_info->balance_pause_req); 4600 mutex_unlock(&fs_info->balance_mutex); 4601 4602 wait_event(fs_info->balance_wait_q, 4603 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4604 4605 mutex_lock(&fs_info->balance_mutex); 4606 /* we are good with balance_ctl ripped off from under us */ 4607 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4608 atomic_dec(&fs_info->balance_pause_req); 4609 } else { 4610 ret = -ENOTCONN; 4611 } 4612 4613 mutex_unlock(&fs_info->balance_mutex); 4614 return ret; 4615 } 4616 4617 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) 4618 { 4619 mutex_lock(&fs_info->balance_mutex); 4620 if (!fs_info->balance_ctl) { 4621 mutex_unlock(&fs_info->balance_mutex); 4622 return -ENOTCONN; 4623 } 4624 4625 /* 4626 * A paused balance with the item stored on disk can be resumed at 4627 * mount time if the mount is read-write. Otherwise it's still paused 4628 * and we must not allow cancelling as it deletes the item. 4629 */ 4630 if (sb_rdonly(fs_info->sb)) { 4631 mutex_unlock(&fs_info->balance_mutex); 4632 return -EROFS; 4633 } 4634 4635 atomic_inc(&fs_info->balance_cancel_req); 4636 /* 4637 * if we are running just wait and return, balance item is 4638 * deleted in btrfs_balance in this case 4639 */ 4640 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { 4641 mutex_unlock(&fs_info->balance_mutex); 4642 wait_event(fs_info->balance_wait_q, 4643 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4644 mutex_lock(&fs_info->balance_mutex); 4645 } else { 4646 mutex_unlock(&fs_info->balance_mutex); 4647 /* 4648 * Lock released to allow other waiters to continue, we'll 4649 * reexamine the status again. 
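 * By the time the mutex is re-acquired the balance may already have
 * been torn down, hence the re-check of fs_info->balance_ctl below.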
4650 */ 4651 mutex_lock(&fs_info->balance_mutex); 4652 4653 if (fs_info->balance_ctl) { 4654 reset_balance_state(fs_info); 4655 btrfs_exclop_finish(fs_info); 4656 btrfs_info(fs_info, "balance: canceled"); 4657 } 4658 } 4659 4660 BUG_ON(fs_info->balance_ctl || 4661 test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)); 4662 atomic_dec(&fs_info->balance_cancel_req); 4663 mutex_unlock(&fs_info->balance_mutex); 4664 return 0; 4665 } 4666 4667 int btrfs_uuid_scan_kthread(void *data) 4668 { 4669 struct btrfs_fs_info *fs_info = data; 4670 struct btrfs_root *root = fs_info->tree_root; 4671 struct btrfs_key key; 4672 struct btrfs_path *path = NULL; 4673 int ret = 0; 4674 struct extent_buffer *eb; 4675 int slot; 4676 struct btrfs_root_item root_item; 4677 u32 item_size; 4678 struct btrfs_trans_handle *trans = NULL; 4679 bool closing = false; 4680 4681 path = btrfs_alloc_path(); 4682 if (!path) { 4683 ret = -ENOMEM; 4684 goto out; 4685 } 4686 4687 key.objectid = 0; 4688 key.type = BTRFS_ROOT_ITEM_KEY; 4689 key.offset = 0; 4690 4691 while (1) { 4692 if (btrfs_fs_closing(fs_info)) { 4693 closing = true; 4694 break; 4695 } 4696 ret = btrfs_search_forward(root, &key, path, 4697 BTRFS_OLDEST_GENERATION); 4698 if (ret) { 4699 if (ret > 0) 4700 ret = 0; 4701 break; 4702 } 4703 4704 if (key.type != BTRFS_ROOT_ITEM_KEY || 4705 (key.objectid < BTRFS_FIRST_FREE_OBJECTID && 4706 key.objectid != BTRFS_FS_TREE_OBJECTID) || 4707 key.objectid > BTRFS_LAST_FREE_OBJECTID) 4708 goto skip; 4709 4710 eb = path->nodes[0]; 4711 slot = path->slots[0]; 4712 item_size = btrfs_item_size(eb, slot); 4713 if (item_size < sizeof(root_item)) 4714 goto skip; 4715 4716 read_extent_buffer(eb, &root_item, 4717 btrfs_item_ptr_offset(eb, slot), 4718 (int)sizeof(root_item)); 4719 if (btrfs_root_refs(&root_item) == 0) 4720 goto skip; 4721 4722 if (!btrfs_is_empty_uuid(root_item.uuid) || 4723 !btrfs_is_empty_uuid(root_item.received_uuid)) { 4724 if (trans) 4725 goto update_tree; 4726 4727 btrfs_release_path(path); 4728 /* 4729 * 1 - subvol uuid item 4730 * 1 - received_subvol uuid item 4731 */ 4732 trans = btrfs_start_transaction(fs_info->uuid_root, 2); 4733 if (IS_ERR(trans)) { 4734 ret = PTR_ERR(trans); 4735 break; 4736 } 4737 continue; 4738 } else { 4739 goto skip; 4740 } 4741 update_tree: 4742 btrfs_release_path(path); 4743 if (!btrfs_is_empty_uuid(root_item.uuid)) { 4744 ret = btrfs_uuid_tree_add(trans, root_item.uuid, 4745 BTRFS_UUID_KEY_SUBVOL, 4746 key.objectid); 4747 if (ret < 0) { 4748 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4749 ret); 4750 break; 4751 } 4752 } 4753 4754 if (!btrfs_is_empty_uuid(root_item.received_uuid)) { 4755 ret = btrfs_uuid_tree_add(trans, 4756 root_item.received_uuid, 4757 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 4758 key.objectid); 4759 if (ret < 0) { 4760 btrfs_warn(fs_info, "uuid_tree_add failed %d", 4761 ret); 4762 break; 4763 } 4764 } 4765 4766 skip: 4767 btrfs_release_path(path); 4768 if (trans) { 4769 ret = btrfs_end_transaction(trans); 4770 trans = NULL; 4771 if (ret) 4772 break; 4773 } 4774 4775 if (key.offset < (u64)-1) { 4776 key.offset++; 4777 } else if (key.type < BTRFS_ROOT_ITEM_KEY) { 4778 key.offset = 0; 4779 key.type = BTRFS_ROOT_ITEM_KEY; 4780 } else if (key.objectid < (u64)-1) { 4781 key.offset = 0; 4782 key.type = BTRFS_ROOT_ITEM_KEY; 4783 key.objectid++; 4784 } else { 4785 break; 4786 } 4787 cond_resched(); 4788 } 4789 4790 out: 4791 btrfs_free_path(path); 4792 if (trans && !IS_ERR(trans)) 4793 btrfs_end_transaction(trans); 4794 if (ret) 4795 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread 
failed %d", ret); 4796 else if (!closing) 4797 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags); 4798 up(&fs_info->uuid_tree_rescan_sem); 4799 return 0; 4800 } 4801 4802 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) 4803 { 4804 struct btrfs_trans_handle *trans; 4805 struct btrfs_root *tree_root = fs_info->tree_root; 4806 struct btrfs_root *uuid_root; 4807 struct task_struct *task; 4808 int ret; 4809 4810 /* 4811 * 1 - root node 4812 * 1 - root item 4813 */ 4814 trans = btrfs_start_transaction(tree_root, 2); 4815 if (IS_ERR(trans)) 4816 return PTR_ERR(trans); 4817 4818 uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID); 4819 if (IS_ERR(uuid_root)) { 4820 ret = PTR_ERR(uuid_root); 4821 btrfs_abort_transaction(trans, ret); 4822 btrfs_end_transaction(trans); 4823 return ret; 4824 } 4825 4826 fs_info->uuid_root = uuid_root; 4827 4828 ret = btrfs_commit_transaction(trans); 4829 if (ret) 4830 return ret; 4831 4832 down(&fs_info->uuid_tree_rescan_sem); 4833 task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid"); 4834 if (IS_ERR(task)) { 4835 /* fs_info->update_uuid_tree_gen remains 0 in all error case */ 4836 btrfs_warn(fs_info, "failed to start uuid_scan task"); 4837 up(&fs_info->uuid_tree_rescan_sem); 4838 return PTR_ERR(task); 4839 } 4840 4841 return 0; 4842 } 4843 4844 /* 4845 * shrinking a device means finding all of the device extents past 4846 * the new size, and then following the back refs to the chunks. 4847 * The chunk relocation code actually frees the device extent 4848 */ 4849 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) 4850 { 4851 struct btrfs_fs_info *fs_info = device->fs_info; 4852 struct btrfs_root *root = fs_info->dev_root; 4853 struct btrfs_trans_handle *trans; 4854 struct btrfs_dev_extent *dev_extent = NULL; 4855 struct btrfs_path *path; 4856 u64 length; 4857 u64 chunk_offset; 4858 int ret; 4859 int slot; 4860 int failed = 0; 4861 bool retried = false; 4862 struct extent_buffer *l; 4863 struct btrfs_key key; 4864 struct btrfs_super_block *super_copy = fs_info->super_copy; 4865 u64 old_total = btrfs_super_total_bytes(super_copy); 4866 u64 old_size = btrfs_device_get_total_bytes(device); 4867 u64 diff; 4868 u64 start; 4869 4870 new_size = round_down(new_size, fs_info->sectorsize); 4871 start = new_size; 4872 diff = round_down(old_size - new_size, fs_info->sectorsize); 4873 4874 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 4875 return -EINVAL; 4876 4877 path = btrfs_alloc_path(); 4878 if (!path) 4879 return -ENOMEM; 4880 4881 path->reada = READA_BACK; 4882 4883 trans = btrfs_start_transaction(root, 0); 4884 if (IS_ERR(trans)) { 4885 btrfs_free_path(path); 4886 return PTR_ERR(trans); 4887 } 4888 4889 mutex_lock(&fs_info->chunk_mutex); 4890 4891 btrfs_device_set_total_bytes(device, new_size); 4892 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 4893 device->fs_devices->total_rw_bytes -= diff; 4894 atomic64_sub(diff, &fs_info->free_chunk_space); 4895 } 4896 4897 /* 4898 * Once the device's size has been set to the new size, ensure all 4899 * in-memory chunks are synced to disk so that the loop below sees them 4900 * and relocates them accordingly. 
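 * Committing the transaction below persists any pending chunk
 * allocations, so their device extent items are visible to the
 * search in the shrink loop.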
4901 */ 4902 if (contains_pending_extent(device, &start, diff)) { 4903 mutex_unlock(&fs_info->chunk_mutex); 4904 ret = btrfs_commit_transaction(trans); 4905 if (ret) 4906 goto done; 4907 } else { 4908 mutex_unlock(&fs_info->chunk_mutex); 4909 btrfs_end_transaction(trans); 4910 } 4911 4912 again: 4913 key.objectid = device->devid; 4914 key.offset = (u64)-1; 4915 key.type = BTRFS_DEV_EXTENT_KEY; 4916 4917 do { 4918 mutex_lock(&fs_info->reclaim_bgs_lock); 4919 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4920 if (ret < 0) { 4921 mutex_unlock(&fs_info->reclaim_bgs_lock); 4922 goto done; 4923 } 4924 4925 ret = btrfs_previous_item(root, path, 0, key.type); 4926 if (ret) { 4927 mutex_unlock(&fs_info->reclaim_bgs_lock); 4928 if (ret < 0) 4929 goto done; 4930 ret = 0; 4931 btrfs_release_path(path); 4932 break; 4933 } 4934 4935 l = path->nodes[0]; 4936 slot = path->slots[0]; 4937 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 4938 4939 if (key.objectid != device->devid) { 4940 mutex_unlock(&fs_info->reclaim_bgs_lock); 4941 btrfs_release_path(path); 4942 break; 4943 } 4944 4945 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 4946 length = btrfs_dev_extent_length(l, dev_extent); 4947 4948 if (key.offset + length <= new_size) { 4949 mutex_unlock(&fs_info->reclaim_bgs_lock); 4950 btrfs_release_path(path); 4951 break; 4952 } 4953 4954 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 4955 btrfs_release_path(path); 4956 4957 /* 4958 * We may be relocating the only data chunk we have, 4959 * which could potentially end up with losing data's 4960 * raid profile, so lets allocate an empty one in 4961 * advance. 4962 */ 4963 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); 4964 if (ret < 0) { 4965 mutex_unlock(&fs_info->reclaim_bgs_lock); 4966 goto done; 4967 } 4968 4969 ret = btrfs_relocate_chunk(fs_info, chunk_offset); 4970 mutex_unlock(&fs_info->reclaim_bgs_lock); 4971 if (ret == -ENOSPC) { 4972 failed++; 4973 } else if (ret) { 4974 if (ret == -ETXTBSY) { 4975 btrfs_warn(fs_info, 4976 "could not shrink block group %llu due to active swapfile", 4977 chunk_offset); 4978 } 4979 goto done; 4980 } 4981 } while (key.offset-- > 0); 4982 4983 if (failed && !retried) { 4984 failed = 0; 4985 retried = true; 4986 goto again; 4987 } else if (failed && retried) { 4988 ret = -ENOSPC; 4989 goto done; 4990 } 4991 4992 /* Shrinking succeeded, else we would be at "done". */ 4993 trans = btrfs_start_transaction(root, 0); 4994 if (IS_ERR(trans)) { 4995 ret = PTR_ERR(trans); 4996 goto done; 4997 } 4998 4999 mutex_lock(&fs_info->chunk_mutex); 5000 /* Clear all state bits beyond the shrunk device size */ 5001 clear_extent_bits(&device->alloc_state, new_size, (u64)-1, 5002 CHUNK_STATE_MASK); 5003 5004 btrfs_device_set_disk_total_bytes(device, new_size); 5005 if (list_empty(&device->post_commit_list)) 5006 list_add_tail(&device->post_commit_list, 5007 &trans->transaction->dev_update_list); 5008 5009 WARN_ON(diff > old_total); 5010 btrfs_set_super_total_bytes(super_copy, 5011 round_down(old_total - diff, fs_info->sectorsize)); 5012 mutex_unlock(&fs_info->chunk_mutex); 5013 5014 btrfs_reserve_chunk_metadata(trans, false); 5015 /* Now btrfs_update_device() will change the on-disk size. 
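 * (the device item's total_bytes, taken from the disk_total_bytes
 * value set just above)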
*/ 5016 ret = btrfs_update_device(trans, device); 5017 btrfs_trans_release_chunk_metadata(trans); 5018 if (ret < 0) { 5019 btrfs_abort_transaction(trans, ret); 5020 btrfs_end_transaction(trans); 5021 } else { 5022 ret = btrfs_commit_transaction(trans); 5023 } 5024 done: 5025 btrfs_free_path(path); 5026 if (ret) { 5027 mutex_lock(&fs_info->chunk_mutex); 5028 btrfs_device_set_total_bytes(device, old_size); 5029 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) 5030 device->fs_devices->total_rw_bytes += diff; 5031 atomic64_add(diff, &fs_info->free_chunk_space); 5032 mutex_unlock(&fs_info->chunk_mutex); 5033 } 5034 return ret; 5035 } 5036 5037 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, 5038 struct btrfs_key *key, 5039 struct btrfs_chunk *chunk, int item_size) 5040 { 5041 struct btrfs_super_block *super_copy = fs_info->super_copy; 5042 struct btrfs_disk_key disk_key; 5043 u32 array_size; 5044 u8 *ptr; 5045 5046 lockdep_assert_held(&fs_info->chunk_mutex); 5047 5048 array_size = btrfs_super_sys_array_size(super_copy); 5049 if (array_size + item_size + sizeof(disk_key) 5050 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 5051 return -EFBIG; 5052 5053 ptr = super_copy->sys_chunk_array + array_size; 5054 btrfs_cpu_key_to_disk(&disk_key, key); 5055 memcpy(ptr, &disk_key, sizeof(disk_key)); 5056 ptr += sizeof(disk_key); 5057 memcpy(ptr, chunk, item_size); 5058 item_size += sizeof(disk_key); 5059 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 5060 5061 return 0; 5062 } 5063 5064 /* 5065 * sort the devices in descending order by max_avail, total_avail 5066 */ 5067 static int btrfs_cmp_device_info(const void *a, const void *b) 5068 { 5069 const struct btrfs_device_info *di_a = a; 5070 const struct btrfs_device_info *di_b = b; 5071 5072 if (di_a->max_avail > di_b->max_avail) 5073 return -1; 5074 if (di_a->max_avail < di_b->max_avail) 5075 return 1; 5076 if (di_a->total_avail > di_b->total_avail) 5077 return -1; 5078 if (di_a->total_avail < di_b->total_avail) 5079 return 1; 5080 return 0; 5081 } 5082 5083 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 5084 { 5085 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) 5086 return; 5087 5088 btrfs_set_fs_incompat(info, RAID56); 5089 } 5090 5091 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) 5092 { 5093 if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4))) 5094 return; 5095 5096 btrfs_set_fs_incompat(info, RAID1C34); 5097 } 5098 5099 /* 5100 * Structure used internally for btrfs_create_chunk() function. 5101 * Wraps needed parameters. 
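 * As a worked example: a RAID10 data chunk across six devices gets
 * num_stripes == 6, sub_stripes == 2 and ncopies == 2, so it provides
 * (6 - 0) / 2 == 3 stripes' worth of logical address space.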
5102 */ 5103 struct alloc_chunk_ctl { 5104 u64 start; 5105 u64 type; 5106 /* Total number of stripes to allocate */ 5107 int num_stripes; 5108 /* sub_stripes info for map */ 5109 int sub_stripes; 5110 /* Stripes per device */ 5111 int dev_stripes; 5112 /* Maximum number of devices to use */ 5113 int devs_max; 5114 /* Minimum number of devices to use */ 5115 int devs_min; 5116 /* ndevs has to be a multiple of this */ 5117 int devs_increment; 5118 /* Number of copies */ 5119 int ncopies; 5120 /* Number of stripes worth of bytes to store parity information */ 5121 int nparity; 5122 u64 max_stripe_size; 5123 u64 max_chunk_size; 5124 u64 dev_extent_min; 5125 u64 stripe_size; 5126 u64 chunk_size; 5127 int ndevs; 5128 }; 5129 5130 static void init_alloc_chunk_ctl_policy_regular( 5131 struct btrfs_fs_devices *fs_devices, 5132 struct alloc_chunk_ctl *ctl) 5133 { 5134 u64 type = ctl->type; 5135 5136 if (type & BTRFS_BLOCK_GROUP_DATA) { 5137 ctl->max_stripe_size = SZ_1G; 5138 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; 5139 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5140 /* For larger filesystems, use larger metadata chunks */ 5141 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) 5142 ctl->max_stripe_size = SZ_1G; 5143 else 5144 ctl->max_stripe_size = SZ_256M; 5145 ctl->max_chunk_size = ctl->max_stripe_size; 5146 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5147 ctl->max_stripe_size = SZ_32M; 5148 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5149 ctl->devs_max = min_t(int, ctl->devs_max, 5150 BTRFS_MAX_DEVS_SYS_CHUNK); 5151 } else { 5152 BUG(); 5153 } 5154 5155 /* We don't want a chunk larger than 10% of writable space */ 5156 ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 5157 ctl->max_chunk_size); 5158 ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; 5159 } 5160 5161 static void init_alloc_chunk_ctl_policy_zoned( 5162 struct btrfs_fs_devices *fs_devices, 5163 struct alloc_chunk_ctl *ctl) 5164 { 5165 u64 zone_size = fs_devices->fs_info->zone_size; 5166 u64 limit; 5167 int min_num_stripes = ctl->devs_min * ctl->dev_stripes; 5168 int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies; 5169 u64 min_chunk_size = min_data_stripes * zone_size; 5170 u64 type = ctl->type; 5171 5172 ctl->max_stripe_size = zone_size; 5173 if (type & BTRFS_BLOCK_GROUP_DATA) { 5174 ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE, 5175 zone_size); 5176 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 5177 ctl->max_chunk_size = ctl->max_stripe_size; 5178 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 5179 ctl->max_chunk_size = 2 * ctl->max_stripe_size; 5180 ctl->devs_max = min_t(int, ctl->devs_max, 5181 BTRFS_MAX_DEVS_SYS_CHUNK); 5182 } else { 5183 BUG(); 5184 } 5185 5186 /* We don't want a chunk larger than 10% of writable space */ 5187 limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1), 5188 zone_size), 5189 min_chunk_size); 5190 ctl->max_chunk_size = min(limit, ctl->max_chunk_size); 5191 ctl->dev_extent_min = zone_size * ctl->dev_stripes; 5192 } 5193 5194 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, 5195 struct alloc_chunk_ctl *ctl) 5196 { 5197 int index = btrfs_bg_flags_to_raid_index(ctl->type); 5198 5199 ctl->sub_stripes = btrfs_raid_array[index].sub_stripes; 5200 ctl->dev_stripes = btrfs_raid_array[index].dev_stripes; 5201 ctl->devs_max = btrfs_raid_array[index].devs_max; 5202 if (!ctl->devs_max) 5203 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info); 5204 ctl->devs_min = btrfs_raid_array[index].devs_min; 5205 
ctl->devs_increment = btrfs_raid_array[index].devs_increment; 5206 ctl->ncopies = btrfs_raid_array[index].ncopies; 5207 ctl->nparity = btrfs_raid_array[index].nparity; 5208 ctl->ndevs = 0; 5209 5210 switch (fs_devices->chunk_alloc_policy) { 5211 case BTRFS_CHUNK_ALLOC_REGULAR: 5212 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); 5213 break; 5214 case BTRFS_CHUNK_ALLOC_ZONED: 5215 init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); 5216 break; 5217 default: 5218 BUG(); 5219 } 5220 } 5221 5222 static int gather_device_info(struct btrfs_fs_devices *fs_devices, 5223 struct alloc_chunk_ctl *ctl, 5224 struct btrfs_device_info *devices_info) 5225 { 5226 struct btrfs_fs_info *info = fs_devices->fs_info; 5227 struct btrfs_device *device; 5228 u64 total_avail; 5229 u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes; 5230 int ret; 5231 int ndevs = 0; 5232 u64 max_avail; 5233 u64 dev_offset; 5234 5235 /* 5236 * in the first pass through the devices list, we gather information 5237 * about the available holes on each device. 5238 */ 5239 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 5240 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { 5241 WARN(1, KERN_ERR 5242 "BTRFS: read-only device in alloc_list\n"); 5243 continue; 5244 } 5245 5246 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 5247 &device->dev_state) || 5248 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 5249 continue; 5250 5251 if (device->total_bytes > device->bytes_used) 5252 total_avail = device->total_bytes - device->bytes_used; 5253 else 5254 total_avail = 0; 5255 5256 /* If there is no space on this device, skip it. */ 5257 if (total_avail < ctl->dev_extent_min) 5258 continue; 5259 5260 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset, 5261 &max_avail); 5262 if (ret && ret != -ENOSPC) 5263 return ret; 5264 5265 if (ret == 0) 5266 max_avail = dev_extent_want; 5267 5268 if (max_avail < ctl->dev_extent_min) { 5269 if (btrfs_test_opt(info, ENOSPC_DEBUG)) 5270 btrfs_debug(info, 5271 "%s: devid %llu has no free space, have=%llu want=%llu", 5272 __func__, device->devid, max_avail, 5273 ctl->dev_extent_min); 5274 continue; 5275 } 5276 5277 if (ndevs == fs_devices->rw_devices) { 5278 WARN(1, "%s: found more than %llu devices\n", 5279 __func__, fs_devices->rw_devices); 5280 break; 5281 } 5282 devices_info[ndevs].dev_offset = dev_offset; 5283 devices_info[ndevs].max_avail = max_avail; 5284 devices_info[ndevs].total_avail = total_avail; 5285 devices_info[ndevs].dev = device; 5286 ++ndevs; 5287 } 5288 ctl->ndevs = ndevs; 5289 5290 /* 5291 * now sort the devices by hole size / available space 5292 */ 5293 sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 5294 btrfs_cmp_device_info, NULL); 5295 5296 return 0; 5297 } 5298 5299 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, 5300 struct btrfs_device_info *devices_info) 5301 { 5302 /* Number of stripes that count for block group size */ 5303 int data_stripes; 5304 5305 /* 5306 * The primary goal is to maximize the number of stripes, so use as 5307 * many devices as possible, even if the stripes are not maximum sized. 5308 * 5309 * The DUP profile stores more than one stripe per device, the 5310 * max_avail is the total size so we have to adjust. 
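 * For example, with DUP (dev_stripes == 2) a device whose largest hole
 * is 10GiB can host two stripes of at most 5GiB each.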
 */
	ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
				   ctl->dev_stripes);
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;

	/* This will have to be fixed for RAID1 and RAID10 over more drives */
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/*
	 * Use the number of data stripes to figure out how big this chunk is
	 * really going to be in terms of logical address space, and compare
	 * that answer with the max chunk size. If it's higher, we try to
	 * reduce stripe_size.
	 */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		/*
		 * Reduce stripe_size, round it up to a 16MB boundary again and
		 * then use it, unless it ends up being even bigger than the
		 * previous value we had already.
		 */
		ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
							data_stripes), SZ_16M),
				       ctl->stripe_size);
	}

	/* Align to BTRFS_STRIPE_LEN */
	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}

static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
				    struct btrfs_device_info *devices_info)
{
	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
	/* Number of stripes that count for block group size */
	int data_stripes;

	/*
	 * It should hold because:
	 * dev_extent_min == dev_extent_want == zone_size * dev_stripes
	 */
	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);

	ctl->stripe_size = zone_size;
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/* stripe_size is fixed in zoned filesystems. Reduce ndevs instead. */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
					     ctl->stripe_size) + ctl->nparity,
				     ctl->dev_stripes);
		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
	}

	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}

static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = fs_devices->fs_info;

	/*
	 * Round down to number of usable stripes, devs_increment can be any
	 * number so we can't use round_down() that requires power of 2, while
	 * rounddown is safe.
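	 * E.g. RAID10 has devs_increment == 2, so five usable devices are
	 * rounded down to four.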
5385 */ 5386 ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment); 5387 5388 if (ctl->ndevs < ctl->devs_min) { 5389 if (btrfs_test_opt(info, ENOSPC_DEBUG)) { 5390 btrfs_debug(info, 5391 "%s: not enough devices with free space: have=%d minimum required=%d", 5392 __func__, ctl->ndevs, ctl->devs_min); 5393 } 5394 return -ENOSPC; 5395 } 5396 5397 ctl->ndevs = min(ctl->ndevs, ctl->devs_max); 5398 5399 switch (fs_devices->chunk_alloc_policy) { 5400 case BTRFS_CHUNK_ALLOC_REGULAR: 5401 return decide_stripe_size_regular(ctl, devices_info); 5402 case BTRFS_CHUNK_ALLOC_ZONED: 5403 return decide_stripe_size_zoned(ctl, devices_info); 5404 default: 5405 BUG(); 5406 } 5407 } 5408 5409 static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, 5410 struct alloc_chunk_ctl *ctl, 5411 struct btrfs_device_info *devices_info) 5412 { 5413 struct btrfs_fs_info *info = trans->fs_info; 5414 struct map_lookup *map = NULL; 5415 struct extent_map_tree *em_tree; 5416 struct btrfs_block_group *block_group; 5417 struct extent_map *em; 5418 u64 start = ctl->start; 5419 u64 type = ctl->type; 5420 int ret; 5421 int i; 5422 int j; 5423 5424 map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); 5425 if (!map) 5426 return ERR_PTR(-ENOMEM); 5427 map->num_stripes = ctl->num_stripes; 5428 5429 for (i = 0; i < ctl->ndevs; ++i) { 5430 for (j = 0; j < ctl->dev_stripes; ++j) { 5431 int s = i * ctl->dev_stripes + j; 5432 map->stripes[s].dev = devices_info[i].dev; 5433 map->stripes[s].physical = devices_info[i].dev_offset + 5434 j * ctl->stripe_size; 5435 } 5436 } 5437 map->stripe_len = BTRFS_STRIPE_LEN; 5438 map->io_align = BTRFS_STRIPE_LEN; 5439 map->io_width = BTRFS_STRIPE_LEN; 5440 map->type = type; 5441 map->sub_stripes = ctl->sub_stripes; 5442 5443 trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); 5444 5445 em = alloc_extent_map(); 5446 if (!em) { 5447 kfree(map); 5448 return ERR_PTR(-ENOMEM); 5449 } 5450 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 5451 em->map_lookup = map; 5452 em->start = start; 5453 em->len = ctl->chunk_size; 5454 em->block_start = 0; 5455 em->block_len = em->len; 5456 em->orig_block_len = ctl->stripe_size; 5457 5458 em_tree = &info->mapping_tree; 5459 write_lock(&em_tree->lock); 5460 ret = add_extent_mapping(em_tree, em, 0); 5461 if (ret) { 5462 write_unlock(&em_tree->lock); 5463 free_extent_map(em); 5464 return ERR_PTR(ret); 5465 } 5466 write_unlock(&em_tree->lock); 5467 5468 block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); 5469 if (IS_ERR(block_group)) 5470 goto error_del_extent; 5471 5472 for (i = 0; i < map->num_stripes; i++) { 5473 struct btrfs_device *dev = map->stripes[i].dev; 5474 5475 btrfs_device_set_bytes_used(dev, 5476 dev->bytes_used + ctl->stripe_size); 5477 if (list_empty(&dev->post_commit_list)) 5478 list_add_tail(&dev->post_commit_list, 5479 &trans->transaction->dev_update_list); 5480 } 5481 5482 atomic64_sub(ctl->stripe_size * map->num_stripes, 5483 &info->free_chunk_space); 5484 5485 free_extent_map(em); 5486 check_raid56_incompat_flag(info, type); 5487 check_raid1c34_incompat_flag(info, type); 5488 5489 return block_group; 5490 5491 error_del_extent: 5492 write_lock(&em_tree->lock); 5493 remove_extent_mapping(em_tree, em); 5494 write_unlock(&em_tree->lock); 5495 5496 /* One for our allocation */ 5497 free_extent_map(em); 5498 /* One for the tree reference */ 5499 free_extent_map(em); 5500 5501 return block_group; 5502 } 5503 5504 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, 5505 u64 
type)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct btrfs_device_info *devices_info = NULL;
	struct alloc_chunk_ctl ctl;
	struct btrfs_block_group *block_group;
	int ret;

	lockdep_assert_held(&info->chunk_mutex);

	if (!alloc_profile_is_valid(type, 0)) {
		ASSERT(0);
		return ERR_PTR(-EINVAL);
	}

	if (list_empty(&fs_devices->alloc_list)) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG))
			btrfs_debug(info, "%s: no writable device", __func__);
		return ERR_PTR(-ENOSPC);
	}

	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
		btrfs_err(info, "invalid chunk type 0x%llx requested", type);
		ASSERT(0);
		return ERR_PTR(-EINVAL);
	}

	ctl.start = find_next_chunk(info);
	ctl.type = type;
	init_alloc_chunk_ctl(fs_devices, &ctl);

	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
			       GFP_NOFS);
	if (!devices_info)
		return ERR_PTR(-ENOMEM);

	ret = gather_device_info(fs_devices, &ctl, devices_info);
	if (ret < 0) {
		block_group = ERR_PTR(ret);
		goto out;
	}

	ret = decide_stripe_size(fs_devices, &ctl, devices_info);
	if (ret < 0) {
		block_group = ERR_PTR(ret);
		goto out;
	}

	block_group = create_chunk(trans, &ctl, devices_info);

out:
	kfree(devices_info);
	return block_group;
}

/*
 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to
 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating
 * system chunks.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk
 * allocation phases.
 */
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
				     struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	struct extent_map *em;
	struct map_lookup *map;
	size_t item_size;
	int i;
	int ret;

	/*
	 * We take the chunk_mutex for 2 reasons:
	 *
	 * 1) Updates and insertions in the chunk btree must be done while
	 *    holding the chunk_mutex, as well as updating the system chunk
	 *    array in the superblock. See the comment on top of
	 *    btrfs_chunk_alloc() for the details;
	 *
	 * 2) To prevent races with the final phase of a device replace
	 *    operation that replaces the device object associated with the
	 *    map's stripes, because the device object's id can change at any
	 *    time during that final phase of the device replace operation
	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab
	 *    the replaced device and then see it with an ID of
	 *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when
	 *    updating the device item, which does not exist, or persisting a
	 *    stripe of the chunk item with such an ID. Here we can't use the
	 *    device_list_mutex because our caller already has locked the
	 *    chunk_mutex, and the final phase of device replace acquires both
	 *    mutexes - first the device_list_mutex and then the chunk_mutex.
	 *    Using any of those two mutexes protects us from a concurrent
	 *    device replace.
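	 *
	 * In short, holding the chunk_mutex is enough to keep the stripe
	 * device ids stable while this function runs.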
5604 */ 5605 lockdep_assert_held(&fs_info->chunk_mutex); 5606 5607 em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); 5608 if (IS_ERR(em)) { 5609 ret = PTR_ERR(em); 5610 btrfs_abort_transaction(trans, ret); 5611 return ret; 5612 } 5613 5614 map = em->map_lookup; 5615 item_size = btrfs_chunk_item_size(map->num_stripes); 5616 5617 chunk = kzalloc(item_size, GFP_NOFS); 5618 if (!chunk) { 5619 ret = -ENOMEM; 5620 btrfs_abort_transaction(trans, ret); 5621 goto out; 5622 } 5623 5624 for (i = 0; i < map->num_stripes; i++) { 5625 struct btrfs_device *device = map->stripes[i].dev; 5626 5627 ret = btrfs_update_device(trans, device); 5628 if (ret) 5629 goto out; 5630 } 5631 5632 stripe = &chunk->stripe; 5633 for (i = 0; i < map->num_stripes; i++) { 5634 struct btrfs_device *device = map->stripes[i].dev; 5635 const u64 dev_offset = map->stripes[i].physical; 5636 5637 btrfs_set_stack_stripe_devid(stripe, device->devid); 5638 btrfs_set_stack_stripe_offset(stripe, dev_offset); 5639 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); 5640 stripe++; 5641 } 5642 5643 btrfs_set_stack_chunk_length(chunk, bg->length); 5644 btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); 5645 btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); 5646 btrfs_set_stack_chunk_type(chunk, map->type); 5647 btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); 5648 btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); 5649 btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); 5650 btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); 5651 btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); 5652 5653 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 5654 key.type = BTRFS_CHUNK_ITEM_KEY; 5655 key.offset = bg->start; 5656 5657 ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); 5658 if (ret) 5659 goto out; 5660 5661 bg->chunk_item_inserted = 1; 5662 5663 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 5664 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size); 5665 if (ret) 5666 goto out; 5667 } 5668 5669 out: 5670 kfree(chunk); 5671 free_extent_map(em); 5672 return ret; 5673 } 5674 5675 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) 5676 { 5677 struct btrfs_fs_info *fs_info = trans->fs_info; 5678 u64 alloc_profile; 5679 struct btrfs_block_group *meta_bg; 5680 struct btrfs_block_group *sys_bg; 5681 5682 /* 5683 * When adding a new device for sprouting, the seed device is read-only 5684 * so we must first allocate a metadata and a system chunk. But before 5685 * adding the block group items to the extent, device and chunk btrees, 5686 * we must first: 5687 * 5688 * 1) Create both chunks without doing any changes to the btrees, as 5689 * otherwise we would get -ENOSPC since the block groups from the 5690 * seed device are read-only; 5691 * 5692 * 2) Add the device item for the new sprout device - finishing the setup 5693 * of a new block group requires updating the device item in the chunk 5694 * btree, so it must exist when we attempt to do it. The previous step 5695 * ensures this does not fail with -ENOSPC. 5696 * 5697 * After that we can add the block group items to their btrees: 5698 * update existing device item in the chunk btree, add a new block group 5699 * item to the extent btree, add a new chunk item to the chunk btree and 5700 * finally add the new device extent items to the devices btree. 
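 * The two btrfs_create_chunk() calls below implement step 1; the
 * remaining steps are driven by our caller.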
5701 */ 5702 5703 alloc_profile = btrfs_metadata_alloc_profile(fs_info); 5704 meta_bg = btrfs_create_chunk(trans, alloc_profile); 5705 if (IS_ERR(meta_bg)) 5706 return PTR_ERR(meta_bg); 5707 5708 alloc_profile = btrfs_system_alloc_profile(fs_info); 5709 sys_bg = btrfs_create_chunk(trans, alloc_profile); 5710 if (IS_ERR(sys_bg)) 5711 return PTR_ERR(sys_bg); 5712 5713 return 0; 5714 } 5715 5716 static inline int btrfs_chunk_max_errors(struct map_lookup *map) 5717 { 5718 const int index = btrfs_bg_flags_to_raid_index(map->type); 5719 5720 return btrfs_raid_array[index].tolerated_failures; 5721 } 5722 5723 bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) 5724 { 5725 struct extent_map *em; 5726 struct map_lookup *map; 5727 int miss_ndevs = 0; 5728 int i; 5729 bool ret = true; 5730 5731 em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 5732 if (IS_ERR(em)) 5733 return false; 5734 5735 map = em->map_lookup; 5736 for (i = 0; i < map->num_stripes; i++) { 5737 if (test_bit(BTRFS_DEV_STATE_MISSING, 5738 &map->stripes[i].dev->dev_state)) { 5739 miss_ndevs++; 5740 continue; 5741 } 5742 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, 5743 &map->stripes[i].dev->dev_state)) { 5744 ret = false; 5745 goto end; 5746 } 5747 } 5748 5749 /* 5750 * If the number of missing devices is larger than max errors, we can 5751 * not write the data into that chunk successfully. 5752 */ 5753 if (miss_ndevs > btrfs_chunk_max_errors(map)) 5754 ret = false; 5755 end: 5756 free_extent_map(em); 5757 return ret; 5758 } 5759 5760 void btrfs_mapping_tree_free(struct extent_map_tree *tree) 5761 { 5762 struct extent_map *em; 5763 5764 while (1) { 5765 write_lock(&tree->lock); 5766 em = lookup_extent_mapping(tree, 0, (u64)-1); 5767 if (em) 5768 remove_extent_mapping(tree, em); 5769 write_unlock(&tree->lock); 5770 if (!em) 5771 break; 5772 /* once for us */ 5773 free_extent_map(em); 5774 /* once for the tree */ 5775 free_extent_map(em); 5776 } 5777 } 5778 5779 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) 5780 { 5781 struct extent_map *em; 5782 struct map_lookup *map; 5783 int ret; 5784 5785 em = btrfs_get_chunk_map(fs_info, logical, len); 5786 if (IS_ERR(em)) 5787 /* 5788 * We could return errors for these cases, but that could get 5789 * ugly and we'd probably do the same thing which is just not do 5790 * anything else and exit, so return 1 so the callers don't try 5791 * to use other copies. 5792 */ 5793 return 1; 5794 5795 map = em->map_lookup; 5796 if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) 5797 ret = map->num_stripes; 5798 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 5799 ret = map->sub_stripes; 5800 else if (map->type & BTRFS_BLOCK_GROUP_RAID5) 5801 ret = 2; 5802 else if (map->type & BTRFS_BLOCK_GROUP_RAID6) 5803 /* 5804 * There could be two corrupted data stripes, we need 5805 * to loop retry in order to rebuild the correct data. 5806 * 5807 * Fail a stripe at a time on every retry except the 5808 * stripe under reconstruction. 
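 * Hence the RAID6 case below reports one "copy" per stripe so the
 * retry loop can walk all of them.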
5809 */
5810 ret = map->num_stripes;
5811 else
5812 ret = 1;
5813 free_extent_map(em);
5814
5815 down_read(&fs_info->dev_replace.rwsem);
5816 if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5817 fs_info->dev_replace.tgtdev)
5818 ret++;
5819 up_read(&fs_info->dev_replace.rwsem);
5820
5821 return ret;
5822 }
5823
5824 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5825 u64 logical)
5826 {
5827 struct extent_map *em;
5828 struct map_lookup *map;
5829 unsigned long len = fs_info->sectorsize;
5830
5831 em = btrfs_get_chunk_map(fs_info, logical, len);
5832
5833 if (!WARN_ON(IS_ERR(em))) {
5834 map = em->map_lookup;
5835 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5836 len = map->stripe_len * nr_data_stripes(map);
5837 free_extent_map(em);
5838 }
5839 return len;
5840 }
5841
5842 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5843 {
5844 struct extent_map *em;
5845 struct map_lookup *map;
5846 int ret = 0;
5847
5848 em = btrfs_get_chunk_map(fs_info, logical, len);
5849
5850 if (!WARN_ON(IS_ERR(em))) {
5851 map = em->map_lookup;
5852 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5853 ret = 1;
5854 free_extent_map(em);
5855 }
5856 return ret;
5857 }
5858
5859 static int find_live_mirror(struct btrfs_fs_info *fs_info,
5860 struct map_lookup *map, int first,
5861 int dev_replace_is_ongoing)
5862 {
5863 int i;
5864 int num_stripes;
5865 int preferred_mirror;
5866 int tolerance;
5867 struct btrfs_device *srcdev;
5868
5869 ASSERT((map->type &
5870 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5871
5872 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5873 num_stripes = map->sub_stripes;
5874 else
5875 num_stripes = map->num_stripes;
5876
5877 switch (fs_info->fs_devices->read_policy) {
5878 default:
5879 /* Shouldn't happen, just warn and use pid instead of failing */
5880 btrfs_warn_rl(fs_info,
5881 "unknown read_policy type %u, reset to pid",
5882 fs_info->fs_devices->read_policy);
5883 fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
5884 fallthrough;
5885 case BTRFS_READ_POLICY_PID:
5886 preferred_mirror = first + (current->pid % num_stripes);
5887 break;
5888 }
5889
5890 if (dev_replace_is_ongoing &&
5891 fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5892 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5893 srcdev = fs_info->dev_replace.srcdev;
5894 else
5895 srcdev = NULL;
5896
5897 /*
5898 * try to avoid the drive that is the source drive for a
5899 * dev-replace procedure, only choose it if no other non-missing
5900 * mirror is available
5901 */
5902 for (tolerance = 0; tolerance < 2; tolerance++) {
5903 if (map->stripes[preferred_mirror].dev->bdev &&
5904 (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5905 return preferred_mirror;
5906 for (i = first; i < first + num_stripes; i++) {
5907 if (map->stripes[i].dev->bdev &&
5908 (tolerance || map->stripes[i].dev != srcdev))
5909 return i;
5910 }
5911 }
5912
5913 /* we couldn't find one that doesn't fail.
Just return something
5914 * and the io error handling code will clean up eventually
5915 */
5916 return preferred_mirror;
5917 }
5918
5919 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5920 static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes)
5921 {
5922 int i;
5923 int again = 1;
5924
5925 while (again) {
5926 again = 0;
5927 for (i = 0; i < num_stripes - 1; i++) {
5928 /* Swap if parity is on a smaller index */
5929 if (bioc->raid_map[i] > bioc->raid_map[i + 1]) {
5930 swap(bioc->stripes[i], bioc->stripes[i + 1]);
5931 swap(bioc->raid_map[i], bioc->raid_map[i + 1]);
5932 again = 1;
5933 }
5934 }
5935 }
5936 }
5937
5938 static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
5939 int total_stripes,
5940 int real_stripes)
5941 {
5942 struct btrfs_io_context *bioc = kzalloc(
5943 /* The size of btrfs_io_context */
5944 sizeof(struct btrfs_io_context) +
5945 /* Plus the variable array for the stripes */
5946 sizeof(struct btrfs_io_stripe) * (total_stripes) +
5947 /* Plus the variable array for the tgt dev */
5948 sizeof(int) * (real_stripes) +
5949 /*
5950 * Plus the raid_map, which includes both the tgt dev
5951 * and the stripes.
5952 */
5953 sizeof(u64) * (total_stripes),
5954 GFP_NOFS | __GFP_NOFAIL);
5955
5956 atomic_set(&bioc->error, 0);
5957 refcount_set(&bioc->refs, 1);
5958
5959 bioc->fs_info = fs_info;
5960 bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes);
5961 bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes);
5962
5963 return bioc;
5964 }
5965
5966 void btrfs_get_bioc(struct btrfs_io_context *bioc)
5967 {
5968 WARN_ON(!refcount_read(&bioc->refs));
5969 refcount_inc(&bioc->refs);
5970 }
5971
5972 void btrfs_put_bioc(struct btrfs_io_context *bioc)
5973 {
5974 if (!bioc)
5975 return;
5976 if (refcount_dec_and_test(&bioc->refs))
5977 kfree(bioc);
5978 }
5979
5980 /* Can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
5981 /*
5982 * Please note that discard won't be sent to the target device of a
5983 * device replace.
5984 */
5985 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5986 u64 logical, u64 *length_ret,
5987 struct btrfs_io_context **bioc_ret)
5988 {
5989 struct extent_map *em;
5990 struct map_lookup *map;
5991 struct btrfs_io_context *bioc;
5992 u64 length = *length_ret;
5993 u64 offset;
5994 u64 stripe_nr;
5995 u64 stripe_nr_end;
5996 u64 stripe_end_offset;
5997 u64 stripe_cnt;
5998 u64 stripe_len;
5999 u64 stripe_offset;
6000 u64 num_stripes;
6001 u32 stripe_index;
6002 u32 factor = 0;
6003 u32 sub_stripes = 0;
6004 u64 stripes_per_dev = 0;
6005 u32 remaining_stripes = 0;
6006 u32 last_stripe = 0;
6007 int ret = 0;
6008 int i;
6009
6010 /* Discard always returns a bioc.
*/ 6011 ASSERT(bioc_ret); 6012 6013 em = btrfs_get_chunk_map(fs_info, logical, length); 6014 if (IS_ERR(em)) 6015 return PTR_ERR(em); 6016 6017 map = em->map_lookup; 6018 /* we don't discard raid56 yet */ 6019 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6020 ret = -EOPNOTSUPP; 6021 goto out; 6022 } 6023 6024 offset = logical - em->start; 6025 length = min_t(u64, em->start + em->len - logical, length); 6026 *length_ret = length; 6027 6028 stripe_len = map->stripe_len; 6029 /* 6030 * stripe_nr counts the total number of stripes we have to stride 6031 * to get to this block 6032 */ 6033 stripe_nr = div64_u64(offset, stripe_len); 6034 6035 /* stripe_offset is the offset of this block in its stripe */ 6036 stripe_offset = offset - stripe_nr * stripe_len; 6037 6038 stripe_nr_end = round_up(offset + length, map->stripe_len); 6039 stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); 6040 stripe_cnt = stripe_nr_end - stripe_nr; 6041 stripe_end_offset = stripe_nr_end * map->stripe_len - 6042 (offset + length); 6043 /* 6044 * after this, stripe_nr is the number of stripes on this 6045 * device we have to walk to find the data, and stripe_index is 6046 * the number of our device in the stripe array 6047 */ 6048 num_stripes = 1; 6049 stripe_index = 0; 6050 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 6051 BTRFS_BLOCK_GROUP_RAID10)) { 6052 if (map->type & BTRFS_BLOCK_GROUP_RAID0) 6053 sub_stripes = 1; 6054 else 6055 sub_stripes = map->sub_stripes; 6056 6057 factor = map->num_stripes / sub_stripes; 6058 num_stripes = min_t(u64, map->num_stripes, 6059 sub_stripes * stripe_cnt); 6060 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6061 stripe_index *= sub_stripes; 6062 stripes_per_dev = div_u64_rem(stripe_cnt, factor, 6063 &remaining_stripes); 6064 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); 6065 last_stripe *= sub_stripes; 6066 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | 6067 BTRFS_BLOCK_GROUP_DUP)) { 6068 num_stripes = map->num_stripes; 6069 } else { 6070 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6071 &stripe_index); 6072 } 6073 6074 bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0); 6075 if (!bioc) { 6076 ret = -ENOMEM; 6077 goto out; 6078 } 6079 6080 for (i = 0; i < num_stripes; i++) { 6081 bioc->stripes[i].physical = 6082 map->stripes[stripe_index].physical + 6083 stripe_offset + stripe_nr * map->stripe_len; 6084 bioc->stripes[i].dev = map->stripes[stripe_index].dev; 6085 6086 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 6087 BTRFS_BLOCK_GROUP_RAID10)) { 6088 bioc->stripes[i].length = stripes_per_dev * 6089 map->stripe_len; 6090 6091 if (i / sub_stripes < remaining_stripes) 6092 bioc->stripes[i].length += map->stripe_len; 6093 6094 /* 6095 * Special for the first stripe and 6096 * the last stripe: 6097 * 6098 * |-------|...|-------| 6099 * |----------| 6100 * off end_off 6101 */ 6102 if (i < sub_stripes) 6103 bioc->stripes[i].length -= stripe_offset; 6104 6105 if (stripe_index >= last_stripe && 6106 stripe_index <= (last_stripe + 6107 sub_stripes - 1)) 6108 bioc->stripes[i].length -= stripe_end_offset; 6109 6110 if (i == sub_stripes - 1) 6111 stripe_offset = 0; 6112 } else { 6113 bioc->stripes[i].length = length; 6114 } 6115 6116 stripe_index++; 6117 if (stripe_index == map->num_stripes) { 6118 stripe_index = 0; 6119 stripe_nr++; 6120 } 6121 } 6122 6123 *bioc_ret = bioc; 6124 bioc->map_type = map->type; 6125 bioc->num_stripes = num_stripes; 6126 out: 6127 free_extent_map(em); 6128 return ret; 6129 } 6130 6131 /* 6132 * In dev-replace case, for 
repair case (that's the only case where the mirror
6133 * is selected explicitly when calling btrfs_map_block), blocks left of the
6134 * left cursor can also be read from the target drive.
6135 *
6136 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
6137 * array of stripes.
6138 * For READ, it also needs to be supported using the same mirror number.
6139 *
6140 * If the requested block is not left of the left cursor, EIO is returned. This
6141 * can happen because btrfs_num_copies() returns one more in the dev-replace
6142 * case.
6143 */
6144 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
6145 u64 logical, u64 length,
6146 u64 srcdev_devid, int *mirror_num,
6147 u64 *physical)
6148 {
6149 struct btrfs_io_context *bioc = NULL;
6150 int num_stripes;
6151 int index_srcdev = 0;
6152 int found = 0;
6153 u64 physical_of_found = 0;
6154 int i;
6155 int ret = 0;
6156
6157 ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
6158 logical, &length, &bioc, 0, 0);
6159 if (ret) {
6160 ASSERT(bioc == NULL);
6161 return ret;
6162 }
6163
6164 num_stripes = bioc->num_stripes;
6165 if (*mirror_num > num_stripes) {
6166 /*
6167 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
6168 * that means that the requested area is not left of the left
6169 * cursor
6170 */
6171 btrfs_put_bioc(bioc);
6172 return -EIO;
6173 }
6174
6175 /*
6176 * Process the rest of the function using the mirror_num of the source
6177 * drive. Therefore look it up first. At the end, patch the device
6178 * pointer to the one of the target drive.
6179 */
6180 for (i = 0; i < num_stripes; i++) {
6181 if (bioc->stripes[i].dev->devid != srcdev_devid)
6182 continue;
6183
6184 /*
6185 * In case of DUP, in order to keep it simple, only add the
6186 * mirror with the lowest physical address
6187 */
6188 if (found &&
6189 physical_of_found <= bioc->stripes[i].physical)
6190 continue;
6191
6192 index_srcdev = i;
6193 found = 1;
6194 physical_of_found = bioc->stripes[i].physical;
6195 }
6196
6197 btrfs_put_bioc(bioc);
6198
6199 ASSERT(found);
6200 if (!found)
6201 return -EIO;
6202
6203 *mirror_num = index_srcdev + 1;
6204 *physical = physical_of_found;
6205 return ret;
6206 }
6207
6208 static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
6209 {
6210 struct btrfs_block_group *cache;
6211 bool ret;
6212
6213 /* A non-zoned filesystem does not use the "to_copy" flag */
6214 if (!btrfs_is_zoned(fs_info))
6215 return false;
6216
6217 cache = btrfs_lookup_block_group(fs_info, logical);
6218
6219 spin_lock(&cache->lock);
6220 ret = cache->to_copy;
6221 spin_unlock(&cache->lock);
6222
6223 btrfs_put_block_group(cache);
6224 return ret;
6225 }
6226
6227 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
6228 struct btrfs_io_context **bioc_ret,
6229 struct btrfs_dev_replace *dev_replace,
6230 u64 logical,
6231 int *num_stripes_ret, int *max_errors_ret)
6232 {
6233 struct btrfs_io_context *bioc = *bioc_ret;
6234 u64 srcdev_devid = dev_replace->srcdev->devid;
6235 int tgtdev_indexes = 0;
6236 int num_stripes = *num_stripes_ret;
6237 int max_errors = *max_errors_ret;
6238 int i;
6239
6240 if (op == BTRFS_MAP_WRITE) {
6241 int index_where_to_add;
6242
6243 /*
6244 * A block group which has "to_copy" set will eventually be
6245 * copied by the dev-replace process. We can avoid cloning the IO here.
6246 */ 6247 if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) 6248 return; 6249 6250 /* 6251 * duplicate the write operations while the dev replace 6252 * procedure is running. Since the copying of the old disk to 6253 * the new disk takes place at run time while the filesystem is 6254 * mounted writable, the regular write operations to the old 6255 * disk have to be duplicated to go to the new disk as well. 6256 * 6257 * Note that device->missing is handled by the caller, and that 6258 * the write to the old disk is already set up in the stripes 6259 * array. 6260 */ 6261 index_where_to_add = num_stripes; 6262 for (i = 0; i < num_stripes; i++) { 6263 if (bioc->stripes[i].dev->devid == srcdev_devid) { 6264 /* write to new disk, too */ 6265 struct btrfs_io_stripe *new = 6266 bioc->stripes + index_where_to_add; 6267 struct btrfs_io_stripe *old = 6268 bioc->stripes + i; 6269 6270 new->physical = old->physical; 6271 new->length = old->length; 6272 new->dev = dev_replace->tgtdev; 6273 bioc->tgtdev_map[i] = index_where_to_add; 6274 index_where_to_add++; 6275 max_errors++; 6276 tgtdev_indexes++; 6277 } 6278 } 6279 num_stripes = index_where_to_add; 6280 } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { 6281 int index_srcdev = 0; 6282 int found = 0; 6283 u64 physical_of_found = 0; 6284 6285 /* 6286 * During the dev-replace procedure, the target drive can also 6287 * be used to read data in case it is needed to repair a corrupt 6288 * block elsewhere. This is possible if the requested area is 6289 * left of the left cursor. In this area, the target drive is a 6290 * full copy of the source drive. 6291 */ 6292 for (i = 0; i < num_stripes; i++) { 6293 if (bioc->stripes[i].dev->devid == srcdev_devid) { 6294 /* 6295 * In case of DUP, in order to keep it simple, 6296 * only add the mirror with the lowest physical 6297 * address 6298 */ 6299 if (found && 6300 physical_of_found <= bioc->stripes[i].physical) 6301 continue; 6302 index_srcdev = i; 6303 found = 1; 6304 physical_of_found = bioc->stripes[i].physical; 6305 } 6306 } 6307 if (found) { 6308 struct btrfs_io_stripe *tgtdev_stripe = 6309 bioc->stripes + num_stripes; 6310 6311 tgtdev_stripe->physical = physical_of_found; 6312 tgtdev_stripe->length = 6313 bioc->stripes[index_srcdev].length; 6314 tgtdev_stripe->dev = dev_replace->tgtdev; 6315 bioc->tgtdev_map[index_srcdev] = num_stripes; 6316 6317 tgtdev_indexes++; 6318 num_stripes++; 6319 } 6320 } 6321 6322 *num_stripes_ret = num_stripes; 6323 *max_errors_ret = max_errors; 6324 bioc->num_tgtdevs = tgtdev_indexes; 6325 *bioc_ret = bioc; 6326 } 6327 6328 static bool need_full_stripe(enum btrfs_map_op op) 6329 { 6330 return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); 6331 } 6332 6333 /* 6334 * Calculate the geometry of a particular (address, len) tuple. This 6335 * information is used to calculate how big a particular bio can get before it 6336 * straddles a stripe. 6337 * 6338 * @fs_info: the filesystem 6339 * @em: mapping containing the logical extent 6340 * @op: type of operation - write or read 6341 * @logical: address that we want to figure out the geometry of 6342 * @io_geom: pointer used to return values 6343 * 6344 * Returns < 0 in case a chunk for the given logical address cannot be found, 6345 * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 
6346 */
6347 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
6348 enum btrfs_map_op op, u64 logical,
6349 struct btrfs_io_geometry *io_geom)
6350 {
6351 struct map_lookup *map;
6352 u64 len;
6353 u64 offset;
6354 u64 stripe_offset;
6355 u64 stripe_nr;
6356 u64 stripe_len;
6357 u64 raid56_full_stripe_start = (u64)-1;
6358 int data_stripes;
6359
6360 ASSERT(op != BTRFS_MAP_DISCARD);
6361
6362 map = em->map_lookup;
6363 /* Offset of this logical address in the chunk */
6364 offset = logical - em->start;
6365 /* Len of a stripe in a chunk */
6366 stripe_len = map->stripe_len;
6367 /* Stripe where this block falls in */
6368 stripe_nr = div64_u64(offset, stripe_len);
6369 /* Offset of stripe in the chunk */
6370 stripe_offset = stripe_nr * stripe_len;
6371 if (offset < stripe_offset) {
6372 btrfs_crit(fs_info,
6373 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
6374 stripe_offset, offset, em->start, logical, stripe_len);
6375 return -EINVAL;
6376 }
6377
6378 /* stripe_offset is the offset of this block in its stripe */
6379 stripe_offset = offset - stripe_offset;
6380 data_stripes = nr_data_stripes(map);
6381
6382 /* Only stripe-based profiles need to check against stripe length. */
6383 if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) {
6384 u64 max_len = stripe_len - stripe_offset;
6385
6386 /*
6387 * In case of raid56, we need to know the stripe-aligned start
6388 */
6389 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6390 unsigned long full_stripe_len = stripe_len * data_stripes;
6391 raid56_full_stripe_start = offset;
6392
6393 /*
6394 * Allow a write of a full stripe, but make sure we
6395 * don't allow straddling of stripes
6396 */
6397 raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6398 full_stripe_len);
6399 raid56_full_stripe_start *= full_stripe_len;
6400
6401 /*
6402 * For writes to RAID[56], allow a full stripeset across
6403 * all disks. For other RAID types and for RAID[56]
6404 * reads, just allow a single stripe (on a single disk).
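 *
 * Worked example (illustrative numbers): with a 64K stripe_len and
 * 3 data stripes, full_stripe_len is 192K. A write at chunk offset
 * 80K gets raid56_full_stripe_start == 0, so max_len below becomes
 * 192K - 80K == 112K: the bio may extend to the end of the full
 * stripe, but never across it.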
6405 */ 6406 if (op == BTRFS_MAP_WRITE) { 6407 max_len = stripe_len * data_stripes - 6408 (offset - raid56_full_stripe_start); 6409 } 6410 } 6411 len = min_t(u64, em->len - offset, max_len); 6412 } else { 6413 len = em->len - offset; 6414 } 6415 6416 io_geom->len = len; 6417 io_geom->offset = offset; 6418 io_geom->stripe_len = stripe_len; 6419 io_geom->stripe_nr = stripe_nr; 6420 io_geom->stripe_offset = stripe_offset; 6421 io_geom->raid56_stripe_offset = raid56_full_stripe_start; 6422 6423 return 0; 6424 } 6425 6426 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, 6427 enum btrfs_map_op op, 6428 u64 logical, u64 *length, 6429 struct btrfs_io_context **bioc_ret, 6430 int mirror_num, int need_raid_map) 6431 { 6432 struct extent_map *em; 6433 struct map_lookup *map; 6434 u64 stripe_offset; 6435 u64 stripe_nr; 6436 u64 stripe_len; 6437 u32 stripe_index; 6438 int data_stripes; 6439 int i; 6440 int ret = 0; 6441 int num_stripes; 6442 int max_errors = 0; 6443 int tgtdev_indexes = 0; 6444 struct btrfs_io_context *bioc = NULL; 6445 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 6446 int dev_replace_is_ongoing = 0; 6447 int num_alloc_stripes; 6448 int patch_the_first_stripe_for_dev_replace = 0; 6449 u64 physical_to_patch_in_first_stripe = 0; 6450 u64 raid56_full_stripe_start = (u64)-1; 6451 struct btrfs_io_geometry geom; 6452 6453 ASSERT(bioc_ret); 6454 ASSERT(op != BTRFS_MAP_DISCARD); 6455 6456 em = btrfs_get_chunk_map(fs_info, logical, *length); 6457 ASSERT(!IS_ERR(em)); 6458 6459 ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom); 6460 if (ret < 0) 6461 return ret; 6462 6463 map = em->map_lookup; 6464 6465 *length = geom.len; 6466 stripe_len = geom.stripe_len; 6467 stripe_nr = geom.stripe_nr; 6468 stripe_offset = geom.stripe_offset; 6469 raid56_full_stripe_start = geom.raid56_stripe_offset; 6470 data_stripes = nr_data_stripes(map); 6471 6472 down_read(&dev_replace->rwsem); 6473 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 6474 /* 6475 * Hold the semaphore for read during the whole operation, write is 6476 * requested at commit time but must wait. 
6477 */ 6478 if (!dev_replace_is_ongoing) 6479 up_read(&dev_replace->rwsem); 6480 6481 if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && 6482 !need_full_stripe(op) && dev_replace->tgtdev != NULL) { 6483 ret = get_extra_mirror_from_replace(fs_info, logical, *length, 6484 dev_replace->srcdev->devid, 6485 &mirror_num, 6486 &physical_to_patch_in_first_stripe); 6487 if (ret) 6488 goto out; 6489 else 6490 patch_the_first_stripe_for_dev_replace = 1; 6491 } else if (mirror_num > map->num_stripes) { 6492 mirror_num = 0; 6493 } 6494 6495 num_stripes = 1; 6496 stripe_index = 0; 6497 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 6498 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6499 &stripe_index); 6500 if (!need_full_stripe(op)) 6501 mirror_num = 1; 6502 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { 6503 if (need_full_stripe(op)) 6504 num_stripes = map->num_stripes; 6505 else if (mirror_num) 6506 stripe_index = mirror_num - 1; 6507 else { 6508 stripe_index = find_live_mirror(fs_info, map, 0, 6509 dev_replace_is_ongoing); 6510 mirror_num = stripe_index + 1; 6511 } 6512 6513 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 6514 if (need_full_stripe(op)) { 6515 num_stripes = map->num_stripes; 6516 } else if (mirror_num) { 6517 stripe_index = mirror_num - 1; 6518 } else { 6519 mirror_num = 1; 6520 } 6521 6522 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 6523 u32 factor = map->num_stripes / map->sub_stripes; 6524 6525 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); 6526 stripe_index *= map->sub_stripes; 6527 6528 if (need_full_stripe(op)) 6529 num_stripes = map->sub_stripes; 6530 else if (mirror_num) 6531 stripe_index += mirror_num - 1; 6532 else { 6533 int old_stripe_index = stripe_index; 6534 stripe_index = find_live_mirror(fs_info, map, 6535 stripe_index, 6536 dev_replace_is_ongoing); 6537 mirror_num = stripe_index - old_stripe_index + 1; 6538 } 6539 6540 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 6541 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { 6542 /* push stripe_nr back to the start of the full stripe */ 6543 stripe_nr = div64_u64(raid56_full_stripe_start, 6544 stripe_len * data_stripes); 6545 6546 /* RAID[56] write or recovery. Return all stripes */ 6547 num_stripes = map->num_stripes; 6548 max_errors = nr_parity_stripes(map); 6549 6550 *length = map->stripe_len; 6551 stripe_index = 0; 6552 stripe_offset = 0; 6553 } else { 6554 /* 6555 * Mirror #0 or #1 means the original data block. 6556 * Mirror #2 is RAID5 parity block. 6557 * Mirror #3 is RAID6 Q block. 
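 *
 * Example (illustrative): on a 4-device RAID6 chunk, data_stripes is
 * 2, so mirror_num == 3 selects stripe_index = 2 + 3 - 2 = 3, the Q
 * stripe, before the rotation below spreads it across the devices.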
6558 */ 6559 stripe_nr = div_u64_rem(stripe_nr, 6560 data_stripes, &stripe_index); 6561 if (mirror_num > 1) 6562 stripe_index = data_stripes + mirror_num - 2; 6563 6564 /* We distribute the parity blocks across stripes */ 6565 div_u64_rem(stripe_nr + stripe_index, map->num_stripes, 6566 &stripe_index); 6567 if (!need_full_stripe(op) && mirror_num <= 1) 6568 mirror_num = 1; 6569 } 6570 } else { 6571 /* 6572 * after this, stripe_nr is the number of stripes on this 6573 * device we have to walk to find the data, and stripe_index is 6574 * the number of our device in the stripe array 6575 */ 6576 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, 6577 &stripe_index); 6578 mirror_num = stripe_index + 1; 6579 } 6580 if (stripe_index >= map->num_stripes) { 6581 btrfs_crit(fs_info, 6582 "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", 6583 stripe_index, map->num_stripes); 6584 ret = -EINVAL; 6585 goto out; 6586 } 6587 6588 num_alloc_stripes = num_stripes; 6589 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { 6590 if (op == BTRFS_MAP_WRITE) 6591 num_alloc_stripes <<= 1; 6592 if (op == BTRFS_MAP_GET_READ_MIRRORS) 6593 num_alloc_stripes++; 6594 tgtdev_indexes = num_stripes; 6595 } 6596 6597 bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); 6598 if (!bioc) { 6599 ret = -ENOMEM; 6600 goto out; 6601 } 6602 6603 for (i = 0; i < num_stripes; i++) { 6604 bioc->stripes[i].physical = map->stripes[stripe_index].physical + 6605 stripe_offset + stripe_nr * map->stripe_len; 6606 bioc->stripes[i].dev = map->stripes[stripe_index].dev; 6607 stripe_index++; 6608 } 6609 6610 /* Build raid_map */ 6611 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && 6612 (need_full_stripe(op) || mirror_num > 1)) { 6613 u64 tmp; 6614 unsigned rot; 6615 6616 /* Work out the disk rotation on this stripe-set */ 6617 div_u64_rem(stripe_nr, num_stripes, &rot); 6618 6619 /* Fill in the logical address of each stripe */ 6620 tmp = stripe_nr * data_stripes; 6621 for (i = 0; i < data_stripes; i++) 6622 bioc->raid_map[(i + rot) % num_stripes] = 6623 em->start + (tmp + i) * map->stripe_len; 6624 6625 bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; 6626 if (map->type & BTRFS_BLOCK_GROUP_RAID6) 6627 bioc->raid_map[(i + rot + 1) % num_stripes] = 6628 RAID6_Q_STRIPE; 6629 6630 sort_parity_stripes(bioc, num_stripes); 6631 } 6632 6633 if (need_full_stripe(op)) 6634 max_errors = btrfs_chunk_max_errors(map); 6635 6636 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && 6637 need_full_stripe(op)) { 6638 handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, 6639 &num_stripes, &max_errors); 6640 } 6641 6642 *bioc_ret = bioc; 6643 bioc->map_type = map->type; 6644 bioc->num_stripes = num_stripes; 6645 bioc->max_errors = max_errors; 6646 bioc->mirror_num = mirror_num; 6647 6648 /* 6649 * this is the case that REQ_READ && dev_replace_is_ongoing && 6650 * mirror_num == num_stripes + 1 && dev_replace target drive is 6651 * available as a mirror 6652 */ 6653 if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { 6654 WARN_ON(num_stripes > 1); 6655 bioc->stripes[0].dev = dev_replace->tgtdev; 6656 bioc->stripes[0].physical = physical_to_patch_in_first_stripe; 6657 bioc->mirror_num = map->num_stripes + 1; 6658 } 6659 out: 6660 if (dev_replace_is_ongoing) { 6661 lockdep_assert_held(&dev_replace->rwsem); 6662 /* Unlock and let waiting writers proceed */ 6663 up_read(&dev_replace->rwsem); 6664 } 6665 free_extent_map(em); 6666 return ret; 6667 } 
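/*
 * Illustrative sketch, not part of the original file: how a caller might
 * resolve one logical range to its physical stripes with btrfs_map_block()
 * (declared in volumes.h and defined just below). The helper name
 * demo_map_one_block() is hypothetical; everything else relies only on
 * declarations already visible in this file.
 */
static int __maybe_unused demo_map_one_block(struct btrfs_fs_info *fs_info,
					     u64 logical, u64 len)
{
	struct btrfs_io_context *bioc = NULL;
	u64 map_length = len;
	int i;
	int ret;

	/* mirror_num == 0 lets the read policy pick any live mirror */
	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &map_length,
			      &bioc, 0);
	if (ret)
		return ret;

	/* map_length may be shorter than len if the range straddles a stripe */
	for (i = 0; i < bioc->num_stripes; i++)
		btrfs_info(fs_info, "stripe %d: devid %llu physical %llu",
			   i, bioc->stripes[i].dev->devid,
			   bioc->stripes[i].physical);

	btrfs_put_bioc(bioc);
	return 0;
}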
6668 6669 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6670 u64 logical, u64 *length, 6671 struct btrfs_io_context **bioc_ret, int mirror_num) 6672 { 6673 if (op == BTRFS_MAP_DISCARD) 6674 return __btrfs_map_block_for_discard(fs_info, logical, 6675 length, bioc_ret); 6676 6677 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 6678 mirror_num, 0); 6679 } 6680 6681 /* For Scrub/replace */ 6682 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, 6683 u64 logical, u64 *length, 6684 struct btrfs_io_context **bioc_ret) 6685 { 6686 return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); 6687 } 6688 6689 static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio) 6690 { 6691 bio->bi_private = bioc->private; 6692 bio->bi_end_io = bioc->end_io; 6693 bio_endio(bio); 6694 6695 btrfs_put_bioc(bioc); 6696 } 6697 6698 static void btrfs_end_bio(struct bio *bio) 6699 { 6700 struct btrfs_io_context *bioc = bio->bi_private; 6701 int is_orig_bio = 0; 6702 6703 if (bio->bi_status) { 6704 atomic_inc(&bioc->error); 6705 if (bio->bi_status == BLK_STS_IOERR || 6706 bio->bi_status == BLK_STS_TARGET) { 6707 struct btrfs_device *dev = btrfs_bio(bio)->device; 6708 6709 ASSERT(dev->bdev); 6710 if (btrfs_op(bio) == BTRFS_MAP_WRITE) 6711 btrfs_dev_stat_inc_and_print(dev, 6712 BTRFS_DEV_STAT_WRITE_ERRS); 6713 else if (!(bio->bi_opf & REQ_RAHEAD)) 6714 btrfs_dev_stat_inc_and_print(dev, 6715 BTRFS_DEV_STAT_READ_ERRS); 6716 if (bio->bi_opf & REQ_PREFLUSH) 6717 btrfs_dev_stat_inc_and_print(dev, 6718 BTRFS_DEV_STAT_FLUSH_ERRS); 6719 } 6720 } 6721 6722 if (bio == bioc->orig_bio) 6723 is_orig_bio = 1; 6724 6725 btrfs_bio_counter_dec(bioc->fs_info); 6726 6727 if (atomic_dec_and_test(&bioc->stripes_pending)) { 6728 if (!is_orig_bio) { 6729 bio_put(bio); 6730 bio = bioc->orig_bio; 6731 } 6732 6733 btrfs_bio(bio)->mirror_num = bioc->mirror_num; 6734 /* only send an error to the higher layers if it is 6735 * beyond the tolerance of the btrfs bio 6736 */ 6737 if (atomic_read(&bioc->error) > bioc->max_errors) { 6738 bio->bi_status = BLK_STS_IOERR; 6739 } else { 6740 /* 6741 * this bio is actually up to date, we didn't 6742 * go over the max number of errors 6743 */ 6744 bio->bi_status = BLK_STS_OK; 6745 } 6746 6747 btrfs_end_bioc(bioc, bio); 6748 } else if (!is_orig_bio) { 6749 bio_put(bio); 6750 } 6751 } 6752 6753 static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, 6754 u64 physical, struct btrfs_device *dev) 6755 { 6756 struct btrfs_fs_info *fs_info = bioc->fs_info; 6757 6758 bio->bi_private = bioc; 6759 btrfs_bio(bio)->device = dev; 6760 bio->bi_end_io = btrfs_end_bio; 6761 bio->bi_iter.bi_sector = physical >> 9; 6762 /* 6763 * For zone append writing, bi_sector must point the beginning of the 6764 * zone 6765 */ 6766 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 6767 if (btrfs_dev_is_sequential(dev, physical)) { 6768 u64 zone_start = round_down(physical, fs_info->zone_size); 6769 6770 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; 6771 } else { 6772 bio->bi_opf &= ~REQ_OP_ZONE_APPEND; 6773 bio->bi_opf |= REQ_OP_WRITE; 6774 } 6775 } 6776 btrfs_debug_in_rcu(fs_info, 6777 "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", 6778 bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, 6779 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), 6780 dev->devid, bio->bi_iter.bi_size); 6781 bio_set_dev(bio, dev->bdev); 6782 6783 btrfs_bio_counter_inc_noblocked(fs_info); 6784 6785 
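	/*
	 * Hand the bio to the block layer. btrfsic_submit_bio() below is a
	 * thin wrapper around submit_bio() that feeds the bio through the
	 * optional integrity checker when CONFIG_BTRFS_FS_CHECK_INTEGRITY is
	 * enabled, and is equivalent to a plain submit_bio() otherwise.
	 */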
btrfsic_submit_bio(bio); 6786 } 6787 6788 static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) 6789 { 6790 atomic_inc(&bioc->error); 6791 if (atomic_dec_and_test(&bioc->stripes_pending)) { 6792 /* Should be the original bio. */ 6793 WARN_ON(bio != bioc->orig_bio); 6794 6795 btrfs_bio(bio)->mirror_num = bioc->mirror_num; 6796 bio->bi_iter.bi_sector = logical >> 9; 6797 if (atomic_read(&bioc->error) > bioc->max_errors) 6798 bio->bi_status = BLK_STS_IOERR; 6799 else 6800 bio->bi_status = BLK_STS_OK; 6801 btrfs_end_bioc(bioc, bio); 6802 } 6803 } 6804 6805 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, 6806 int mirror_num) 6807 { 6808 struct btrfs_device *dev; 6809 struct bio *first_bio = bio; 6810 u64 logical = bio->bi_iter.bi_sector << 9; 6811 u64 length = 0; 6812 u64 map_length; 6813 int ret; 6814 int dev_nr; 6815 int total_devs; 6816 struct btrfs_io_context *bioc = NULL; 6817 6818 length = bio->bi_iter.bi_size; 6819 map_length = length; 6820 6821 btrfs_bio_counter_inc_blocked(fs_info); 6822 ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, 6823 &map_length, &bioc, mirror_num, 1); 6824 if (ret) { 6825 btrfs_bio_counter_dec(fs_info); 6826 return errno_to_blk_status(ret); 6827 } 6828 6829 total_devs = bioc->num_stripes; 6830 bioc->orig_bio = first_bio; 6831 bioc->private = first_bio->bi_private; 6832 bioc->end_io = first_bio->bi_end_io; 6833 atomic_set(&bioc->stripes_pending, bioc->num_stripes); 6834 6835 if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && 6836 ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { 6837 /* In this case, map_length has been set to the length of 6838 a single stripe; not the whole write */ 6839 if (btrfs_op(bio) == BTRFS_MAP_WRITE) { 6840 ret = raid56_parity_write(bio, bioc, map_length); 6841 } else { 6842 ret = raid56_parity_recover(bio, bioc, map_length, 6843 mirror_num, 1); 6844 } 6845 6846 btrfs_bio_counter_dec(fs_info); 6847 return errno_to_blk_status(ret); 6848 } 6849 6850 if (map_length < length) { 6851 btrfs_crit(fs_info, 6852 "mapping failed logical %llu bio len %llu len %llu", 6853 logical, length, map_length); 6854 BUG(); 6855 } 6856 6857 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { 6858 dev = bioc->stripes[dev_nr].dev; 6859 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, 6860 &dev->dev_state) || 6861 (btrfs_op(first_bio) == BTRFS_MAP_WRITE && 6862 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { 6863 bioc_error(bioc, first_bio, logical); 6864 continue; 6865 } 6866 6867 if (dev_nr < total_devs - 1) 6868 bio = btrfs_bio_clone(first_bio); 6869 else 6870 bio = first_bio; 6871 6872 submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); 6873 } 6874 btrfs_bio_counter_dec(fs_info); 6875 return BLK_STS_OK; 6876 } 6877 6878 static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, 6879 const struct btrfs_fs_devices *fs_devices) 6880 { 6881 if (args->fsid == NULL) 6882 return true; 6883 if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) 6884 return true; 6885 return false; 6886 } 6887 6888 static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, 6889 const struct btrfs_device *device) 6890 { 6891 ASSERT((args->devid != (u64)-1) || args->missing); 6892 6893 if ((args->devid != (u64)-1) && device->devid != args->devid) 6894 return false; 6895 if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) 6896 return false; 6897 if (!args->missing) 6898 return true; 6899 if 
(test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && 6900 !device->bdev) 6901 return true; 6902 return false; 6903 } 6904 6905 /* 6906 * Find a device specified by @devid or @uuid in the list of @fs_devices, or 6907 * return NULL. 6908 * 6909 * If devid and uuid are both specified, the match must be exact, otherwise 6910 * only devid is used. 6911 */ 6912 struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, 6913 const struct btrfs_dev_lookup_args *args) 6914 { 6915 struct btrfs_device *device; 6916 struct btrfs_fs_devices *seed_devs; 6917 6918 if (dev_args_match_fs_devices(args, fs_devices)) { 6919 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6920 if (dev_args_match_device(args, device)) 6921 return device; 6922 } 6923 } 6924 6925 list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { 6926 if (!dev_args_match_fs_devices(args, seed_devs)) 6927 continue; 6928 list_for_each_entry(device, &seed_devs->devices, dev_list) { 6929 if (dev_args_match_device(args, device)) 6930 return device; 6931 } 6932 } 6933 6934 return NULL; 6935 } 6936 6937 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, 6938 u64 devid, u8 *dev_uuid) 6939 { 6940 struct btrfs_device *device; 6941 unsigned int nofs_flag; 6942 6943 /* 6944 * We call this under the chunk_mutex, so we want to use NOFS for this 6945 * allocation, however we don't want to change btrfs_alloc_device() to 6946 * always do NOFS because we use it in a lot of other GFP_KERNEL safe 6947 * places. 6948 */ 6949 nofs_flag = memalloc_nofs_save(); 6950 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 6951 memalloc_nofs_restore(nofs_flag); 6952 if (IS_ERR(device)) 6953 return device; 6954 6955 list_add(&device->dev_list, &fs_devices->devices); 6956 device->fs_devices = fs_devices; 6957 fs_devices->num_devices++; 6958 6959 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); 6960 fs_devices->missing_devices++; 6961 6962 return device; 6963 } 6964 6965 /** 6966 * btrfs_alloc_device - allocate struct btrfs_device 6967 * @fs_info: used only for generating a new devid, can be NULL if 6968 * devid is provided (i.e. @devid != NULL). 6969 * @devid: a pointer to devid for this device. If NULL a new devid 6970 * is generated. 6971 * @uuid: a pointer to UUID for this device. If NULL a new UUID 6972 * is generated. 6973 * 6974 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() 6975 * on error. Returned struct is not linked onto any lists and must be 6976 * destroyed with btrfs_free_device. 
6977 */
6978 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6979 const u64 *devid,
6980 const u8 *uuid)
6981 {
6982 struct btrfs_device *dev;
6983 u64 tmp;
6984
6985 if (WARN_ON(!devid && !fs_info))
6986 return ERR_PTR(-EINVAL);
6987
6988 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
6989 if (!dev)
6990 return ERR_PTR(-ENOMEM);
6991
6992 /*
6993 * Preallocate a bio that's always going to be used for flushing device
6994 * barriers and matches the device lifespan
6995 */
6996 dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
6997 if (!dev->flush_bio) {
6998 kfree(dev);
6999 return ERR_PTR(-ENOMEM);
7000 }
7001
7002 INIT_LIST_HEAD(&dev->dev_list);
7003 INIT_LIST_HEAD(&dev->dev_alloc_list);
7004 INIT_LIST_HEAD(&dev->post_commit_list);
7005
7006 atomic_set(&dev->dev_stats_ccnt, 0);
7007 btrfs_device_data_ordered_init(dev);
7008 extent_io_tree_init(fs_info, &dev->alloc_state,
7009 IO_TREE_DEVICE_ALLOC_STATE, NULL);
7010
7011 if (devid)
7012 tmp = *devid;
7013 else {
7014 int ret;
7015
7016 ret = find_next_devid(fs_info, &tmp);
7017 if (ret) {
7018 btrfs_free_device(dev);
7019 return ERR_PTR(ret);
7020 }
7021 }
7022 dev->devid = tmp;
7023
7024 if (uuid)
7025 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
7026 else
7027 generate_random_uuid(dev->uuid);
7028
7029 return dev;
7030 }
7031
7032 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
7033 u64 devid, u8 *uuid, bool error)
7034 {
7035 if (error)
7036 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
7037 devid, uuid);
7038 else
7039 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
7040 devid, uuid);
7041 }
7042
7043 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
7044 {
7045 const int data_stripes = calc_data_stripes(type, num_stripes);
7046
7047 return div_u64(chunk_len, data_stripes);
7048 }
7049
7050 #if BITS_PER_LONG == 32
7051 /*
7052 * Due to the page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
7053 * can't be accessed on 32bit systems.
7054 *
7055 * This function does a mount-time check to reject the fs if it already has
7056 * a metadata chunk beyond that limit.
7057 */
7058 static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
7059 u64 logical, u64 length, u64 type)
7060 {
7061 if (!(type & BTRFS_BLOCK_GROUP_METADATA))
7062 return 0;
7063
7064 if (logical + length < MAX_LFS_FILESIZE)
7065 return 0;
7066
7067 btrfs_err_32bit_limit(fs_info);
7068 return -EOVERFLOW;
7069 }
7070
7071 /*
7072 * This is to give early warning for any metadata chunk reaching
7073 * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
7074 * Although we can still access the metadata, it's not going to be possible
7075 * once the limit is reached.
7076 */ 7077 static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, 7078 u64 logical, u64 length, u64 type) 7079 { 7080 if (!(type & BTRFS_BLOCK_GROUP_METADATA)) 7081 return; 7082 7083 if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) 7084 return; 7085 7086 btrfs_warn_32bit_limit(fs_info); 7087 } 7088 #endif 7089 7090 static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info, 7091 u64 devid, u8 *uuid) 7092 { 7093 struct btrfs_device *dev; 7094 7095 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7096 btrfs_report_missing_device(fs_info, devid, uuid, true); 7097 return ERR_PTR(-ENOENT); 7098 } 7099 7100 dev = add_missing_dev(fs_info->fs_devices, devid, uuid); 7101 if (IS_ERR(dev)) { 7102 btrfs_err(fs_info, "failed to init missing device %llu: %ld", 7103 devid, PTR_ERR(dev)); 7104 return dev; 7105 } 7106 btrfs_report_missing_device(fs_info, devid, uuid, false); 7107 7108 return dev; 7109 } 7110 7111 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, 7112 struct btrfs_chunk *chunk) 7113 { 7114 BTRFS_DEV_LOOKUP_ARGS(args); 7115 struct btrfs_fs_info *fs_info = leaf->fs_info; 7116 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7117 struct map_lookup *map; 7118 struct extent_map *em; 7119 u64 logical; 7120 u64 length; 7121 u64 devid; 7122 u64 type; 7123 u8 uuid[BTRFS_UUID_SIZE]; 7124 int num_stripes; 7125 int ret; 7126 int i; 7127 7128 logical = key->offset; 7129 length = btrfs_chunk_length(leaf, chunk); 7130 type = btrfs_chunk_type(leaf, chunk); 7131 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 7132 7133 #if BITS_PER_LONG == 32 7134 ret = check_32bit_meta_chunk(fs_info, logical, length, type); 7135 if (ret < 0) 7136 return ret; 7137 warn_32bit_meta_chunk(fs_info, logical, length, type); 7138 #endif 7139 7140 /* 7141 * Only need to verify chunk item if we're reading from sys chunk array, 7142 * as chunk item in tree block is already verified by tree-checker. 7143 */ 7144 if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { 7145 ret = btrfs_check_chunk_valid(leaf, chunk, logical); 7146 if (ret) 7147 return ret; 7148 } 7149 7150 read_lock(&map_tree->lock); 7151 em = lookup_extent_mapping(map_tree, logical, 1); 7152 read_unlock(&map_tree->lock); 7153 7154 /* already mapped? 
*/ 7155 if (em && em->start <= logical && em->start + em->len > logical) { 7156 free_extent_map(em); 7157 return 0; 7158 } else if (em) { 7159 free_extent_map(em); 7160 } 7161 7162 em = alloc_extent_map(); 7163 if (!em) 7164 return -ENOMEM; 7165 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 7166 if (!map) { 7167 free_extent_map(em); 7168 return -ENOMEM; 7169 } 7170 7171 set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); 7172 em->map_lookup = map; 7173 em->start = logical; 7174 em->len = length; 7175 em->orig_start = 0; 7176 em->block_start = 0; 7177 em->block_len = em->len; 7178 7179 map->num_stripes = num_stripes; 7180 map->io_width = btrfs_chunk_io_width(leaf, chunk); 7181 map->io_align = btrfs_chunk_io_align(leaf, chunk); 7182 map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); 7183 map->type = type; 7184 map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); 7185 map->verified_stripes = 0; 7186 em->orig_block_len = calc_stripe_length(type, em->len, 7187 map->num_stripes); 7188 for (i = 0; i < num_stripes; i++) { 7189 map->stripes[i].physical = 7190 btrfs_stripe_offset_nr(leaf, chunk, i); 7191 devid = btrfs_stripe_devid_nr(leaf, chunk, i); 7192 args.devid = devid; 7193 read_extent_buffer(leaf, uuid, (unsigned long) 7194 btrfs_stripe_dev_uuid_nr(chunk, i), 7195 BTRFS_UUID_SIZE); 7196 args.uuid = uuid; 7197 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); 7198 if (!map->stripes[i].dev) { 7199 map->stripes[i].dev = handle_missing_device(fs_info, 7200 devid, uuid); 7201 if (IS_ERR(map->stripes[i].dev)) { 7202 free_extent_map(em); 7203 return PTR_ERR(map->stripes[i].dev); 7204 } 7205 } 7206 7207 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 7208 &(map->stripes[i].dev->dev_state)); 7209 } 7210 7211 write_lock(&map_tree->lock); 7212 ret = add_extent_mapping(map_tree, em, 0); 7213 write_unlock(&map_tree->lock); 7214 if (ret < 0) { 7215 btrfs_err(fs_info, 7216 "failed to add chunk map, start=%llu len=%llu: %d", 7217 em->start, em->len, ret); 7218 } 7219 free_extent_map(em); 7220 7221 return ret; 7222 } 7223 7224 static void fill_device_from_item(struct extent_buffer *leaf, 7225 struct btrfs_dev_item *dev_item, 7226 struct btrfs_device *device) 7227 { 7228 unsigned long ptr; 7229 7230 device->devid = btrfs_device_id(leaf, dev_item); 7231 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 7232 device->total_bytes = device->disk_total_bytes; 7233 device->commit_total_bytes = device->disk_total_bytes; 7234 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 7235 device->commit_bytes_used = device->bytes_used; 7236 device->type = btrfs_device_type(leaf, dev_item); 7237 device->io_align = btrfs_device_io_align(leaf, dev_item); 7238 device->io_width = btrfs_device_io_width(leaf, dev_item); 7239 device->sector_size = btrfs_device_sector_size(leaf, dev_item); 7240 WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID); 7241 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); 7242 7243 ptr = btrfs_device_uuid(dev_item); 7244 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 7245 } 7246 7247 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, 7248 u8 *fsid) 7249 { 7250 struct btrfs_fs_devices *fs_devices; 7251 int ret; 7252 7253 lockdep_assert_held(&uuid_mutex); 7254 ASSERT(fsid); 7255 7256 /* This will match only for multi-device seed fs */ 7257 list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) 7258 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) 7259 return fs_devices; 7260 7261 
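	/*
	 * Not an already-opened seed filesystem, so fall back to the global
	 * list of scanned filesystems. A miss there means the seed device has
	 * not been scanned; as handled below, that is tolerated only for
	 * DEGRADED mounts, with a stub fs_devices standing in for it.
	 */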
7262 fs_devices = find_fsid(fsid, NULL); 7263 if (!fs_devices) { 7264 if (!btrfs_test_opt(fs_info, DEGRADED)) 7265 return ERR_PTR(-ENOENT); 7266 7267 fs_devices = alloc_fs_devices(fsid, NULL); 7268 if (IS_ERR(fs_devices)) 7269 return fs_devices; 7270 7271 fs_devices->seeding = true; 7272 fs_devices->opened = 1; 7273 return fs_devices; 7274 } 7275 7276 /* 7277 * Upon first call for a seed fs fsid, just create a private copy of the 7278 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list 7279 */ 7280 fs_devices = clone_fs_devices(fs_devices); 7281 if (IS_ERR(fs_devices)) 7282 return fs_devices; 7283 7284 ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); 7285 if (ret) { 7286 free_fs_devices(fs_devices); 7287 return ERR_PTR(ret); 7288 } 7289 7290 if (!fs_devices->seeding) { 7291 close_fs_devices(fs_devices); 7292 free_fs_devices(fs_devices); 7293 return ERR_PTR(-EINVAL); 7294 } 7295 7296 list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); 7297 7298 return fs_devices; 7299 } 7300 7301 static int read_one_dev(struct extent_buffer *leaf, 7302 struct btrfs_dev_item *dev_item) 7303 { 7304 BTRFS_DEV_LOOKUP_ARGS(args); 7305 struct btrfs_fs_info *fs_info = leaf->fs_info; 7306 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7307 struct btrfs_device *device; 7308 u64 devid; 7309 int ret; 7310 u8 fs_uuid[BTRFS_FSID_SIZE]; 7311 u8 dev_uuid[BTRFS_UUID_SIZE]; 7312 7313 devid = args.devid = btrfs_device_id(leaf, dev_item); 7314 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), 7315 BTRFS_UUID_SIZE); 7316 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), 7317 BTRFS_FSID_SIZE); 7318 args.uuid = dev_uuid; 7319 args.fsid = fs_uuid; 7320 7321 if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { 7322 fs_devices = open_seed_devices(fs_info, fs_uuid); 7323 if (IS_ERR(fs_devices)) 7324 return PTR_ERR(fs_devices); 7325 } 7326 7327 device = btrfs_find_device(fs_info->fs_devices, &args); 7328 if (!device) { 7329 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7330 btrfs_report_missing_device(fs_info, devid, 7331 dev_uuid, true); 7332 return -ENOENT; 7333 } 7334 7335 device = add_missing_dev(fs_devices, devid, dev_uuid); 7336 if (IS_ERR(device)) { 7337 btrfs_err(fs_info, 7338 "failed to add missing dev %llu: %ld", 7339 devid, PTR_ERR(device)); 7340 return PTR_ERR(device); 7341 } 7342 btrfs_report_missing_device(fs_info, devid, dev_uuid, false); 7343 } else { 7344 if (!device->bdev) { 7345 if (!btrfs_test_opt(fs_info, DEGRADED)) { 7346 btrfs_report_missing_device(fs_info, 7347 devid, dev_uuid, true); 7348 return -ENOENT; 7349 } 7350 btrfs_report_missing_device(fs_info, devid, 7351 dev_uuid, false); 7352 } 7353 7354 if (!device->bdev && 7355 !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { 7356 /* 7357 * this happens when a device that was properly setup 7358 * in the device info lists suddenly goes bad. 
7359 * device->bdev is NULL, and so we have to set
7360 * device->missing to one here
7361 */
7362 device->fs_devices->missing_devices++;
7363 set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
7364 }
7365
7366 /* Move the device to its own fs_devices */
7367 if (device->fs_devices != fs_devices) {
7368 ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
7369 &device->dev_state));
7370
7371 list_move(&device->dev_list, &fs_devices->devices);
7372 device->fs_devices->num_devices--;
7373 fs_devices->num_devices++;
7374
7375 device->fs_devices->missing_devices--;
7376 fs_devices->missing_devices++;
7377
7378 device->fs_devices = fs_devices;
7379 }
7380 }
7381
7382 if (device->fs_devices != fs_info->fs_devices) {
7383 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
7384 if (device->generation !=
7385 btrfs_device_generation(leaf, dev_item))
7386 return -EINVAL;
7387 }
7388
7389 fill_device_from_item(leaf, dev_item, device);
7390 if (device->bdev) {
7391 u64 max_total_bytes = bdev_nr_bytes(device->bdev);
7392
7393 if (device->total_bytes > max_total_bytes) {
7394 btrfs_err(fs_info,
7395 "device total_bytes should be at most %llu but found %llu",
7396 max_total_bytes, device->total_bytes);
7397 return -EINVAL;
7398 }
7399 }
7400 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
7401 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
7402 !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
7403 device->fs_devices->total_rw_bytes += device->total_bytes;
7404 atomic64_add(device->total_bytes - device->bytes_used,
7405 &fs_info->free_chunk_space);
7406 }
7407 ret = 0;
7408 return ret;
7409 }
7410
7411 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
7412 {
7413 struct btrfs_root *root = fs_info->tree_root;
7414 struct btrfs_super_block *super_copy = fs_info->super_copy;
7415 struct extent_buffer *sb;
7416 struct btrfs_disk_key *disk_key;
7417 struct btrfs_chunk *chunk;
7418 u8 *array_ptr;
7419 unsigned long sb_array_offset;
7420 int ret = 0;
7421 u32 num_stripes;
7422 u32 array_size;
7423 u32 len = 0;
7424 u32 cur_offset;
7425 u64 type;
7426 struct btrfs_key key;
7427
7428 ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7429 /*
7430 * This will create an extent buffer of nodesize, superblock size is
7431 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
7432 * overallocate but we can keep it as-is, only the first page is used.
7433 */
7434 sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
7435 root->root_key.objectid, 0);
7436 if (IS_ERR(sb))
7437 return PTR_ERR(sb);
7438 set_extent_buffer_uptodate(sb);
7439 /*
7440 * The sb extent buffer is artificial and just used to read the system array.
7441 * The set_extent_buffer_uptodate() call does not properly mark all its
7442 * pages up-to-date when the page is larger: extent does not cover the
7443 * whole page and consequently check_page_uptodate does not find all
7444 * the page's extents up-to-date (the hole beyond sb),
7445 * write_extent_buffer then triggers a WARN_ON.
7446 *
7447 * Regular short extents go through the mark_extent_buffer_dirty/writeback
7448 * cycle, but sb spans only this function. Add an explicit SetPageUptodate
7449 * call to silence the warning e.g. on PowerPC 64.
7450 */ 7451 if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) 7452 SetPageUptodate(sb->pages[0]); 7453 7454 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 7455 array_size = btrfs_super_sys_array_size(super_copy); 7456 7457 array_ptr = super_copy->sys_chunk_array; 7458 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); 7459 cur_offset = 0; 7460 7461 while (cur_offset < array_size) { 7462 disk_key = (struct btrfs_disk_key *)array_ptr; 7463 len = sizeof(*disk_key); 7464 if (cur_offset + len > array_size) 7465 goto out_short_read; 7466 7467 btrfs_disk_key_to_cpu(&key, disk_key); 7468 7469 array_ptr += len; 7470 sb_array_offset += len; 7471 cur_offset += len; 7472 7473 if (key.type != BTRFS_CHUNK_ITEM_KEY) { 7474 btrfs_err(fs_info, 7475 "unexpected item type %u in sys_array at offset %u", 7476 (u32)key.type, cur_offset); 7477 ret = -EIO; 7478 break; 7479 } 7480 7481 chunk = (struct btrfs_chunk *)sb_array_offset; 7482 /* 7483 * At least one btrfs_chunk with one stripe must be present, 7484 * exact stripe count check comes afterwards 7485 */ 7486 len = btrfs_chunk_item_size(1); 7487 if (cur_offset + len > array_size) 7488 goto out_short_read; 7489 7490 num_stripes = btrfs_chunk_num_stripes(sb, chunk); 7491 if (!num_stripes) { 7492 btrfs_err(fs_info, 7493 "invalid number of stripes %u in sys_array at offset %u", 7494 num_stripes, cur_offset); 7495 ret = -EIO; 7496 break; 7497 } 7498 7499 type = btrfs_chunk_type(sb, chunk); 7500 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { 7501 btrfs_err(fs_info, 7502 "invalid chunk type %llu in sys_array at offset %u", 7503 type, cur_offset); 7504 ret = -EIO; 7505 break; 7506 } 7507 7508 len = btrfs_chunk_item_size(num_stripes); 7509 if (cur_offset + len > array_size) 7510 goto out_short_read; 7511 7512 ret = read_one_chunk(&key, sb, chunk); 7513 if (ret) 7514 break; 7515 7516 array_ptr += len; 7517 sb_array_offset += len; 7518 cur_offset += len; 7519 } 7520 clear_extent_buffer_uptodate(sb); 7521 free_extent_buffer_stale(sb); 7522 return ret; 7523 7524 out_short_read: 7525 btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u", 7526 len, cur_offset); 7527 clear_extent_buffer_uptodate(sb); 7528 free_extent_buffer_stale(sb); 7529 return -EIO; 7530 } 7531 7532 /* 7533 * Check if all chunks in the fs are OK for read-write degraded mount 7534 * 7535 * If the @failing_dev is specified, it's accounted as missing. 7536 * 7537 * Return true if all chunks meet the minimal RW mount requirements. 7538 * Return false if any chunk doesn't meet the minimal RW mount requirements. 7539 */ 7540 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, 7541 struct btrfs_device *failing_dev) 7542 { 7543 struct extent_map_tree *map_tree = &fs_info->mapping_tree; 7544 struct extent_map *em; 7545 u64 next_start = 0; 7546 bool ret = true; 7547 7548 read_lock(&map_tree->lock); 7549 em = lookup_extent_mapping(map_tree, 0, (u64)-1); 7550 read_unlock(&map_tree->lock); 7551 /* No chunk at all? 
Return false anyway */
7552 if (!em) {
7553 ret = false;
7554 goto out;
7555 }
7556 while (em) {
7557 struct map_lookup *map;
7558 int missing = 0;
7559 int max_tolerated;
7560 int i;
7561
7562 map = em->map_lookup;
7563 max_tolerated =
7564 btrfs_get_num_tolerated_disk_barrier_failures(
7565 map->type);
7566 for (i = 0; i < map->num_stripes; i++) {
7567 struct btrfs_device *dev = map->stripes[i].dev;
7568
7569 if (!dev || !dev->bdev ||
7570 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7571 dev->last_flush_error)
7572 missing++;
7573 else if (failing_dev && failing_dev == dev)
7574 missing++;
7575 }
7576 if (missing > max_tolerated) {
7577 if (!failing_dev)
7578 btrfs_warn(fs_info,
7579 "chunk %llu missing %d devices, max tolerance is %d for writable mount",
7580 em->start, missing, max_tolerated);
7581 free_extent_map(em);
7582 ret = false;
7583 goto out;
7584 }
7585 next_start = extent_map_end(em);
7586 free_extent_map(em);
7587
7588 read_lock(&map_tree->lock);
7589 em = lookup_extent_mapping(map_tree, next_start,
7590 (u64)(-1) - next_start);
7591 read_unlock(&map_tree->lock);
7592 }
7593 out:
7594 return ret;
7595 }
7596
7597 static void readahead_tree_node_children(struct extent_buffer *node)
7598 {
7599 int i;
7600 const int nr_items = btrfs_header_nritems(node);
7601
7602 for (i = 0; i < nr_items; i++)
7603 btrfs_readahead_node_child(node, i);
7604 }
7605
7606 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7607 {
7608 struct btrfs_root *root = fs_info->chunk_root;
7609 struct btrfs_path *path;
7610 struct extent_buffer *leaf;
7611 struct btrfs_key key;
7612 struct btrfs_key found_key;
7613 int ret;
7614 int slot;
7615 u64 total_dev = 0;
7616 u64 last_ra_node = 0;
7617
7618 path = btrfs_alloc_path();
7619 if (!path)
7620 return -ENOMEM;
7621
7622 /*
7623 * uuid_mutex is needed only if we are mounting a sprout FS,
7624 * otherwise we don't need it.
7625 */
7626 mutex_lock(&uuid_mutex);
7627
7628 /*
7629 * It is possible for mount and umount to race in such a way that
7630 * we execute this code path, but open_fs_devices failed to clear
7631 * total_rw_bytes. We certainly want it cleared before reading the
7632 * device items, so clear it here.
7633 */
7634 fs_info->fs_devices->total_rw_bytes = 0;
7635
7636 /*
7637 * Lockdep complains about a possible circular locking dependency between
7638 * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
7639 * used for freeze protection of a fs (struct super_block.s_writers),
7640 * which we take when starting a transaction, and extent buffers of the
7641 * chunk tree if we call read_one_dev() while holding a lock on an
7642 * extent buffer of the chunk tree. Since we are mounting the filesystem
7643 * and at this point there can't be any concurrent task modifying the
7644 * chunk tree, to keep it simple, just skip locking on the chunk tree.
7645 */
7646 ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
7647 path->skip_locking = 1;
7648
7649 /*
7650 * Read all device items, and then all the chunk items. All
7651 * device items are found before any chunk item (their object id
7652 * is smaller than the lowest possible object id for a chunk
7653 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
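 * For reference: BTRFS_DEV_ITEMS_OBJECTID is 1 and
 * BTRFS_FIRST_CHUNK_TREE_OBJECTID is 256, so the forward search from
 * key (1, 0, 0) below visits every device item before the first chunk
 * item.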
static void readahead_tree_node_children(struct extent_buffer *node)
{
	int i;
	const int nr_items = btrfs_header_nritems(node);

	for (i = 0; i < nr_items; i++)
		btrfs_readahead_node_child(node, i);
}

int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
	u64 total_dev = 0;
	u64 last_ra_node = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * The uuid_mutex is needed only when we are mounting a sprout FS,
	 * otherwise it is not required.
	 */
	mutex_lock(&uuid_mutex);

	/*
	 * It is possible for mount and umount to race in such a way that
	 * we execute this code path, but open_fs_devices failed to clear
	 * total_rw_bytes. We certainly want it cleared before reading the
	 * device items, so clear it here.
	 */
	fs_info->fs_devices->total_rw_bytes = 0;

	/*
	 * Lockdep complains about a possible circular locking dependency
	 * between a disk's open_mutex (struct gendisk.open_mutex), the rw
	 * semaphores used for freeze protection of a fs
	 * (struct super_block.s_writers), which we take when starting a
	 * transaction, and extent buffers of the chunk tree if we call
	 * read_one_dev() while holding a lock on an extent buffer of the
	 * chunk tree. Since we are mounting the filesystem and at this point
	 * there can't be any concurrent task modifying the chunk tree, to
	 * keep it simple, just skip locking on the chunk tree.
	 */
	ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
	path->skip_locking = 1;

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;
	while (1) {
		struct extent_buffer *node;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto error;
			break;
		}
		node = path->nodes[1];
		if (node) {
			if (last_ra_node != node->start) {
				readahead_tree_node_children(node);
				last_ra_node = node->start;
			}
		}
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;

			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
			ret = read_one_dev(leaf, dev_item);
			if (ret)
				goto error;
			total_dev++;
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;

			/*
			 * We are only called at mount time, so no need to take
			 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
			 * we always lock first fs_info->chunk_mutex before
			 * acquiring any locks on the chunk tree. This is a
			 * requirement for chunk allocation, see the comment on
			 * top of btrfs_chunk_alloc() for details.
			 */
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
			ret = read_one_chunk(&found_key, leaf, chunk);
			if (ret)
				goto error;
		}
		path->slots[0]++;
	}

	/*
	 * After loading the chunk tree, we've got all device information,
	 * so do another round of validation checks.
	 */
	if (total_dev != fs_info->fs_devices->total_devices) {
		btrfs_err(fs_info,
	"super_num_devices %llu mismatch with num_devices %llu found here",
			  btrfs_super_num_devices(fs_info->super_copy),
			  total_dev);
		ret = -EINVAL;
		goto error;
	}
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
		ret = -EINVAL;
		goto error;
	}
	ret = 0;
error:
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
	return ret;
}
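
/*
 * Illustrative sketch, not built: the single-pass walk above relies on key
 * ordering in the chunk tree. All device items live under
 * BTRFS_DEV_ITEMS_OBJECTID, which is smaller than
 * BTRFS_FIRST_CHUNK_TREE_OBJECTID, so a forward search visits every device
 * item before the first chunk item. A build-time check of that assumption
 * (assuming static_assert from linux/build_bug.h is usable here) could be:
 */
#if 0
static_assert(BTRFS_DEV_ITEMS_OBJECTID < BTRFS_FIRST_CHUNK_TREE_OBJECTID,
	      "device items must sort before chunk items");
#endif
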
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;

	fs_devices->fs_info = fs_info;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list)
		device->fs_info = fs_info;

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list)
			device->fs_info = fs_info;

		seed_devs->fs_info = fs_info;
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}

static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
				 const struct btrfs_dev_stats_item *ptr,
				 int index)
{
	u64 val;

	read_extent_buffer(eb, &val,
			   offsetof(struct btrfs_dev_stats_item, values) +
			    ((unsigned long)ptr) + (index * sizeof(u64)),
			   sizeof(val));
	return val;
}

static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
				      struct btrfs_dev_stats_item *ptr,
				      int index, u64 val)
{
	write_extent_buffer(eb, &val,
			    offsetof(struct btrfs_dev_stats_item, values) +
			     ((unsigned long)ptr) + (index * sizeof(u64)),
			    sizeof(val));
}

static int btrfs_device_init_dev_stats(struct btrfs_device *device,
				       struct btrfs_path *path)
{
	struct btrfs_dev_stats_item *ptr;
	struct extent_buffer *eb;
	struct btrfs_key key;
	int item_size;
	int i, ret, slot;

	if (!device->fs_info->dev_root)
		return 0;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;
	ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
	if (ret) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			btrfs_dev_stat_set(device, i, 0);
		device->dev_stats_valid = 1;
		btrfs_release_path(path);
		return ret < 0 ? ret : 0;
	}
	slot = path->slots[0];
	eb = path->nodes[0];
	item_size = btrfs_item_size(eb, slot);

	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
		if (item_size >= (1 + i) * sizeof(__le64))
			btrfs_dev_stat_set(device, i,
					   btrfs_dev_stats_value(eb, ptr, i));
		else
			btrfs_dev_stat_set(device, i, 0);
	}

	device->dev_stats_valid = 1;
	btrfs_dev_stat_print_on_load(device);
	btrfs_release_path(path);

	return 0;
}

int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		ret = btrfs_device_init_dev_stats(device, path);
		if (ret)
			goto out;
	}
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
			ret = btrfs_device_init_dev_stats(device, path);
			if (ret)
				goto out;
		}
	}
out:
	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_free_path(path);
	return ret;
}
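
/*
 * Illustrative sketch, not built: a dev_stats item on disk is just an array
 * of __le64 counters, which is why btrfs_device_init_dev_stats() above
 * tolerates items written by older kernels that knew fewer counters. A
 * hypothetical helper computing how many counters an item carries:
 */
#if 0
static int dev_stats_item_nr_values(const struct extent_buffer *eb, int slot)
{
	return btrfs_item_size(eb, slot) / sizeof(__le64);
}
#endif
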
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_device *device)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			ret, rcu_str_deref(device->name));
		goto out;
	}

	if (ret == 0 &&
	    btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
		/* need to delete the old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
	btrfs_mark_buffer_dirty(eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Called from commit_transaction. Writes all changed device stats to disk.
 */
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int stats_cnt;
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
			continue;

		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values, which
		 * requires reading the in-memory counters. Such control
		 * dependencies require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic in
		 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
		 * barrier implied by atomic_xchg in
		 * btrfs_dev_stats_read_and_reset.
		 */
		smp_rmb();

		ret = update_dev_stat_item(trans, device);
		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);
	btrfs_dev_stat_print_on_error(dev);
}

static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
	       rcu_str_deref(dev->name),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
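
/*
 * Illustrative sketch, not built: the counter/flush pairing used above.
 * Writers bump dev_stats_ccnt after updating a counter (with a write-side
 * barrier), and btrfs_run_dev_stats() reads the count before reading the
 * counters, ordered by smp_rmb(). The reader side, in hypothetical isolated
 * form:
 */
#if 0
static void example_dev_stats_reader(struct btrfs_device *device)
{
	int stats_cnt = atomic_read(&device->dev_stats_ccnt); /* LOAD #1 */

	smp_rmb();	/* pairs with smp_mb__before_atomic() on the writer */
	if (stats_cnt)	/* LOAD #2 below must not be reordered before #1 */
		pr_info("write errors: %u\n",
			btrfs_dev_stat_read(device, BTRFS_DEV_STAT_WRITE_ERRS));
}
#endif

int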
btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
		    struct btrfs_ioctl_get_dev_stats *stats)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_device *dev;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
	args.devid = stats->devid;
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
	} else if (!dev->dev_stats_valid) {
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
				btrfs_dev_stat_set(dev, i, 0);
		}
		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
			   current->comm, task_pid_nr(current));
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}

/*
 * Update the size and bytes used for each device where it changed. This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
	struct btrfs_device *curr, *next;

	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);

	if (list_empty(&trans->dev_update_list))
		return;

	/*
	 * We don't need the device_list_mutex here. This list is owned by the
	 * transaction and the transaction must complete before the device is
	 * released.
	 */
	mutex_lock(&trans->fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&curr->post_commit_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
		curr->commit_bytes_used = curr->bytes_used;
	}
	mutex_unlock(&trans->fs_info->chunk_mutex);
}
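
/*
 * Illustrative sketch (userspace, not part of this file): querying the
 * counters served by btrfs_get_dev_stats() via the BTRFS_IOC_GET_DEV_STATS
 * ioctl on a mounted filesystem. The function name is hypothetical; the
 * ioctl and structure come from the uapi header linux/btrfs.h.
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

static int dump_dev_stats(const char *mnt, __u64 devid)
{
	struct btrfs_ioctl_get_dev_stats stats = {
		.devid = devid,
		.nr_items = BTRFS_DEV_STAT_VALUES_MAX,
	};
	int fd = open(mnt, O_RDONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &stats);
	close(fd);
	return ret ? -1 : (int)stats.values[BTRFS_DEV_STAT_WRITE_ERRS];
}
#endif
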
/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].ncopies;
}

static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
	struct btrfs_dev_lookup_args args = { .devid = devid };
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	if (!em) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

	map = em->map_lookup;
	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
			  physical_offset, devid, em->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
					"too many dev extents for chunk %llu found",
					  em->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			physical_offset, devid);
		ret = -EUCLEAN;
	}

	/* Make sure no dev extent is beyond device boundary */
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

out:
	free_extent_map(em);
	return ret;
}
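
/*
 * Illustrative sketch, not built: verify_one_dev_extent() expects every dev
 * extent to cover exactly one stripe of its chunk, which is what the
 * file-local calc_stripe_length() encodes. A hypothetical predicate form of
 * the length check above:
 */
#if 0
static bool dev_extent_matches_chunk(const struct extent_map *em,
				     const struct map_lookup *map,
				     u64 physical_len)
{
	/* every dev extent must be exactly one stripe of its chunk */
	return physical_len ==
	       calc_stripe_length(map->type, em->len, map->num_stripes);
}
#endif
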
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct rb_node *node;
	int ret = 0;

	read_lock(&em_tree->lock);
	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
		em = rb_entry(node, struct extent_map, rb_node);
		if (em->map_lookup->num_stripes !=
		    em->map_lookup->verified_stripes) {
			btrfs_err(fs_info,
			"chunk %llu has missing dev extent, have %d expect %d",
				  em->start,
				  em->map_lookup->verified_stripes,
				  em->map_lookup->num_stripes);
			ret = -EUCLEAN;
			goto out;
		}
	}
out:
	read_unlock(&em_tree->lock);
	return ret;
}

/*
 * Ensure that all dev extents are mapped to the correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be
 * roughly the same size as the chunk tree. This slightly increases mount
 * time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However if the dev root is fine, but the tree itself is corrupted
	 * we'd still fail to mount. This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	key.objectid = 1;	/* devid 1 is the smallest possible devid */
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}
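
/*
 * Illustrative sketch, not built: because dev extent keys sort by
 * (devid, physical offset), the overlap check in btrfs_verify_dev_extents()
 * only needs to remember where the previous extent on the same device ended.
 * A hypothetical predicate form of that check:
 */
#if 0
static bool dev_extents_overlap(u64 prev_devid, u64 prev_end,
				u64 devid, u64 physical_offset)
{
	return devid == prev_devid && physical_offset < prev_end;
}
#endif
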
/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			break;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return node != NULL;
}

static int relocating_repair_kthread(void *data)
{
	struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	target = cache->start;
	btrfs_put_block_group(cache);

	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
		return -EBUSY;
	}

	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure the block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

	if (!cache->relocating_repair)
		goto out;

	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
	mutex_unlock(&fs_info->reclaim_bgs_lock);
	btrfs_exclop_finish(fs_info);

	return ret;
}

bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;

	if (!btrfs_is_zoned(fs_info))
		return false;

	/* Do not attempt to repair in degraded state */
	if (btrfs_test_opt(fs_info, DEGRADED))
		return true;

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
		return true;

	spin_lock(&cache->lock);
	if (cache->relocating_repair) {
		spin_unlock(&cache->lock);
		btrfs_put_block_group(cache);
		return true;
	}
	cache->relocating_repair = 1;
	spin_unlock(&cache->lock);

	kthread_run(relocating_repair_kthread, cache,
		    "btrfs-relocating-repair");

	return true;
}
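
/*
 * Illustrative sketch, not built: btrfs_pinned_by_swapfile() above is a
 * plain rb-tree lookup keyed by raw pointer value. The same shape works for
 * any rb-tree ordered by an address key; the helper below is hypothetical
 * and returns the matching entry instead of a boolean.
 */
#if 0
static struct btrfs_swapfile_pin *find_swapfile_pin(struct rb_root *root,
						    void *ptr)
{
	struct rb_node *node = root->rb_node;

	while (node) {
		struct btrfs_swapfile_pin *sp;

		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			return sp;
	}
	return NULL;
}
#endif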